chore: checkpoint before Python removal

2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions


@@ -0,0 +1,881 @@
/* Copyright (c) 2019, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include <ring-core/aes.h>
#include "../../internal.h"
// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit or 64-bit words, operating on two- or four-block batches,
// respectively.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf
// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE 8
#define AES_NOHW_BATCH_SIZE 4
#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f)
#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0)
#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00)
#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000)
#else // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE 4
#define AES_NOHW_BATCH_SIZE 2
#define AES_NOHW_ROW0_MASK 0x03030303
#define AES_NOHW_ROW1_MASK 0x0c0c0c0c
#define AES_NOHW_ROW2_MASK 0x30303030
#define AES_NOHW_ROW3_MASK 0xc0c0c0c0
#endif // OPENSSL_64_BIT
static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
return a & b;
}
static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
return a | b;
}
static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
return a ^ b;
}
static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }
static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
return a << (i * AES_NOHW_BATCH_SIZE);
}
static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
return a >> (i * AES_NOHW_BATCH_SIZE);
}
OPENSSL_STATIC_ASSERT(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t),
"batch size does not match word size");
OPENSSL_STATIC_ASSERT(AES_NOHW_WORD_SIZE == sizeof(aes_word_t),
"AES_NOHW_WORD_SIZE is incorrect");
// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
// ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains the least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this would be a no-op with 128-bit words,
// because |AES_NOHW_BLOCK_WORDS| would be one and |AES_NOHW_BATCH_SIZE| would
// be eight. For 64-bit
// words, one block would be stored in two words:
//
// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by transposing bits
// in corresponding logical bytes. Continuing the 64-bit example:
//
// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ...
// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ...
// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ...
// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ...
//
// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ...
// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ...
// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.
#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t))
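// For example (illustrative note, not part of the original file): with 64-bit
// words, AES_NOHW_BLOCK_WORDS is 2 and AES_NOHW_BATCH_SIZE is 4, so one
// compact block occupies two words and a batch spreads four blocks across its
// eight words; with 32-bit words a block occupies four words and a batch
// holds two blocks.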
// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
typedef struct {
aes_word_t w[8];
} AES_NOHW_BATCH;
// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as
// |AES_NOHW_BATCH_SIZE| |AES_KEY|s so it should not be used as a long-term key
// representation.
typedef struct {
// keys is an array of batches, one for each round key. Each batch stores
// |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form.
AES_NOHW_BATCH keys[AES_MAXNR + 1];
} AES_NOHW_SCHEDULE;
// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
const aes_word_t in[AES_NOHW_BLOCK_WORDS],
size_t i) {
// Note the words are interleaved. The order comes from |aes_nohw_transpose|.
// If |i| is zero and this is the 64-bit implementation, in[0] contains bits
// 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at
// w[4] so that bits 0 and 4 are in the correct position. (In general, bits
// along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares
// will be correctly placed.)
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_64_BIT)
batch->w[i] = in[0];
batch->w[i + 4] = in[1];
#else
batch->w[i] = in[0];
batch->w[i + 2] = in[1];
batch->w[i + 4] = in[2];
batch->w[i + 6] = in[3];
#endif
}
// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
aes_word_t out[AES_NOHW_BLOCK_WORDS],
size_t i) {
dev_assert_secret(i < AES_NOHW_BATCH_SIZE);
#if defined(OPENSSL_64_BIT)
out[0] = batch->w[i];
out[1] = batch->w[i + 4];
#else
out[0] = batch->w[i];
out[1] = batch->w[i + 2];
out[2] = batch->w[i + 4];
out[3] = batch->w[i + 6];
#endif
}
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
aes_word_t shift) {
// See
// https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
aes_word_t b = (a ^ (a >> shift)) & mask;
return a ^ b ^ (b << shift);
}
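// Worked example (not part of the original file): with mask 0x0f and shift 4,
// a delta swap exchanges the low and high nibbles of a byte. For a = 0xab:
//   b = (0xab ^ (0xab >> 4)) & 0x0f = 0xa1 & 0x0f = 0x01
//   a ^ b ^ (b << 4) = 0xab ^ 0x01 ^ 0x10 = 0xba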
// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap8(a);
#endif
// Numbering the 64/4 = 16 4-bit chunks, least to most significant, we swap
// the middle two chunks within each quartet of chunks:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
// Swap the middle two 8-bit chunks within each quartet of 8-bit chunks
// (still numbering by 4-bit chunks):
// 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 =>
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
// Swap the middle two 16-bit chunks (still numbering by 4-bit chunks):
// 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 =>
// 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
return a;
}
static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap8(a);
#endif
return a;
}
#else // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap4(a);
#endif
// Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
// 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 =>
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15
// Note: 0x00cc = 0b0000_0000_1100_1100
// 0x00cc << 6 = 0b0011_0011_0000_0000
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
// Now we swap groups of four bits (still numbering by pairs):
// 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 =>
// 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15
// Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
return a;
}
static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
// Reverse the steps of |aes_nohw_compact_word|.
a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
#if defined(RING_BIG_ENDIAN)
a = CRYPTO_bswap4(a);
#endif
return a;
}
static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
uint8_t a2, uint8_t a3) {
return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
((uint32_t)a3 << 24);
}
static inline uint8_t lo(uint32_t a) {
return (uint8_t)a;
}
#endif // OPENSSL_64_BIT
static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const uint8_t in[16]) {
OPENSSL_memcpy(out, in, 16);
#if defined(OPENSSL_64_BIT)
uint64_t a0 = aes_nohw_compact_word(out[0]);
uint64_t a1 = aes_nohw_compact_word(out[1]);
out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32);
out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32);
#else
uint32_t a0 = aes_nohw_compact_word(out[0]);
uint32_t a1 = aes_nohw_compact_word(out[1]);
uint32_t a2 = aes_nohw_compact_word(out[2]);
uint32_t a3 = aes_nohw_compact_word(out[3]);
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
out[0] = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
out[1] = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
out[2] = aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
out[3] = aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
#endif
}
static inline void aes_nohw_uncompact_block(
uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
#if defined(OPENSSL_64_BIT)
uint64_t a0 = in[0];
uint64_t a1 = in[1];
uint64_t b0 =
aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32));
uint64_t b1 =
aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32));
OPENSSL_memcpy(out, &b0, 8);
OPENSSL_memcpy(out + 8, &b1, 8);
#else
uint32_t a0 = in[0];
uint32_t a1 = in[1];
uint32_t a2 = in[2];
uint32_t a3 = in[3];
// Note clang, when building for ARM Thumb2, will sometimes miscompile
// expressions such as (a0 & 0x0000ff00) << 8, particularly when building
// without optimizations. This bug was introduced in
// https://reviews.llvm.org/rL340261 and fixed in
// https://reviews.llvm.org/rL351310. The following is written to avoid this.
uint32_t b0 = aes_nohw_word_from_bytes(lo(a0), lo(a1), lo(a2), lo(a3));
uint32_t b1 = aes_nohw_word_from_bytes(lo(a0 >> 8), lo(a1 >> 8), lo(a2 >> 8), lo(a3 >> 8));
uint32_t b2 =
aes_nohw_word_from_bytes(lo(a0 >> 16), lo(a1 >> 16), lo(a2 >> 16), lo(a3 >> 16));
uint32_t b3 =
aes_nohw_word_from_bytes(lo(a0 >> 24), lo(a1 >> 24), lo(a2 >> 24), lo(a3 >> 24));
b0 = aes_nohw_uncompact_word(b0);
b1 = aes_nohw_uncompact_word(b1);
b2 = aes_nohw_uncompact_word(b2);
b3 = aes_nohw_uncompact_word(b3);
OPENSSL_memcpy(out, &b0, 4);
OPENSSL_memcpy(out + 4, &b1, 4);
OPENSSL_memcpy(out + 8, &b2, 4);
OPENSSL_memcpy(out + 12, &b3, 4);
#endif
}
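// A minimal self-check sketch (not part of the original file; the
// AES_NOHW_SELF_CHECK guard is hypothetical): compacting a block and then
// uncompacting it must reproduce the original bytes, since the two functions
// apply inverse bit permutations.
#if defined(AES_NOHW_SELF_CHECK)
static int aes_nohw_check_compact_roundtrip(void) {
  uint8_t in[16], out[16];
  for (int i = 0; i < 16; i++) {
    in[i] = (uint8_t)(17 * i + 3);  // Arbitrary test pattern.
  }
  aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
  aes_nohw_compact_block(tmp, in);
  aes_nohw_uncompact_block(out, tmp);
  for (int i = 0; i < 16; i++) {
    if (out[i] != in[i]) {
      return 0;
    }
  }
  return 1;
}
#endif  // AES_NOHW_SELF_CHECK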
// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
aes_word_t mask_w = mask;
#endif
// This is a variation on a delta swap.
aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
*a ^= swap << shift;
*b ^= swap;
}
// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) {
// Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101).
aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1);
aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1);
#if AES_NOHW_BATCH_SIZE >= 4
// Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011).
aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2);
aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2);
#endif
#if AES_NOHW_BATCH_SIZE >= 8
// Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111).
aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4);
aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4);
#endif
}
// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
size_t num_blocks) {
// Don't leave unused blocks uninitialized.
OPENSSL_memset(out, 0, sizeof(AES_NOHW_BATCH));
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in + 16 * i);
aes_nohw_batch_set(out, block, i);
}
aes_nohw_transpose(out);
}
// aes_nohw_from_batch writes the first |num_blocks| blocks in |batch| to
// |out|. |num_blocks| must be at most |AES_NOHW_BATCH_SIZE|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
const AES_NOHW_BATCH *batch) {
AES_NOHW_BATCH copy = *batch;
aes_nohw_transpose(&copy);
debug_assert_nonsecret(num_blocks <= AES_NOHW_BATCH_SIZE);
for (size_t i = 0; i < num_blocks; i++) {
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_batch_get(&copy, block, i);
aes_nohw_uncompact_block(out + 16 * i, block);
}
}
// AES round steps.
static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
const AES_NOHW_BATCH *key) {
for (size_t i = 0; i < 8; i++) {
batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]);
}
}
static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/191.pdf, Appendix C.
aes_word_t x0 = batch->w[7];
aes_word_t x1 = batch->w[6];
aes_word_t x2 = batch->w[5];
aes_word_t x3 = batch->w[4];
aes_word_t x4 = batch->w[3];
aes_word_t x5 = batch->w[2];
aes_word_t x6 = batch->w[1];
aes_word_t x7 = batch->w[0];
// Figure 2, the top linear transformation.
aes_word_t y14 = aes_nohw_xor(x3, x5);
aes_word_t y13 = aes_nohw_xor(x0, x6);
aes_word_t y9 = aes_nohw_xor(x0, x3);
aes_word_t y8 = aes_nohw_xor(x0, x5);
aes_word_t t0 = aes_nohw_xor(x1, x2);
aes_word_t y1 = aes_nohw_xor(t0, x7);
aes_word_t y4 = aes_nohw_xor(y1, x3);
aes_word_t y12 = aes_nohw_xor(y13, y14);
aes_word_t y2 = aes_nohw_xor(y1, x0);
aes_word_t y5 = aes_nohw_xor(y1, x6);
aes_word_t y3 = aes_nohw_xor(y5, y8);
aes_word_t t1 = aes_nohw_xor(x4, y12);
aes_word_t y15 = aes_nohw_xor(t1, x5);
aes_word_t y20 = aes_nohw_xor(t1, x1);
aes_word_t y6 = aes_nohw_xor(y15, x7);
aes_word_t y10 = aes_nohw_xor(y15, t0);
aes_word_t y11 = aes_nohw_xor(y20, y9);
aes_word_t y7 = aes_nohw_xor(x7, y11);
aes_word_t y17 = aes_nohw_xor(y10, y11);
aes_word_t y19 = aes_nohw_xor(y10, y8);
aes_word_t y16 = aes_nohw_xor(t0, y11);
aes_word_t y21 = aes_nohw_xor(y13, y16);
aes_word_t y18 = aes_nohw_xor(x0, y16);
// Figure 3, the middle non-linear section.
aes_word_t t2 = aes_nohw_and(y12, y15);
aes_word_t t3 = aes_nohw_and(y3, y6);
aes_word_t t4 = aes_nohw_xor(t3, t2);
aes_word_t t5 = aes_nohw_and(y4, x7);
aes_word_t t6 = aes_nohw_xor(t5, t2);
aes_word_t t7 = aes_nohw_and(y13, y16);
aes_word_t t8 = aes_nohw_and(y5, y1);
aes_word_t t9 = aes_nohw_xor(t8, t7);
aes_word_t t10 = aes_nohw_and(y2, y7);
aes_word_t t11 = aes_nohw_xor(t10, t7);
aes_word_t t12 = aes_nohw_and(y9, y11);
aes_word_t t13 = aes_nohw_and(y14, y17);
aes_word_t t14 = aes_nohw_xor(t13, t12);
aes_word_t t15 = aes_nohw_and(y8, y10);
aes_word_t t16 = aes_nohw_xor(t15, t12);
aes_word_t t17 = aes_nohw_xor(t4, t14);
aes_word_t t18 = aes_nohw_xor(t6, t16);
aes_word_t t19 = aes_nohw_xor(t9, t14);
aes_word_t t20 = aes_nohw_xor(t11, t16);
aes_word_t t21 = aes_nohw_xor(t17, y20);
aes_word_t t22 = aes_nohw_xor(t18, y19);
aes_word_t t23 = aes_nohw_xor(t19, y21);
aes_word_t t24 = aes_nohw_xor(t20, y18);
aes_word_t t25 = aes_nohw_xor(t21, t22);
aes_word_t t26 = aes_nohw_and(t21, t23);
aes_word_t t27 = aes_nohw_xor(t24, t26);
aes_word_t t28 = aes_nohw_and(t25, t27);
aes_word_t t29 = aes_nohw_xor(t28, t22);
aes_word_t t30 = aes_nohw_xor(t23, t24);
aes_word_t t31 = aes_nohw_xor(t22, t26);
aes_word_t t32 = aes_nohw_and(t31, t30);
aes_word_t t33 = aes_nohw_xor(t32, t24);
aes_word_t t34 = aes_nohw_xor(t23, t33);
aes_word_t t35 = aes_nohw_xor(t27, t33);
aes_word_t t36 = aes_nohw_and(t24, t35);
aes_word_t t37 = aes_nohw_xor(t36, t34);
aes_word_t t38 = aes_nohw_xor(t27, t36);
aes_word_t t39 = aes_nohw_and(t29, t38);
aes_word_t t40 = aes_nohw_xor(t25, t39);
aes_word_t t41 = aes_nohw_xor(t40, t37);
aes_word_t t42 = aes_nohw_xor(t29, t33);
aes_word_t t43 = aes_nohw_xor(t29, t40);
aes_word_t t44 = aes_nohw_xor(t33, t37);
aes_word_t t45 = aes_nohw_xor(t42, t41);
aes_word_t z0 = aes_nohw_and(t44, y15);
aes_word_t z1 = aes_nohw_and(t37, y6);
aes_word_t z2 = aes_nohw_and(t33, x7);
aes_word_t z3 = aes_nohw_and(t43, y16);
aes_word_t z4 = aes_nohw_and(t40, y1);
aes_word_t z5 = aes_nohw_and(t29, y7);
aes_word_t z6 = aes_nohw_and(t42, y11);
aes_word_t z7 = aes_nohw_and(t45, y17);
aes_word_t z8 = aes_nohw_and(t41, y10);
aes_word_t z9 = aes_nohw_and(t44, y12);
aes_word_t z10 = aes_nohw_and(t37, y3);
aes_word_t z11 = aes_nohw_and(t33, y4);
aes_word_t z12 = aes_nohw_and(t43, y13);
aes_word_t z13 = aes_nohw_and(t40, y5);
aes_word_t z14 = aes_nohw_and(t29, y2);
aes_word_t z15 = aes_nohw_and(t42, y9);
aes_word_t z16 = aes_nohw_and(t45, y14);
aes_word_t z17 = aes_nohw_and(t41, y8);
// Figure 4, bottom linear transformation.
aes_word_t t46 = aes_nohw_xor(z15, z16);
aes_word_t t47 = aes_nohw_xor(z10, z11);
aes_word_t t48 = aes_nohw_xor(z5, z13);
aes_word_t t49 = aes_nohw_xor(z9, z10);
aes_word_t t50 = aes_nohw_xor(z2, z12);
aes_word_t t51 = aes_nohw_xor(z2, z5);
aes_word_t t52 = aes_nohw_xor(z7, z8);
aes_word_t t53 = aes_nohw_xor(z0, z3);
aes_word_t t54 = aes_nohw_xor(z6, z7);
aes_word_t t55 = aes_nohw_xor(z16, z17);
aes_word_t t56 = aes_nohw_xor(z12, t48);
aes_word_t t57 = aes_nohw_xor(t50, t53);
aes_word_t t58 = aes_nohw_xor(z4, t46);
aes_word_t t59 = aes_nohw_xor(z3, t54);
aes_word_t t60 = aes_nohw_xor(t46, t57);
aes_word_t t61 = aes_nohw_xor(z14, t57);
aes_word_t t62 = aes_nohw_xor(t52, t58);
aes_word_t t63 = aes_nohw_xor(t49, t58);
aes_word_t t64 = aes_nohw_xor(z4, t59);
aes_word_t t65 = aes_nohw_xor(t61, t62);
aes_word_t t66 = aes_nohw_xor(z1, t63);
aes_word_t s0 = aes_nohw_xor(t59, t63);
aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62));
aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60));
aes_word_t t67 = aes_nohw_xor(t64, t65);
aes_word_t s3 = aes_nohw_xor(t53, t66);
aes_word_t s4 = aes_nohw_xor(t51, t66);
aes_word_t s5 = aes_nohw_xor(t47, t65);
aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3));
aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67));
batch->w[0] = s7;
batch->w[1] = s6;
batch->w[2] = s5;
batch->w[3] = s4;
batch->w[4] = s3;
batch->w[5] = s2;
batch->w[6] = s1;
batch->w[7] = s0;
}
// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \
(aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \
aes_nohw_shift_left((v), 16 - (n)*4)))
static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) {
for (size_t i = 0; i < 8; i++) {
aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK);
aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK);
aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK);
aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK);
row1 = aes_nohw_rotate_cols_right(row1, 1);
row2 = aes_nohw_rotate_cols_right(row2, 2);
row3 = aes_nohw_rotate_cols_right(row3, 3);
batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3));
}
}
// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) {
#if defined(OPENSSL_64_BIT)
return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) |
((v << 12) & UINT64_C(0xf000f000f000f000));
#else
return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0);
#endif
}
// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) {
#if defined(OPENSSL_64_BIT)
return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) |
((v << 8) & UINT64_C(0xff00ff00ff00ff00));
#else
return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0);
#endif
}
static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) {
// See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A.
aes_word_t a0 = batch->w[0];
aes_word_t a1 = batch->w[1];
aes_word_t a2 = batch->w[2];
aes_word_t a3 = batch->w[3];
aes_word_t a4 = batch->w[4];
aes_word_t a5 = batch->w[5];
aes_word_t a6 = batch->w[6];
aes_word_t a7 = batch->w[7];
aes_word_t r0 = aes_nohw_rotate_rows_down(a0);
aes_word_t a0_r0 = aes_nohw_xor(a0, r0);
aes_word_t r1 = aes_nohw_rotate_rows_down(a1);
aes_word_t a1_r1 = aes_nohw_xor(a1, r1);
aes_word_t r2 = aes_nohw_rotate_rows_down(a2);
aes_word_t a2_r2 = aes_nohw_xor(a2, r2);
aes_word_t r3 = aes_nohw_rotate_rows_down(a3);
aes_word_t a3_r3 = aes_nohw_xor(a3, r3);
aes_word_t r4 = aes_nohw_rotate_rows_down(a4);
aes_word_t a4_r4 = aes_nohw_xor(a4, r4);
aes_word_t r5 = aes_nohw_rotate_rows_down(a5);
aes_word_t a5_r5 = aes_nohw_xor(a5, r5);
aes_word_t r6 = aes_nohw_rotate_rows_down(a6);
aes_word_t a6_r6 = aes_nohw_xor(a6, r6);
aes_word_t r7 = aes_nohw_rotate_rows_down(a7);
aes_word_t a7_r7 = aes_nohw_xor(a7, r7);
batch->w[0] =
aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0));
batch->w[1] =
aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7),
aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1)));
batch->w[2] =
aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2));
batch->w[3] =
aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7),
aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3)));
batch->w[4] =
aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7),
aes_nohw_xor(r4, aes_nohw_rotate_rows_twice(a4_r4)));
batch->w[5] =
aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5));
batch->w[6] =
aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6));
batch->w[7] =
aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7));
}
static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
size_t num_rounds, AES_NOHW_BATCH *batch) {
aes_nohw_add_round_key(batch, &key->keys[0]);
for (size_t i = 1; i < num_rounds; i++) {
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_mix_columns(batch);
aes_nohw_add_round_key(batch, &key->keys[i]);
}
aes_nohw_sub_bytes(batch);
aes_nohw_shift_rows(batch);
aes_nohw_add_round_key(batch, &key->keys[num_rounds]);
}
// Key schedule.
static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
const AES_KEY *key) {
for (size_t i = 0; i <= key->rounds; i++) {
// Copy the round key into each block in the batch.
for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) {
aes_word_t tmp[AES_NOHW_BLOCK_WORDS];
OPENSSL_memcpy(tmp, key->rd_key + 4 * i, 16);
aes_nohw_batch_set(&out->keys[i], tmp, j);
}
aes_nohw_transpose(&out->keys[i]);
}
}
static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
0x20, 0x40, 0x80, 0x1b, 0x36};
// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in an |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) {
rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1);
return ((aes_word_t)rcon);
}
static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
const aes_word_t in[AES_NOHW_BLOCK_WORDS]) {
AES_NOHW_BATCH batch;
OPENSSL_memset(&batch, 0, sizeof(batch));
aes_nohw_batch_set(&batch, in, 0);
aes_nohw_transpose(&batch);
aes_nohw_sub_bytes(&batch);
aes_nohw_transpose(&batch);
aes_nohw_batch_get(&batch, out, 0);
}
static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) {
key->rounds = 10;
aes_word_t block[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block, in);
OPENSSL_memcpy(key->rd_key, block, 16);
for (size_t i = 1; i <= 10; i++) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block);
uint8_t rcon = aes_nohw_rcon[i - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j));
block[j] = aes_nohw_xor(
block[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words. Note this is reordered from the usual
// formulation to avoid needing masks.
aes_word_t v = block[j];
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8));
block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12));
}
OPENSSL_memcpy(key->rd_key + 4 * i, block, 16);
}
}
static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) {
key->rounds = 14;
// Each key schedule iteration produces two round keys.
aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS];
aes_nohw_compact_block(block1, in);
OPENSSL_memcpy(key->rd_key, block1, 16);
aes_nohw_compact_block(block2, in + 16);
OPENSSL_memcpy(key->rd_key + 4, block2, 16);
for (size_t i = 2; i <= 14; i += 2) {
aes_word_t sub[AES_NOHW_BLOCK_WORDS];
aes_nohw_sub_block(sub, block2);
uint8_t rcon = aes_nohw_rcon[i / 2 - 1];
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate |rcon| and the transformed word into the first word.
block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j));
block1[j] = aes_nohw_xor(
block1[j],
aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12));
// Propagate to the remaining words.
aes_word_t v = block1[j];
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8));
block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12));
}
OPENSSL_memcpy(key->rd_key + 4 * i, block1, 16);
if (i == 14) {
break;
}
aes_nohw_sub_block(sub, block1);
for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) {
// Incorporate the transformed word into the first word.
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12));
// Propagate to the remaining words.
aes_word_t v = block2[j];
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8));
block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12));
}
OPENSSL_memcpy(key->rd_key + 4 * (i + 1), block2, 16);
}
}
// External API.
int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
AES_KEY *aeskey) {
switch (bits) {
case 128:
aes_nohw_setup_key_128(aeskey, key);
return 0;
case 256:
aes_nohw_setup_key_256(aeskey, key);
return 0;
}
return 1;
}
void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) {
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, in, /*num_blocks=*/1);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(out, /*num_blocks=*/1, &batch);
}
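// Usage sketch (not part of the original file; the AES_NOHW_SELF_CHECK guard
// is hypothetical): encrypt the FIPS-197 Appendix C.1 test vector with the
// public API above and compare against the expected ciphertext.
#if defined(AES_NOHW_SELF_CHECK)
static int aes_nohw_check_fips197_c1(void) {
  static const uint8_t kKey[16] = {
      0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
      0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
  static const uint8_t kPlain[16] = {
      0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
      0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff};
  static const uint8_t kCipher[16] = {
      0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
      0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a};
  AES_KEY key;
  if (aes_nohw_set_encrypt_key(kKey, 128, &key) != 0) {
    return 0;
  }
  uint8_t out[16];
  aes_nohw_encrypt(kPlain, out, &key);
  for (int i = 0; i < 16; i++) {
    if (out[i] != kCipher[i]) {
      return 0;
    }
  }
  return 1;
}
#endif  // AES_NOHW_SELF_CHECK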
static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
const uint8_t b[16]) {
for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) {
aes_word_t x, y;
OPENSSL_memcpy(&x, a + i, sizeof(aes_word_t));
OPENSSL_memcpy(&y, b + i, sizeof(aes_word_t));
x = aes_nohw_xor(x, y);
OPENSSL_memcpy(out + i, &x, sizeof(aes_word_t));
}
}
void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
size_t blocks, const AES_KEY *key,
const uint8_t ivec[16]) {
if (blocks == 0) {
return;
}
AES_NOHW_SCHEDULE sched;
aes_nohw_expand_round_keys(&sched, key);
// Make |AES_NOHW_BATCH_SIZE| copies of |ivec|.
alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16];
alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16];
for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
OPENSSL_memcpy(ivs + 16 * i, ivec, 16);
}
uint32_t ctr = CRYPTO_load_u32_be(ivs + 12);
for (;;) {
// Update counters.
for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) {
CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i);
}
size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks;
AES_NOHW_BATCH batch;
aes_nohw_to_batch(&batch, ivs, todo);
aes_nohw_encrypt_batch(&sched, key->rounds, &batch);
aes_nohw_from_batch(enc_ivs, todo, &batch);
for (size_t i = 0; i < todo; i++) {
aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i);
}
blocks -= todo;
if (blocks == 0) {
break;
}
in += 16 * AES_NOHW_BATCH_SIZE;
out += 16 * AES_NOHW_BATCH_SIZE;
ctr += AES_NOHW_BATCH_SIZE;
}
}
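// Usage sketch (not part of the original file; the AES_NOHW_SELF_CHECK guard
// and the all-zero key are purely illustrative): CTR-encrypt three blocks.
// Note the function reads the 32-bit big-endian counter from the last four
// bytes of |ivec| and does not write the updated counter back to the caller.
#if defined(AES_NOHW_SELF_CHECK)
static void aes_nohw_ctr32_example(const uint8_t in[48], uint8_t out[48]) {
  static const uint8_t kZeroKey[16] = {0};
  static const uint8_t kZeroIv[16] = {0};
  AES_KEY key;
  aes_nohw_set_encrypt_key(kZeroKey, 128, &key);
  aes_nohw_ctr32_encrypt_blocks(in, out, 3, &key, kZeroIv);
}
#endif  // AES_NOHW_SELF_CHECK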

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,973 @@
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
# 16-byte 64-byte 256-byte 1-KB 8-KB
# 53-67% 67-84% 91-94% 95-98% 97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#                CBC en-/decrypt  CTR    XTS    ECB    OCB
# Westmere          3.77/1.37     1.37   1.52   1.27
# * Bridge          5.07/0.98     0.99   1.09   0.91   1.10
# Haswell           4.44/0.80     0.97   1.03   0.72   0.76
# Skylake           2.68/0.65     0.65   0.66   0.64   0.66
# Silvermont        5.77/3.56     3.67   4.03   3.46   4.03
# Goldmont          3.84/1.39     1.39   1.63   1.31   1.70
# Bulldozer         5.80/0.98     1.05   1.24   0.93   1.23
$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="aes_hw";
$inline=1; # inline _aesni_[en|de]crypt
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0]);
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("key_const");
if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; }
else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc { aescommon(0xdb,@_); }
sub aesenc { aescommon(0xdc,@_); }
sub aesenclast { aescommon(0xdd,@_); }
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
&xorps ($inout,$ivec) if (defined($ivec));
&xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
&$movekey ($rndkey1,&QWP(0,$key));
&lea ($key,&DWP(16,$key));
&jnz (&label("${p}1_loop_$sn"));
eval"&aes${p}last ($inout,$rndkey1)";
}}
sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
&movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
&xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
&jb (&label("${p}128"));
&lea ($key,&DWP(0x40,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x30,$key));
# 192-bit key support was removed.
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x10,$key));
&set_label("${p}128");
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x10,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x30,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x50,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x60,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x70,$key));
eval"&aes${p} ($inout,$rndkey1)";
eval"&aes${p}last ($inout,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt1");
}
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
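# (Worked note, not in the original: with a 6-cycle latency and one issue
# every 2 cycles, 6/2 = 3 independent blocks are enough to keep the AES unit
# busy, hence the original 3x factor; with single-cycle issue the optimal
# factor equals the latency itself, hence 8x on *Bridge.)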
sub aesni_generate2
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt2");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}2_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}2_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt2");
}
sub aesni_generate3
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}3_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}3_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt3");
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt4");
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&shl ($rounds,4);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&data_byte (0x0f,0x1f,0x40,0x00);
&add ($rounds,16);
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt4");
}
sub aesni_generate6
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
&pxor ($inout2,$rndkey0);
eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
eval"&aes${p} ($inout1,$rndkey1)";
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
&jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&set_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
eval"&aes${p}last ($inout4,$rndkey0)";
eval"&aes${p}last ($inout5,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);
if ($PREFIX eq $AESNI_PREFIX) {
######################################################################
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp
&function_begin("${PREFIX}_ctr32_encrypt_blocks");
&record_function_hit(0);
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
&sub ("esp",88);
&and ("esp",-16); # align stack
&mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
&pxor ($rndkey0,$rndkey0);
&pxor ($rndkey1,$rndkey1);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
&pinsrd ($rndkey0,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
&pinsrd ($rndkey1,$key_,0);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,1);
&inc ($key_);
&pinsrd ($rndkey1,$key_,1);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,2);
&inc ($key_);
&pinsrd ($rndkey1,$key_,2);
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&movdqu ($inout4,&QWP(0,$key)); # key[0]
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
&pshufd ($inout1,$rndkey0,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
&pxor ($inout5,$inout4); # counter-less ivec^key[0]
&shl ($rounds,4);
&mov ($rounds_,16);
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
&mov ($key_,$key); # backup $key
&sub ($rounds_,$rounds); # backup twisted $rounds
&lea ($key,&DWP(32,$key,$rounds));
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
# inlining _aesni_encrypt6's prologue gives ~6% improvement...
&pshufd ($inout2,$rndkey0,1<<6);
&movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
&pshufd ($inout3,$rndkey1,3<<6);
&pxor ($inout0,$rndkey0); # merge counter-less ivec
&pshufd ($inout4,$rndkey1,2<<6);
&pxor ($inout1,$rndkey0);
&pshufd ($inout5,$rndkey1,1<<6);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&aesenc ($inout0,$rndkey1);
&pxor ($inout4,$rndkey0);
&pxor ($inout5,$rndkey0);
&aesenc ($inout1,$rndkey1);
&$movekey ($rndkey0,&QWP(32,$key_));
&mov ($rounds,$rounds_);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&paddd ($rndkey1,$rndkey0); # 2nd triplet increment
&paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
&movups ($inout2,&QWP(0x40,$inp));
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
&pshufd ($inout0,$rndkey0,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&pshufd ($inout1,$rndkey0,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&add ($len,6);
&jz (&label("ctr32_ret"));
&movdqu ($inout5,&QWP(0,$key_));
&mov ($key,$key_);
&pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&set_label("ctr32_tail");
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
&pshufd ($inout2,$rndkey0,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
&pshufd ($inout3,$rndkey1,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
&pshufd ($inout4,$rndkey1,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
&por ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout2,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout3,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout4,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movups ($in0,&QWP(0,$inp));
&xorps ($in0,$inout0);
&movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
&call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&movups ($inout5,&QWP(0x20,$inp));
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$inout5);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_four",16);
&call ("_aesni_encrypt4");
&movups ($inout4,&QWP(0,$inp));
&movups ($inout5,&QWP(0x10,$inp));
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout0,$inout4);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout1,$inout5);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
&pxor ("xmm5","xmm5");
&movdqa (&QWP(48,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(64,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}
######################################################################
# Mechanical port from aesni-x86_64.pl.
# int $PREFIX_set_encrypt_key_base (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key_base");
&record_function_hit(3);
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&push ("ebx");
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&lea ($key,&DWP(16,$key));
&cmp ($rounds,256);
&je (&label("14rounds"));
# 192-bit key support was removed.
&cmp ($rounds,128);
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
&call (&label("key_128_cold"));
&aeskeygenassist("xmm1","xmm0",0x2); # round 2
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 3
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 4
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 5
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 6
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x40); # round 7
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x80); # round 8
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x1b); # round 9
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x36); # round 10
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
&jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&set_label("key_128_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
&call (&label("key_256a_cold"));
&aeskeygenassist("xmm1","xmm0",0x01); # round 3
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 4
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x02); # round 5
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x04); # round 6
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 7
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x08); # round 8
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 9
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x10); # round 10
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 11
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x20); # round 12
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 13
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x40); # round 14
&call (&label("key_256a"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
&jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
&lea ($key,&DWP(16,$key));
&set_label("key_256a_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("key_256b",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&shufps ("xmm4","xmm2",0b00010000);
&xorps ("xmm2","xmm4");
&shufps ("xmm4","xmm2",0b10001100);
&xorps ("xmm2","xmm4");
&shufps ("xmm1","xmm1",0b10101010); # critical path
&xorps ("xmm2","xmm1");
&ret();
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&ret ();
&function_end_B("${PREFIX}_set_encrypt_key_base");
# int $PREFIX_set_encrypt_key_alt (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key_alt");
&record_function_hit(3);
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&push ("ebx");
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&lea ($key,&DWP(16,$key));
&cmp ($rounds,256);
&je (&label("14rounds_alt"));
# 192-bit key support was removed.
&cmp ($rounds,128);
&jne (&label("bad_keybits"));
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&movdqa ("xmm2","xmm0");
&movdqu (&QWP(-16,$key),"xmm0");
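# The loop below emulates aeskeygenassist for the 128-bit schedule:
# pshufb with the key_const mask rotates the top word of the previous
# round key and broadcasts it to all lanes (RotWord), aesenclast with
# the rcon vector in xmm4 then applies SubBytes and adds the round
# constant (ShiftRows is a no-op when all columns are equal), and the
# pslldq/pxor chain folds in the running XOR of the previous round
# key's words. xmm4 is doubled each iteration to step the rcon.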
&set_label("loop_key128");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(16,$key));
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(-16,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&dec ($rounds);
&jnz (&label("loop_key128"));
&movdqa ("xmm4",&QWP(0x30,"ebx"));
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(16,$key),"xmm0");
&mov ($rounds,9);
&mov (&DWP(96,$key),$rounds);
&jmp (&label("good_key"));
# 192-bit key support was removed.
&set_label("14rounds_alt",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
&movdqu (&QWP(-32,$key),"xmm0");
&movdqa ("xmm1","xmm2");
&movdqu (&QWP(-16,$key),"xmm2");
&set_label("loop_key256");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pslld ("xmm4",1);
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&dec ($rounds);
&jz (&label("done_key256"));
&pshufd ("xmm2","xmm0",0xff);
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm1","xmm3");
&pxor ("xmm2","xmm1");
&movdqu (&QWP(16,$key),"xmm2");
&lea ($key,&DWP(32,$key));
&movdqa ("xmm1","xmm2");
&jmp (&label("loop_key256"));
&set_label("done_key256");
&mov ($rounds,13);
&mov (&DWP(16,$key),$rounds);
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&ret ();
&function_end_B("${PREFIX}_set_encrypt_key_alt");
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,587 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in sense that it supports both big- and
# little-endian cases. It also supports both 32- and 64-bit modes
# of operation. Latter is achieved by limiting amount of utilized
# registers to 16, which implies additional NEON load and integer
# instructions. This has no effect on mighty Apple A7, where results
# are literally equal to the theoretical estimates based on AES
# instruction latencies and issue rates. On Cortex-A53, an in-order
# execution core, this costs up to 10-15%, which is partially
# compensated by implementing dedicated code path for 128-bit
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
# CBC enc CBC dec CTR
# Apple A7 2.39 1.20 1.20
# Cortex-A53 1.32 1.29 1.46
# Cortex-A57(*) 1.95 0.85 0.93
# Denver 1.96 0.86 0.80
# Mongoose 1.33 1.20 1.20
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still same even for updated module;
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$prefix="aes_hw";
$code=<<___;
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
# On AArch64, put the data in .rodata and use adrp + add for compatibility with
# execute-only memory. On AArch32, put it in .text and use adr.
$code.= ".section .rodata\n" if ($flavour =~ /64/);
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___;
mov $ptr,#-2
cmp $bits,#128
b.lt .Lenc_key_abort
cmp $bits,#256
b.gt .Lenc_key_abort
tst $bits,#0x3f
b.ne .Lenc_key_abort
___
$code.=<<___ if ($flavour =~ /64/);
adrp $ptr,:pg_hi21:.Lrcon
add $ptr,$ptr,:lo12:.Lrcon
___
$code.=<<___ if ($flavour !~ /64/);
adr $ptr,.Lrcon
___
$code.=<<___;
cmp $bits,#192
veor $zero,$zero,$zero
vld1.8 {$in0},[$inp],#16
mov $bits,#8 // reuse $bits
vld1.32 {$rcon,$mask},[$ptr],#32
b.lt .Loop128
// 192-bit key support was removed.
b .L256
.align 4
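// In effect, each iteration below derives one round key: the table
// lookup rotates the top word of the previous key and broadcasts it to
// all lanes (RotWord), the aese with an all-zero round key applies
// SubBytes (ShiftRows has no effect when all columns are equal), the
// round constant is XORed in, and the shift-and-XOR chain folds in the
// running XOR of the previous key words.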
.Loop128:
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
b.ne .Loop128
vld1.32 {$rcon},[$ptr]
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vtbl.8 $key,{$in0},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in0},[$out],#16
aese $key,$zero
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
veor $in0,$in0,$key
vst1.32 {$in0},[$out]
add $out,$out,#0x50
mov $rounds,#10
b .Ldone
// 192-bit key support was removed.
.align 4
.L256:
vld1.8 {$in1},[$inp]
mov $bits,#7
mov $rounds,#14
vst1.32 {$in0},[$out],#16
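// The 256-bit schedule below alternates two steps: the first half of
// each iteration applies RotWord+SubBytes+rcon to the newest word, as
// in the 128-bit loop above, while the second half (after the splat of
// the last word) applies SubBytes only, as the AES-256 key schedule
// requires for the words at offset four.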
.Loop256:
vtbl.8 $key,{$in1},$mask
vext.8 $tmp,$zero,$in0,#12
vst1.32 {$in1},[$out],#16
aese $key,$zero
subs $bits,$bits,#1
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in0,$in0,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $key,$key,$rcon
veor $in0,$in0,$tmp
vshl.u8 $rcon,$rcon,#1
veor $in0,$in0,$key
vst1.32 {$in0},[$out],#16
b.eq .Ldone
vdup.32 $key,${in0}[3] // just splat
vext.8 $tmp,$zero,$in1,#12
aese $key,$zero
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
vext.8 $tmp,$zero,$tmp,#12
veor $in1,$in1,$tmp
veor $in1,$in1,$key
b .Loop256
.Ldone:
str $rounds,[$out]
mov $ptr,#0
.Lenc_key_abort:
mov x0,$ptr // return value
`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
$code.=<<___;
.globl ${prefix}_ctr32_encrypt_blocks
.type ${prefix}_ctr32_encrypt_blocks,%function
.align 5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
mov ip,sp
stmdb sp!,{r4-r10,lr}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
___
$code.=<<___;
ldr $rounds,[$key,#240]
ldr $ctr, [$ivp, #12]
vld1.32 {$dat0},[$ivp]
vld1.32 {q8-q9},[$key] // load key schedule...
sub $rounds,$rounds,#4
mov $step,#16
cmp $len,#2
add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
sub $rounds,$rounds,#2
vld1.32 {q12-q13},[$key_],#32
vld1.32 {q14-q15},[$key_],#32
vld1.32 {$rndlast},[$key_]
add $key_,$key,#32
mov $cnt,$rounds
cclr $step,lo
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov.32 lines
// could write to $dat1 and $dat2 directly, but that trips these bugs.
// We write to $ivec and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev $ctr, $ctr
#endif
add $tctr1, $ctr, #1
vorr $ivec,$dat0,$dat0
rev $tctr1, $tctr1
vmov.32 ${ivec}[3],$tctr1
add $ctr, $ctr, #2
vorr $dat1,$ivec,$ivec
b.ls .Lctr32_tail
rev $tctr2, $ctr
vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
vorr $dat2,$ivec,$ivec
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
aese $dat2,q9
aesmc $dat2,$dat2
vld1.32 {q9},[$key_],#16
b.gt .Loop3x_ctr32
aese $dat0,q8
aesmc $tmp0,$dat0
aese $dat1,q8
aesmc $tmp1,$dat1
vld1.8 {$in0},[$inp],#16
add $tctr0,$ctr,#1
aese $dat2,q8
aesmc $dat2,$dat2
vld1.8 {$in1},[$inp],#16
rev $tctr0,$tctr0
aese $tmp0,q9
aesmc $tmp0,$tmp0
aese $tmp1,q9
aesmc $tmp1,$tmp1
vld1.8 {$in2},[$inp],#16
mov $key_,$key
aese $dat2,q9
aesmc $tmp2,$dat2
aese $tmp0,q12
aesmc $tmp0,$tmp0
aese $tmp1,q12
aesmc $tmp1,$tmp1
veor $in0,$in0,$rndlast
add $tctr1,$ctr,#2
aese $tmp2,q12
aesmc $tmp2,$tmp2
veor $in1,$in1,$rndlast
add $ctr,$ctr,#3
aese $tmp0,q13
aesmc $tmp0,$tmp0
aese $tmp1,q13
aesmc $tmp1,$tmp1
// Note the logic to update $dat0, $dat1, and $dat2 is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
veor $in2,$in2,$rndlast
vmov.32 ${ivec}[3], $tctr0
aese $tmp2,q13
aesmc $tmp2,$tmp2
vorr $dat0,$ivec,$ivec
rev $tctr1,$tctr1
aese $tmp0,q14
aesmc $tmp0,$tmp0
vmov.32 ${ivec}[3], $tctr1
rev $tctr2,$ctr
aese $tmp1,q14
aesmc $tmp1,$tmp1
vorr $dat1,$ivec,$ivec
vmov.32 ${ivec}[3], $tctr2
aese $tmp2,q14
aesmc $tmp2,$tmp2
vorr $dat2,$ivec,$ivec
subs $len,$len,#3
aese $tmp0,q15
aese $tmp1,q15
aese $tmp2,q15
veor $in0,$in0,$tmp0
vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
vst1.8 {$in0},[$out],#16
veor $in1,$in1,$tmp1
mov $cnt,$rounds
vst1.8 {$in1},[$out],#16
veor $in2,$in2,$tmp2
vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vst1.8 {$in2},[$out],#16
b.hs .Loop3x_ctr32
adds $len,$len,#3
b.eq .Lctr32_done
cmp $len,#1
mov $step,#16
cclr $step,eq
.Lctr32_tail:
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
vld1.32 {q8},[$key_],#16
subs $cnt,$cnt,#2
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.32 {q9},[$key_],#16
b.gt .Lctr32_tail
aese $dat0,q8
aesmc $dat0,$dat0
aese $dat1,q8
aesmc $dat1,$dat1
aese $dat0,q9
aesmc $dat0,$dat0
aese $dat1,q9
aesmc $dat1,$dat1
vld1.8 {$in0},[$inp],$step
aese $dat0,q12
aesmc $dat0,$dat0
aese $dat1,q12
aesmc $dat1,$dat1
vld1.8 {$in1},[$inp]
aese $dat0,q13
aesmc $dat0,$dat0
aese $dat1,q13
aesmc $dat1,$dat1
veor $in0,$in0,$rndlast
aese $dat0,q14
aesmc $dat0,$dat0
aese $dat1,q14
aesmc $dat1,$dat1
veor $in1,$in1,$rndlast
aese $dat0,q15
aese $dat1,q15
cmp $len,#1
veor $in0,$in0,$dat0
veor $in1,$in1,$dat1
vst1.8 {$in0},[$out],#16
b.eq .Lctr32_done
vst1.8 {$in1},[$out]
.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
ldr x29,[sp],#16
ret
___
$code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
};
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vext\.8/ext/o or
s/vrev32\.8/rev32/o or
s/vtst\.8/cmtst/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
# fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
s/\.[ui]?32//o and s/\.16b/\.4s/go;
s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
local *unaes = sub {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte order below. The correct solution is to use the
	    # .inst directive, but older assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
};
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,303 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "538B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$code=<<___;
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch armv7-a
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
___
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
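# clmul64x64 above emits NEON code for the 128-bit carry-less product
# $r = $a * $b built from eight-way 8-bit polynomial multiplies
# (vmull.p8), per the Câmara-Gouvêa-López-Dahab method referenced above:
# partial products of byte-rotated copies of the operands are masked,
# shifted into place and XORed together.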
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,297 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
# NEON, the low and high halves of the 128-bit register q0 are accessible as
# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
# vN. Where the 32-bit version would use the upper half, this file must keep
# halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
# vshl.i64 q0, q0, #1
#
# in 64-bit, it would be written:
#
# shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
# polynomial and is conditioned on the PMULL extension. This file emulates the
# latter with the former.
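# As a rough reference model (illustrative notation only, nothing here
# is emitted): pmull vR.8h, vA.8b, vB.8b computes eight independent
# products R[i] = clmul8(A[i], B[i]), i = 0..7, while
# pmull vR.1q, vA.1d, vB.1d computes a single R = clmul64(A, B), where
# clmulN denotes carry-less (GF(2)[x]) multiplication of N-bit operands,
# i.e. XOR in place of addition. clmul64x64 below assembles the latter
# out of the former plus shifts, masks and XORs.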
use strict;
my $flavour = shift;
my $output;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));
my $code = "";
# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
ext $t0.8b, $a.8b, $a.8b, #1 // A1
pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
ext $r.8b, $b.8b, $b.8b, #1 // B1
pmull $r.8h, $a.8b, $r.8b // E = A*B1
ext $t1.8b, $a.8b, $a.8b, #2 // A2
pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
ext $t3.8b, $b.8b, $b.8b, #2 // B2
pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
ext $t2.8b, $a.8b, $a.8b, #3 // A3
eor $t0.16b, $t0.16b, $r.16b // L = E + F
pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
ext $r.8b, $b.8b, $b.8b, #3 // B3
eor $t1.16b, $t1.16b, $t3.16b // M = G + H
pmull $r.8h, $a.8b, $r.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
// vand \$t0#hi, \$t0#hi, \$k48
// veor \$t0#lo, \$t0#lo, \$t0#hi
//
// veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
// vand \$t1#hi, \$t1#hi, \$k32
// veor \$t1#lo, \$t1#lo, \$t1#hi
//
// veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
// vand \$t2#hi, \$t2#hi, \$k16
// veor \$t2#lo, \$t2#lo, \$t2#hi
//
// veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 \$t3#hi, #0
//
// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext $t3.8b, $b.8b, $b.8b, #4 // B4
eor $t2.16b, $t2.16b, $r.16b // N = I + J
pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
pmull $r.8h, $a.8b, $b.8b // D = A*B
ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
eor $t0.16b, $t0.16b, $t1.16b
eor $t2.16b, $t2.16b, $t3.16b
eor $r.16b, $r.16b, $t0.16b
eor $r.16b, $r.16b, $t2.16b
___
}
$code .= <<___;
.text
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {$t1.2d}, [x1] // load H
movi $t3.16b, #0xe1
shl $t3.2d, $t3.2d, #57 // 0xc2.0
ext $INlo.16b, $t1.16b, $t1.16b, #8
ushr $t2.2d, $t3.2d, #63
dup $t1.4s, $t1.s[1]
ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
ushr $t2.2d, $INlo.2d, #63
sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
and $t2.16b, $t2.16b, $t0.16b
shl $INlo.2d, $INlo.2d, #1
ext $t2.16b, $t2.16b, $t2.16b, #8
and $t0.16b, $t0.16b, $t1.16b
orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
st1 {$Hlo.2d}, [x0] // store Htable[0]
ret
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$INlo.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $INlo.16b, $INlo.16b // byteswap Xi
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
mov $len, #16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$Xl.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $Xl.16b, $Xl.16b // byteswap Xi
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
.Loop_neon:
ld1 {$INlo.16b}, [$inp], #16 // load inp
rev64 $INlo.16b, $INlo.16b // byteswap inp
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
.Lgmult_neon:
// Split the input into $INlo and $INhi. (The upper halves are unused,
// so it is okay to leave them alone.)
ins $INhi.d[0], $INlo.d[1]
___
&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo
$code .= <<___;
eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
___
&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi
$code .= <<___;
ext $t0.16b, $Xl.16b, $Xh.16b, #8
eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
eor $Xm.16b, $Xm.16b, $Xh.16b
eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins $Xh.d[0], $Xm.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl $t1.2d, $Xl.2d, #57 // 1st phase
shl $t2.2d, $Xl.2d, #62
eor $t2.16b, $t2.16b, $t1.16b //
shl $t1.2d, $Xl.2d, #63
eor $t2.16b, $t2.16b, $t1.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor $t2.16b, $t2.16b, $Xm.16b
ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
ushr $t2.2d, $Xl.2d, #1 // 2nd phase
eor $Xh.16b, $Xh.16b,$Xl.16b
eor $Xl.16b, $Xl.16b,$t2.16b //
ushr $t2.2d, $t2.2d, #6
ushr $Xl.2d, $Xl.2d, #1 //
eor $Xl.16b, $Xl.16b, $Xh.16b //
eor $Xl.16b, $Xl.16b, $t2.16b //
subs $len, $len, #16
bne .Loop_neon
rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
st1 {$Xl.16b}, [$Xi]
ret
.size gcm_ghash_neon,.-gcm_ghash_neon
.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align 2
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,460 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# March, May, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
# code paths: vanilla x86 and vanilla SSE. Former will be executed on
# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
# Atom 105/105 16.8 53
# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
# which is actually more relevant, because assembler code is
# position-independent;
# (***) see comment in non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
# in particular, see comment at the end of the file...
# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close is it to theoretical limit? The pclmulqdq
# instruction latency appears to be 14 cycles and there can't be more
# than 2 of them executing at any given time. This means that single
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
# pre- and post-processing. Then multiplication has to be followed by
# modulo-reduction. Given that aggregated reduction method [see
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
# white paper by Intel] allows you to perform reduction only once in
# a while we can assume that asymptotic performance can be estimated
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
# and Naggr is the aggregation factor.
#
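# As a worked example of the formula: Intel's parameters discussed
# below (Tmod ~19, Naggr=4) give (28+19/4)/16 ~= 2.05 cycles per
# processed byte, while this implementation's (Tmod ~13, Naggr=2) give
# (28+13/2)/16 ~= 2.16, the figures quoted in the following paragraphs.
#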
# Before we proceed to this implementation let's have closer look at
# the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies Tmod is estimated as ~19
# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
# processed byte. As implied, this is quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
# the result accounts even for pre-computing of degrees of the hash
# key H, but its portion is negligible at 16KB buffer size.
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
# 2.16. How is it possible that measured performance is better than
# optimistic theoretical estimate? There is one thing Intel failed
# to recognize. By serializing GHASH with CTR in same subroutine
# former's performance is really limited to above (Tmul + Tmod/Naggr)
# equation. But if GHASH procedure is detached, the modulo-reduction
# can be interleaved with Naggr-1 multiplications at instruction level
# and under ideal conditions even disappear from the equation. So that
# optimistic theoretical estimate for this implementation is ...
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
# bank allows to interleave reduction and multiplication better.
#
# Does it make sense to increase Naggr? To start with it's virtually
# impossible in 32-bit mode, because of limited register bank
# capacity. Otherwise improvement has to be weighed against slower
# setup, as well as code size and complexity increase. As even
# optimistic estimate doesn't promise 30% performance improvement,
# there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
# January 2010
#
# Tweaked to optimize transitions between integer and FP operations
# on same XMM register, PCLMULQDQ subroutine was measured to process
# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
# The minor regression on Westmere is outweighed by ~15% improvement
# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
#####################################################################
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
# 32-bit mode and 1.89 in 64-bit.
# February 2013
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9. Resulting performance is 1.96 cycles per byte on
# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
$x86only=0;
$sse2=1;
if (!$x86only) {{{
if ($sse2) {{
######################################################################
# PCLMULQDQ version.
$Xip="eax";
$Htbl="edx";
$const="ecx";
$inp="esi";
$len="ebx";
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
($Xn,$Xhn)=("xmm6","xmm7");
&static_label("bswap");
sub clmul64x64_T2 { # minimal "register" pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
&pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); #
&pxor ($T2,$Hkey) if (!defined($HK));
$HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T1,$HK,0x00); #######
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); #
&movdqa ($T2,$T1); #
&psrldq ($T1,8);
&pslldq ($T2,8); #
&pxor ($Xhi,$T1);
&pxor ($Xi,$T2); #
}
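# clmul64x64_T2 leaves the 256-bit carry-less product of $Xi and $Hkey
# in $Xhi:$Xi. It is a Karatsuba multiplication: the three pclmulqdq
# operations compute lo*lo, hi*hi and (lo^hi)*(lo'^hi'), and the
# xor/shift sequence folds the middle term into the two halves.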
if (1) { # Algorithm 9 with <<1 twist.
# Reduction is shorter and uses only two
# temporary registers, which makes it better
# candidate for interleaving with 64x64
# multiplication. Pre-modulo-scheduled loop
# was found to be ~20% faster than Algorithm 5
# below. Algorithm 9 was therefore chosen for
# further optimization...
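# In effect reduction_alg9 reduces the 256-bit value $Xhi:$Xi modulo
# the GHASH polynomial x^128 + x^7 + x^2 + x + 1 in the bit-reflected
# ("<<1 twisted") representation: the 1st phase applies cumulative left
# shifts by 57, 62 and 63 (= 64-7, 64-2, 64-1), the 2nd phase cumulative
# right shifts by 1, 2 and 7, i.e. the x^7 + x^2 + x terms.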
sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
&movdqa ($T2,$Xi); #
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
&psrlq ($Xi,1);
&pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
}
&function_begin_B("gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# <<1 twist
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
&movdqa ($T1,$Hkey);
&psllq ($Hkey,1);
&pxor ($T3,$T3); #
&psrlq ($T1,63);
&pcmpgtd ($T3,$T2); # broadcast carry bit
&pslldq ($T1,8);
&por ($Hkey,$T1); # H<<=1
# magic reduction
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
&pshufd ($T1,$Hkey,0b01001110);
&pshufd ($T2,$Xi,0b01001110);
&pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&palignr ($T2,$T1,8); # low part is H.lo^H.hi
&movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
&function_begin("gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail"));
#######
# Xi+2 = [H*(Ii+1 + Xi+1)] mod P =
#	 [(H*Ii+1) + (H*Xi+1)] mod P =
#	 [(H*Ii+1) + H^2*(Ii+Xi)] mod P
# (using Xi+1 = [H*(Ii+Xi)] mod P from the previous iteration)
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
&lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&nop ();
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
&set_label("mod_loop",32);
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
&pshufb ($Xhn,$T3);
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&pshufb ($Xn,$T3);
&pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
&movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
&movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
&movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
&pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
&set_label("even_tail");
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&xorps ($Xhi,$Xhn);
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&pxor ($T1,$Xhi); #
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
&test ($len,$len);
&jnz (&label("done"));
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
}
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
}} # $sse2
}}} # !$x86only
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# A question was raised about choice of vanilla MMX. Or rather why wasn't
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
# performance than *corresponding* SSE2 one even on contemporary CPUs.
# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
# implementation featuring full range of lookup-table sizes, but with
# per-invocation lookup table setup. Latter means that table size is
# chosen depending on how much data is to be hashed in every given call,
# more data - larger table. Best reported result for Core2 is ~4 cycles
# per processed byte out of 64KB block. This number accounts even for
# 64KB table setup overhead. As discussed in gcm128.c we choose to be
# more conservative in respect to lookup table sizes, but how do the
# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
# on same platform. As also discussed in gcm128.c, next in line "8-bit
# Shoup's" or "4KB" method should deliver twice the performance of
# "256B" one, in other words not worse than ~6 cycles per byte. It
# should also be noted that in SSE2 case improvement can be "super-
# linear," i.e. more than twice, mostly because >>8 maps to single
# instruction on SSE2 register. This is unlike "4-bit" case when >>4
# maps to same amount of instructions in both MMX and SSE2 cases.
# Bottom line is that switch to SSE2 is considered to be justifiable
# only in case we choose to implement "8-bit" method...

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,334 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
#
# (*) presented for reference/comparison purposes;
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
.code 32
#undef __thumb2__
___
################################################################################
# void gcm_init_clmul(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only few of 16 slots of Htable[16] are used;
# data is opaque to outside world (which allows to
# optimize the code independently);
#
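# In effect the "twist" below multiplies H by x (a left shift by one
# bit with a conditional XOR of the 0xc2...01 reduction constant), so
# Htable[0] stores this pre-multiplied value rather than H itself,
# matching the twisted-H convention noted above.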
$code.=<<___;
.global gcm_init_clmul
.type gcm_init_clmul,%function
.align 4
gcm_init_clmul:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull2.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $H2,$Xl,$t2
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
$code.=<<___;
@ calculate H^3 and H^4
vpmull.p64 $Xl,$H, $H2
vpmull.p64 $Yl,$H2,$H2
vpmull2.p64 $Xh,$H, $H2
vpmull2.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $H, $Xl,$t2 @ H^3
veor $H2,$Yl,$t3 @ H^4
vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
vext.8 $t1,$H2,$H2,#8
veor $t0,$t0,$H
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
___
}
$code.=<<___;
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
################################################################################
# void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_clmul;
# output: Xi - next hash value Xi;
#
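# (That is, Xi is replaced by Xi·H in GF(2^128), following GHASH's
# bit-reflected convention; the byte reversal inside the routine accounts
# for the little-endian layout of Xi.)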
$code.=<<___;
.global gcm_gmult_clmul
.type gcm_gmult_clmul,%function
.align 4
gcm_gmult_clmul:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
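@ Karatsuba: (H.lo+H.hi)*(Xi.lo+Xi.hi) equals H.lo*Xi.lo + H.hi*Xi.hi plus the
@ middle term, so the post-processing below xors Xl and Xh back into Xm to
@ recover that middle term before the reduction.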
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
my $arg=shift;
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
$3<8?$3:$3+8,($4 eq "lo")?0:1;
}
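# For example, the 256-bit recombination "vmov q2#lo,q1#hi" in gcm_gmult_clmul
# is rewritten as "ins v2.d[0],v1.d[1]"; operands q8 and above are renumbered
# upward by 8 to match the q->v register mapping applied below.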
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
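# Taken together, these rewrites turn a legacy line such as
#   vpmull.p64 q0,q12,q3
# into the AArch64 spelling
#   pmull v0.1q,v20.1d,v3.1d
# (the v prefix is dropped, q12 is renumbered to v20, and the .p64 suffix is
# re-expressed through the .1q destination and .1d source arrangement).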
} else { ######## 32-bit code
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
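# For example, the "vdup.32 q9,q9[1]" broadcast in gcm_init_clmul becomes
# "vdup.32 q9,d18[1]": 32-bit lane 1 of q9 lives in d18[1].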
sub unvpmullp64 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
$word |= 0x00010001 if ($mnemonic =~ "2");
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
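# For example, "vpmull.p64 q0,q12,q3" (the first multiply in gcm_gmult_clmul)
# should come out as
#   .byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3
# i.e. the word 0xf2a80e86 stored little-endian, selecting d24 (the low half
# of q12) and d6 (the low half of q3) as the 64-bit multiplicands.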
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,884 @@
#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# Adapted from the original x86_64 version and <appro@openssl.org>'s ARMv8
# version.
#
# armv7, aarch64, and x86_64 differ in several ways:
#
# * x86_64 SSSE3 instructions are two-address (destination operand is also a
# source), while NEON is three-address (destination operand is separate from
# two sources).
#
# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16.
#
# * x86_64 instructions can take memory references, while ARM is a load/store
# architecture. This means we sometimes need a spare register.
#
# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb),
# while armv7 only has a 64-bit byte shuffle (vtbl).
#
# This means this armv7 version must be a mix of both aarch64 and x86_64
# implementations. armv7 and aarch64 have analogous SIMD instructions, so we
# base the instructions on aarch64. However, we cannot use aarch64's register
# allocation. x86_64's register count matches, but x86_64 is two-address.
# vpaes-armv8.pl already accounts for this in the comments, which use
# three-address AVX instructions instead of the original SSSE3 ones. We base
# register usage on these comments, which are preserved in this file.
#
# This means we do not use separate input and output registers as in aarch64 and
# cannot pin as many constants in the preheat functions. However, the load/store
# architecture means we must still deviate from x86_64 in places.
#
# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source
# and destination and 128-bit table. Fortunately, armv7 also allows addressing
# upper and lower halves of each 128-bit register. The lower half of q{N} is
# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent
# instruction,
#
# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0.
#
# we write:
#
# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0.
# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1.
#
# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and
# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note,
# however, that destination (q0) and table (q1) registers may no longer match.
# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
# two-address pshufb always matched these operands, so this is common.)
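# The post-processing loop at the end of this file implements that renaming
# with a single substitution, roughly
#
#   s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge;
#
# so q0#lo becomes d0, q1#hi becomes d3, and the "vtbl.8 q1#lo, {q2}, q1#lo"
# spelling used in _vpaes_encrypt_core is printed as "vtbl.8 d2, {q2}, d2".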
#
# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
# expands to an ADD or SUB of the pc register to find an address. That immediate
# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
# This means larger values must be more aligned.
#
# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
# use either encoding (do we actually need to support this?). In ARM mode, the
# distances get large enough to require 16-byte alignment. Moving constants
# closer to their use resolves most of this, but common constants in
# _vpaes_consts are used by the whole file. Affected ADR instructions must be
# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
# constraint have been commented.
#
# For details on ARM's immediate value encoding scheme, see
# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
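# As a minimal sketch (not used by this script), a 32-bit value fits such an
# immediate iff some even rotation brings it into a single byte:
#
#   sub arm_imm_ok {
#       my ($v) = @_;
#       for my $r (map { 2 * $_ } 1 .. 15) {
#           return 1 if ((($v << $r) | ($v >> (32 - $r))) & 0xffffffff) <= 0xff;
#       }
#       return $v <= 0xff;   # rotation of zero
#   }
#
# For example, 0x0f00 is encodable while 0x102 is not, which is why larger
# pc-relative distances must land at more aligned offsets.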
#
# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
#
# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
#
# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones).
# aarch64 names registers like v0, and denotes half-width operations in an
# instruction suffix (see below).
#
# * aarch64 embeds size and lane information in register suffixes. v0.16b is
# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s.
# armv7 embeds the total size in the register name (see above) and the size of
# each element in an instruction suffix, which may look like vmov.i8,
# vshr.u8, or vtbl.8, depending on instruction.
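# For example, the armv7 "veor q0, q0, q2" corresponds to the aarch64
# "eor v0.16b, v0.16b, v2.16b", and "vshr.u8 q0, q0, #4" corresponds to
# "ushr v0.16b, v0.16b, #4".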
use strict;
my $flavour = shift;
my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir=$1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
my $code = "";
$code.=<<___;
.syntax unified
.arch armv7-a
.fpu neon
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
.text
.type _vpaes_consts,%object
.align 7 @ totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: @ mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:@ mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: @ sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
@
@ "Hot" constants
@
.Lk_inv: @ inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: @ input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: @ sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: @ sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: @ sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
.size _vpaes_consts,.-_vpaes_consts
.align 6
___
{
my ($inp,$out,$key) = map("r$_", (0..2));
my ($invlo,$invhi) = map("q$_", (10..11));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15));
$code.=<<___;
@@
@@ _aes_preheat
@@
@@ Fills q9-q15 as specified below.
@@
.type _vpaes_preheat,%function
.align 4
_vpaes_preheat:
adr r10, .Lk_inv
vmov.i8 q9, #0x0f @ .Lk_s0F
vld1.64 {q10,q11}, [r10]! @ .Lk_inv
add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
vld1.64 {q14,q15}, [r10] @ .Lk_sb2
bx lr
@@
@@ _aes_encrypt_core
@@
@@ AES-encrypt q0.
@@
@@ Inputs:
@@ q0 = input
@@ q9-q15 as in _vpaes_preheat
@@ [$key] = scheduled keys
@@
@@ Output in q0
@@ Clobbers q1-q5, r8-r11
@@ Preserves q6-q8 so you get some local vectors
@@
@@
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
mov r9, $key
ldr r8, [$key,#240] @ pull rounds
adr r11, .Lk_ipt
@ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
@ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
vld1.64 {q2, q3}, [r11]
adr r11, .Lk_mc_forward+16
vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0
vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1
vtbl.8 q1#hi, {q2}, q1#hi
vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2
vtbl.8 q2#hi, {q3}, q0#hi
veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
@ .Lenc_entry ends with a bne instruction which is normally paired with
@ subs in .Lenc_loop.
tst r8, r8
b .Lenc_entry
.align 4
.Lenc_loop:
@ middle of middle round
add r10, r11, #0x40
vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
vtbl.8 q4#hi, {$sb1t}, q2#hi
vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
vtbl.8 q0#hi, {$sb1u}, q3#hi
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
vtbl.8 q5#hi, {$sb2t}, q2#hi
veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
vtbl.8 q2#hi, {$sb2u}, q3#hi
vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
vtbl.8 q3#hi, {q0}, q1#hi
veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
@ Write to q5 instead of q0, so the table and destination registers do
@ not overlap.
vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
vtbl.8 q5#hi, {q0}, q4#hi
veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
vtbl.8 q4#hi, {q3}, q1#hi
@ Here we restore the original q0/q5 usage.
veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4
veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
subs r8, r8, #1 @ nr--
.Lenc_entry:
@ top of round
vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
vtbl.8 q5#hi, {$invhi}, q1#hi
veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
vtbl.8 q3#hi, {$invlo}, q0#hi
vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
vtbl.8 q4#hi, {$invlo}, q1#hi
veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
vtbl.8 q2#hi, {$invlo}, q3#hi
vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
vtbl.8 q3#hi, {$invlo}, q4#hi
veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
bne .Lenc_loop
@ middle of last round
add r10, r11, #0x80
adr r11, .Lk_sbo
@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
@ overlap table and destination registers.
vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
vtbl.8 q4#hi, {q1}, q2#hi
vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
@ Write to q2 instead of q0 below, to avoid overlapping table and
@ destination registers.
vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
vtbl.8 q2#hi, {q0}, q3#hi
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
@ Here we restore the original q0/q2 usage.
vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0
vtbl.8 q0#hi, {q2}, q1#hi
bx lr
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
___
}
{
my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3");
my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12));
$code.=<<___;
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ @@
@@ AES key schedule @@
@@ @@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ This function diverges from both x86_64 and aarch64 in which constants are
@ pinned. x86_64 has a common preheat function for all operations. aarch64
@ separates them because it has enough registers to pin nearly all constants.
@ armv7 does not have enough registers, but needing explicit loads and stores
@ also complicates using x86_64's register allocation directly.
@
@ We pin some constants for convenience and leave q14 and q15 free to load
@ others on demand.
@
@ Key schedule constants
@
.type _vpaes_key_consts,%object
.align 4
_vpaes_key_consts:
.Lk_rcon: @ rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt: @ output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: @ deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.size _vpaes_key_consts,.-_vpaes_key_consts
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
adr r11, .Lk_rcon
vmov.i8 $s63, #0x5b @ .Lk_s63
adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
vmov.i8 $s0F, #0x0f @ .Lk_s0F
vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv
vld1.64 {$rcon}, [r11] @ .Lk_rcon
bx lr
.size _vpaes_key_preheat,.-_vpaes_key_preheat
.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
@ We only need to save lr, but ARM requires an 8-byte stack alignment,
@ so save an extra register.
stmdb sp!, {r3,lr}
bl _vpaes_key_preheat @ load the tables
adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
@ input transform
@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
@ overlap table and destination.
vmov q4, q0 @ vmovdqa %xmm0, %xmm3
bl _vpaes_schedule_transform
adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
vmov q7, q0 @ vmovdqa %xmm0, %xmm7
add r8, r8, r10
@ encrypting, output zeroth round key after transform
vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx)
@ *ring*: Decryption removed.
.Lschedule_go:
cmp $bits, #192 @ cmp \$192, %esi
bhi .Lschedule_256
@ 128: fall through
@@
@@ .schedule_128
@@
@@ 128-bit specific part of key schedule.
@@
@@ This schedule is really simple, because all its parts
@@ are accomplished by the subroutines.
@@
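@@ (Ten passes of the loop below: nine store a round key via
@@ _vpaes_schedule_mangle and the tenth falls through to
@@ .Lschedule_mangle_last; together with the zeroth key stored above this
@@ yields the 11 AES-128 round keys.)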
.Lschedule_128:
mov $inp, #10 @ mov \$10, %esi
.Loop_schedule_128:
bl _vpaes_schedule_round
subs $inp, $inp, #1 @ dec %esi
beq .Lschedule_mangle_last
bl _vpaes_schedule_mangle @ write output
b .Loop_schedule_128
@@
@@ .aes_schedule_256
@@
@@ 256-bit specific part of key schedule.
@@
@@ The structure here is very similar to the 128-bit
@@ schedule, but with an additional "low side" in
@@ q6. The low side's rounds are the same as the
@@ high side's, except no rcon and no rotation.
@@
.align 4
.Lschedule_256:
vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform @ input transform
mov $inp, #7 @ mov \$7, %esi
.Loop_schedule_256:
bl _vpaes_schedule_mangle @ output low result
vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
@ high round
bl _vpaes_schedule_round
subs $inp, $inp, #1 @ dec %esi
beq .Lschedule_mangle_last
bl _vpaes_schedule_mangle
@ low round. swap xmm7 and xmm6
vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0
vmov.i8 q4, #0
vmov q5, q7 @ vmovdqa %xmm7, %xmm5
vmov q7, q6 @ vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
vmov q7, q5 @ vmovdqa %xmm5, %xmm7
b .Loop_schedule_256
@@
@@ .aes_schedule_mangle_last
@@
@@ Mangler for last round of key schedule
@@ Mangles q0
@@ when encrypting, outputs out(q0) ^ 63
@@ when decrypting, outputs unskew(q0)
@@
@@ Always called right before return... jumps to cleanup and exits
@@
.align 4
.Lschedule_mangle_last:
@ schedule last round key from xmm0
adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew
@ encrypting
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
add $out, $out, #32 @ add \$32, %rdx
vmov q2, q0
vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
vtbl.8 q0#hi, {q2}, q1#hi
.Lschedule_mangle_last_dec:
sub $out, $out, #16 @ add \$-16, %rdx
veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform @ output transform
vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key
@ cleanup
veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
ldmia sp!, {r3,pc} @ return
.size _vpaes_schedule_core,.-_vpaes_schedule_core
@@
@@ .aes_schedule_round
@@
@@ Runs one main round of the key schedule on q0, q7
@@
@@ Specifically, runs subbytes on the high dword of q0
@@ then rotates it by one byte and xors into the low dword of
@@ q7.
@@
@@ Adds rcon from low byte of q8, then rotates q8 for
@@ next rcon.
@@
@@ Smears the dwords of q7 by xoring the low into the
@@ second low, result into third, result into highest.
@@
@@ Returns results in q7 = q0.
@@ Clobbers q1-q4, r11.
@@
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
@ extract rcon from xmm8
vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1
vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8
veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
@ rotate
vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0
vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0
@ fall through...
@ low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
@ We pin other values in _vpaes_key_preheat, so load them now.
adr r11, .Lk_sb1
vld1.64 {q14,q15}, [r11]
@ smear xmm7
vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1
veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4
@ subbytes
vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i
veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
vtbl.8 q2#hi, {$invhi}, q1#hi
veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
vtbl.8 q3#hi, {$invlo}, q0#hi
veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
vtbl.8 q4#hi, {$invlo}, q1#hi
veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
vtbl.8 q3#hi, {$invlo}, q3#hi
veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
vtbl.8 q2#hi, {$invlo}, q4#hi
veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
vtbl.8 q4#hi, {q15}, q3#hi
vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
vtbl.8 q1#hi, {q14}, q2#hi
veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
@ add in smeared stuff
veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
bx lr
.size _vpaes_schedule_round,.-_vpaes_schedule_round
@@
@@ .aes_schedule_transform
@@
@@ Linear-transform q0 according to tables at [r11]
@@
@@ Requires that q9 = 0x0F0F... as in preheat
@@ Output in q0
@@ Clobbers q1, q2, q14, q15
@@
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
@ vmovdqa 16(%r11), %xmm1 # hi
vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1
vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0
vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2
vtbl.8 q2#hi, {q14}, q1#hi
vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0
vtbl.8 q0#hi, {q15}, q0#hi
veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
bx lr
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
@@
@@ .aes_schedule_mangle
@@
@@ Mangles q0 from (basis-transformed) standard version
@@ to our version.
@@
@@ On encrypt,
@@ xor with 0x63
@@ multiply by circulant 0,1,1,1
@@ apply shiftrows transform
@@
@@ On decrypt,
@@ xor with 0x63
@@ multiply by "inverse mixcolumns" circulant E,B,D,9
@@ deskew
@@ apply shiftrows transform
@@
@@
@@ Writes out to [r2], and increments or decrements it
@@ Keeps track of round number mod 4 in r8
@@ Preserves q0
@@ Clobbers q1-q5
@@
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
tst $dir, $dir
vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
@ encrypting
@ Write to q2 so we do not overlap table and destination below.
veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
add $out, $out, #16 @ add \$16, %rdx
vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4
vtbl.8 q4#hi, {q2}, q5#hi
vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1
vtbl.8 q1#hi, {q4}, q5#hi
vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3
vtbl.8 q3#hi, {q1}, q5#hi
veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
.Lschedule_mangle_both:
@ Write to q2 so table and destination do not overlap.
vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 q2#hi, {q3}, q1#hi
add r8, r8, #64-16 @ add \$-16, %r8
and r8, r8, #~(1<<6) @ and \$0x30, %r8
vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx)
bx lr
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
stmdb sp!, {r7-r11, lr}
vstmdb sp!, {d8-d15}
lsr r9, $bits, #5 @ shr \$5,%eax
add r9, r9, #5 @ \$5,%eax
str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
mov $dir, #0 @ mov \$0,%ecx
mov r8, #0x30 @ mov \$0x30,%r8d
bl _vpaes_schedule_core
eor r0, r0, r0
vldmia sp!, {d8-d15}
ldmia sp!, {r7-r11, pc} @ return
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
___
}
{
my ($out, $inp) = map("r$_", (0..1));
my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
$code .= <<___;
@ Additional constants for converting to bsaes.
.type _vpaes_convert_consts,%object
.align 4
_vpaes_convert_consts:
@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
@ transform in the AES S-box. 0x63 is incorporated into the low half of the
@ table. This was computed with the following script:
@
@ def u64s_to_u128(x, y):
@ return x | (y << 64)
@ def u128_to_u64s(w):
@ return w & ((1<<64)-1), w >> 64
@ def get_byte(w, i):
@ return (w >> (i*8)) & 0xff
@ def apply_table(table, b):
@ lo = b & 0xf
@ hi = b >> 4
@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
@ def opt(b):
@ table = [
@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
@ ]
@ return apply_table(table, b)
@ def rot_byte(b, n):
@ return 0xff & ((b << n) | (b >> (8-n)))
@ def skew(x):
@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
@ rot_byte(x, 4))
@ table = [0, 0]
@ for i in range(16):
@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
@ table[1] |= skew(opt(i<<4)) << (i*8)
@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
.Lk_opt_then_skew:
.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
.quad 0x1f30062936192f00, 0xb49bad829db284ab
@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
.globl vpaes_encrypt_key_to_bsaes
.type vpaes_encrypt_key_to_bsaes,%function
.align 4
vpaes_encrypt_key_to_bsaes:
stmdb sp!, {r11, lr}
@ See _vpaes_schedule_core for the key schedule logic. In particular,
@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
@ contain the transformations not in the bsaes representation. This
@ function inverts those transforms.
@
@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
@ representation, which does not match the other aes_nohw_*
@ implementations. The ARM aes_nohw_* stores each 32-bit word
@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
@ cost of extra REV and VREV32 operations in little-endian ARM.
vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform
adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
add r3, r2, #0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
vld1.64 {$mc_forward}, [r2]
vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64
adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied
@ vpaes stores one fewer round count than bsaes, but the number of keys
@ is the same.
ldr r2, [$inp,#240]
add r2, r2, #1
str r2, [$out,#240]
@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
@ Invert this with .Lk_opt.
vld1.64 {q0}, [$inp]!
bl _vpaes_schedule_transform
vrev32.8 q0, q0
vst1.64 {q0}, [$out]!
@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
.Loop_enc_key_to_bsaes:
vld1.64 {q0}, [$inp]!
@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
@ We use r3 rather than r8 to avoid a callee-saved register.
vld1.64 {q1}, [r3]
vtbl.8 q2#lo, {q0}, q1#lo
vtbl.8 q2#hi, {q0}, q1#hi
add r3, r3, #16
and r3, r3, #~(1<<6)
vmov q0, q2
@ Handle the last key differently.
subs r2, r2, #1
beq .Loop_enc_key_to_bsaes_last
@ Multiply by the circulant. This is its own inverse.
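@ (The 0,1,1,1 circulant is I plus the all-ones matrix J over GF(2);
@ (I+J)^2 = I + 2J + J^2 = I, since 2J = 0 and every entry of J^2 is 4 = 0
@ mod 2, so applying it twice gives back the original.)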
vtbl.8 q1#lo, {q0}, $mc_forward#lo
vtbl.8 q1#hi, {q0}, $mc_forward#hi
vmov q0, q1
vtbl.8 q2#lo, {q1}, $mc_forward#lo
vtbl.8 q2#hi, {q1}, $mc_forward#hi
veor q0, q0, q2
vtbl.8 q1#lo, {q2}, $mc_forward#lo
vtbl.8 q1#hi, {q2}, $mc_forward#hi
veor q0, q0, q1
@ XOR and finish.
veor q0, q0, $s63
bl _vpaes_schedule_transform
vrev32.8 q0, q0
vst1.64 {q0}, [$out]!
b .Loop_enc_key_to_bsaes
.Loop_enc_key_to_bsaes_last:
@ The final key does not have a basis transform (note
@ .Lschedule_mangle_last inverts the original transform). It only XORs
@ 0x63 and applies ShiftRows. The latter was already inverted in the
@ loop. Note that, because we act on the original representation, we use
@ $s63_raw, not $s63.
veor q0, q0, $s63_raw
vrev32.8 q0, q0
vst1.64 {q0}, [$out]
@ Wipe registers which contained key material.
veor q0, q0, q0
veor q1, q1, q1
veor q2, q2, q2
ldmia sp!, {r11, pc} @ return
.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
___
}
{
# Register-passed parameters.
my ($inp, $out, $len, $key) = map("r$_", 0..3);
# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and
# $tmp. $ctr is r7 because it must be preserved across calls.
my ($ctr, $ivec, $tmp) = map("r$_", 7..9);
# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
# const AES_KEY *key, const uint8_t ivec[16]);
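# Note that $len counts 16-byte blocks, not bytes: the loop below consumes one
# block per iteration.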
$code .= <<___;
.globl vpaes_ctr32_encrypt_blocks
.type vpaes_ctr32_encrypt_blocks,%function
.align 4
vpaes_ctr32_encrypt_blocks:
mov ip, sp
stmdb sp!, {r7-r11, lr}
@ This function uses q4-q7 (d8-d15), which are callee-saved.
vstmdb sp!, {d8-d15}
cmp $len, #0
@ $ivec is passed on the stack.
ldr $ivec, [ip]
beq .Lctr32_done
@ _vpaes_encrypt_core expects the key in r2, so swap $len and $key.
mov $tmp, $key
mov $key, $len
mov $len, $tmp
___
my ($len, $key) = ($key, $len);
$code .= <<___;
@ Load the IV and counter portion.
ldr $ctr, [$ivec, #12]
vld1.8 {q7}, [$ivec]
bl _vpaes_preheat
rev $ctr, $ctr @ The counter is big-endian.
.Lctr32_loop:
vmov q0, q7
vld1.8 {q6}, [$inp]! @ Load input ahead of time
bl _vpaes_encrypt_core
veor q0, q0, q6 @ XOR input and result
vst1.8 {q0}, [$out]!
subs $len, $len, #1
@ Update the counter.
add $ctr, $ctr, #1
rev $tmp, $ctr
vmov.32 q7#hi[1], $tmp
bne .Lctr32_loop
.Lctr32_done:
vldmia sp!, {d8-d15}
ldmia sp!, {r7-r11, pc} @ return
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
___
}
foreach (split("\n",$code)) {
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,824 @@
#! /usr/bin/env perl
# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that there is at least one popular
# SoC based on Cortex-A53 that doesn't have crypto extensions.
#
# CBC enc ECB enc/dec(*) [bit-sliced enc/dec]
# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ]
# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ]
# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ]
# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
#
# (*) ECB denotes approximate result for parallelizable modes
# such as CBC decrypt, CTR, etc.;
# (**) these results are worse than scalar compiler-generated
# code, but it's constant-time and therefore preferred;
# (***) presented for reference/comparison purposes;
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$code.=<<___;
.section .rodata
.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:// mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
//
// "Hot" constants
//
.Lk_inv: // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
//
// Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_rcon: // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt: // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size _vpaes_consts,.-_vpaes_consts
.align 6
.text
___
{
my ($inp,$out,$key) = map("x$_",(0..2));
my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
$code.=<<___;
##
## _aes_preheat
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
adrp x10, :pg_hi21:.Lk_inv
add x10, x10, :lo12:.Lk_inv
movi v17.16b, #0x0f
ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
## Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
mov x9, $key
ldr w8, [$key,#240] // pull rounds
adrp x11, :pg_hi21:.Lk_mc_forward+16
add x11, x11, :lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
b .Lenc_entry
.align 4
.Lenc_loop:
// middle of middle round
add x10, x11, #0x40
tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
sub w8, w8, #1 // nr--
.Lenc_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
cbnz w8, .Lenc_loop
// middle of last round
add x10, x11, #0x80
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
mov x9, $key
ldr w8, [$key,#240] // pull rounds
adrp x11, :pg_hi21:.Lk_mc_forward+16
add x11, x11, :lo12:.Lk_mc_forward+16
// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
and v9.16b, v15.16b, v17.16b
ushr v8.16b, v15.16b, #4
tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
tbl v9.16b, {$iptlo}, v9.16b
// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
tbl v10.16b, {$ipthi}, v8.16b
eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
eor v8.16b, v9.16b, v16.16b
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
eor v8.16b, v8.16b, v10.16b
b .Lenc_2x_entry
.align 4
.Lenc_2x_loop:
// middle of middle round
add x10, x11, #0x40
tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
tbl v12.16b, {$sb1t}, v10.16b
ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
tbl v8.16b, {$sb1u}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
tbl v13.16b, {$sb2t}, v10.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
tbl v10.16b, {$sb2u}, v11.16b
ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
tbl v11.16b, {v8.16b}, v1.16b
eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
eor v10.16b, v10.16b, v13.16b
tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
tbl v8.16b, {v8.16b}, v4.16b
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
eor v11.16b, v11.16b, v10.16b
tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
tbl v12.16b, {v11.16b},v1.16b
eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
eor v8.16b, v8.16b, v11.16b
and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
eor v8.16b, v8.16b, v12.16b
sub w8, w8, #1 // nr--
.Lenc_2x_entry:
// top of round
and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
and v9.16b, v8.16b, v17.16b
ushr v8.16b, v8.16b, #4
tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
tbl v13.16b, {$invhi},v9.16b
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
eor v9.16b, v9.16b, v8.16b
tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
tbl v11.16b, {$invlo},v8.16b
tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
tbl v12.16b, {$invlo},v9.16b
eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
eor v11.16b, v11.16b, v13.16b
eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
eor v12.16b, v12.16b, v13.16b
tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
tbl v10.16b, {$invlo},v11.16b
tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
tbl v11.16b, {$invlo},v12.16b
eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
eor v10.16b, v10.16b, v9.16b
eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
eor v11.16b, v11.16b, v8.16b
ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
cbnz w8, .Lenc_2x_loop
// middle of last round
add x10, x11, #0x80
// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
tbl v12.16b, {$sbou}, v10.16b
ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
tbl v8.16b, {$sbot}, v11.16b
eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
eor v12.16b, v12.16b, v16.16b
eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
eor v8.16b, v8.16b, v12.16b
tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
tbl v1.16b, {v8.16b},v1.16b
ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
___
}
{
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
$code.=<<___;
########################################################
## ##
## AES key schedule ##
## ##
########################################################
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
adrp x10, :pg_hi21:.Lk_inv
add x10, x10, :lo12:.Lk_inv
movi v16.16b, #0x5b // .Lk_s63
adrp x11, :pg_hi21:.Lk_sb1
add x11, x11, :lo12:.Lk_sb1
movi v17.16b, #0x0f // .Lk_s0F
ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
adrp x10, :pg_hi21:.Lk_dksd
add x10, x10, :lo12:.Lk_dksd
ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
adrp x11, :pg_hi21:.Lk_mc_forward
add x11, x11, :lo12:.Lk_mc_forward
ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
ld1 {v8.2d}, [x10] // .Lk_rcon
ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat
.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
AARCH64_SIGN_LINK_REGISTER
stp x29, x30, [sp,#-16]!
add x29,sp,#0
bl _vpaes_key_preheat // load the tables
ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
// input transform
mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
bl _vpaes_schedule_transform
mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10
add x10, x10, :lo12:.Lk_sr
add x8, x8, x10
// encrypting, output zeroth round key after transform
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
cmp $bits, #192 // cmp \$192, %esi
b.hi .Lschedule_256
b.eq .Lschedule_192
// 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
.Lschedule_128:
mov $inp, #10 // mov \$10, %esi
.Loop_schedule_128:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // write output
b .Loop_schedule_128
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
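## (The zeroth key was stored above; the four passes of the loop below emit
## 3+3+3+2 round keys via _vpaes_schedule_mangle, and .Lschedule_mangle_last
## writes the final one, for the 13 AES-192 round keys.)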
.align 4
.Lschedule_192:
sub $inp, $inp, #8
ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
bl _vpaes_schedule_transform // input transform
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
mov $inp, #4 // mov \$4, %esi
.Loop_schedule_192:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_round
ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
bl _vpaes_schedule_mangle // save key n
bl _vpaes_schedule_192_smear
bl _vpaes_schedule_mangle // save key n+1
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
bl _vpaes_schedule_mangle // save key n+2
bl _vpaes_schedule_192_smear
b .Loop_schedule_192
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
.align 4
.Lschedule_256:
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov $inp, #7 // mov \$7, %esi
.Loop_schedule_256:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
// high round
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
bl _vpaes_schedule_mangle
// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
movi v4.16b, #0
mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
b .Loop_schedule_256
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
.align 4
.Lschedule_mangle_last:
// schedule last round key from xmm0
adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
add x11, x11, :lo12:.Lk_deskew
cbnz $dir, .Lschedule_mangle_last_dec
// encrypting
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
add x11, x11, :lo12:.Lk_opt
add $out, $out, #32 // add \$32, %rdx
tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
.Lschedule_mangle_last_dec:
ld1 {v20.2d-v21.2d}, [x11] // reload constants
sub $out, $out, #16 // add \$-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
// cleanup
eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
ldp x29, x30, [sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
movi v1.16b, #0
dup v0.4s, v7.s[3]
ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
// extract rcon from xmm8
movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
// rotate
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0
// fall through...
// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
// smear xmm7
ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4
// subbytes
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
// add in smeared stuff
eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
// vmovdqa (%r11), %xmm2 # lo
tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
// vmovdqa 16(%r11), %xmm1 # hi
tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Preserves xmm0
## Clobbers xmm1-xmm5
##
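##
## Illustrative note (added; not from the original source): on this encrypt
## path, the three lookups through .Lk_mc_forward followed by xors compute
## rot1(x) ^ rot2(x) ^ rot3(x), where rotN rotates each 4-byte column by N
## bytes; that is exactly the multiplication by the circulant 0,1,1,1
## described above.
##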
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
// vmovdqa .Lk_mc_forward(%rip),%xmm5
// encrypting
eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
add $out, $out, #16 // add \$16, %rdx
tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
.Lschedule_mangle_both:
tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
add x8, x8, #48 // add \$-16, %r8
and x8, x8, #~(1<<6) // and \$0x30, %r8
st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so
lsr w9, $bits, #5 // shr \$5,%eax
add w9, w9, #5 // \$5,%eax
str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
mov $dir, #0 // mov \$0,%ecx
mov x8, #0x30 // mov \$0x30,%r8d
bl _vpaes_schedule_core
eor x0, x0, x0
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
___
}
{
my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4));
my ($ctr, $ctr_tmp) = ("w6", "w7");
# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len,
# const AES_KEY *key, const uint8_t ivec[16]);
$code.=<<___;
.globl vpaes_ctr32_encrypt_blocks
.type vpaes_ctr32_encrypt_blocks,%function
.align 4
vpaes_ctr32_encrypt_blocks:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
stp d8,d9,[sp,#-16]! // ABI spec says so
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
cbz $len, .Lctr32_done
// Note, unlike the other functions, $len here is measured in blocks,
// not bytes.
mov x17, $len
mov x2, $key
// Load the IV and counter portion.
ldr $ctr, [$ivec, #12]
ld1 {v7.16b}, [$ivec]
bl _vpaes_encrypt_preheat
tst x17, #1
rev $ctr, $ctr // The counter is big-endian.
b.eq .Lctr32_prep_loop
// Handle one block so the remaining block count is even for
// _vpaes_encrypt_2x.
ld1 {v6.16b}, [$inp], #16 // Load input ahead of time
bl _vpaes_encrypt_core
eor v0.16b, v0.16b, v6.16b // XOR input and result
st1 {v0.16b}, [$out], #16
subs x17, x17, #1
// Update the counter.
add $ctr, $ctr, #1
rev $ctr_tmp, $ctr
mov v7.s[3], $ctr_tmp
b.ls .Lctr32_done
.Lctr32_prep_loop:
// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
// uses v14 and v15.
mov v15.16b, v7.16b
mov v14.16b, v7.16b
add $ctr, $ctr, #1
rev $ctr_tmp, $ctr
mov v15.s[3], $ctr_tmp
.Lctr32_loop:
ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time
bl _vpaes_encrypt_2x
eor v0.16b, v0.16b, v6.16b // XOR input and result
eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
st1 {v0.16b,v1.16b}, [$out], #32
subs x17, x17, #2
// Update the counter.
add $ctr_tmp, $ctr, #1
add $ctr, $ctr, #2
rev $ctr_tmp, $ctr_tmp
mov v14.s[3], $ctr_tmp
rev $ctr_tmp, $ctr
mov v15.s[3], $ctr_tmp
b.hi .Lctr32_loop
.Lctr32_done:
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,617 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/75.4(***)
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0]);
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST")
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
# 192-bit key support was removed.
# 128: fall though
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
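# Note (added; not from the original source): this decrypt-schedule path
# appears to be unreachable in this build, since only ${PREFIX}_set_encrypt_key
# calls _vpaes_schedule_core and it always passes $out == 0; the $k_dksd
# constants referenced just below are also no longer defined in this file.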
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("${PREFIX}_set_encrypt_key");
record_function_hit(5);
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_encrypt");
record_function_hit(4);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

@@ -0,0 +1,739 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global bn_mul_mont_nohw
.type bn_mul_mont_nohw,%function
.align 5
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global bn_mul8x_mont_neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

@@ -0,0 +1,334 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=1;
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that a villain thread hits
# the guard page before it can do damage to an innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,154 @@
// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H
#include <ring-core/base.h>
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
#pragma warning(push, 3)
#include <intrin.h>
#pragma warning(pop)
#pragma intrinsic(_umul128)
#endif
#include "../../internal.h"
typedef crypto_word_t BN_ULONG;
#if defined(OPENSSL_64_BIT)
#if defined(BORINGSSL_HAS_UINT128)
// MSVC doesn't support two-word integers on 64-bit.
#define BN_ULLONG uint128_t
#endif
#define BN_BITS2 64
#define BN_MONT_CTX_N0_LIMBS 1
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo), 0
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
#elif defined(OPENSSL_32_BIT)
#define BN_ULLONG uint64_t
#define BN_BITS2 32
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1] and shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
#define BN_MONT_CTX_N0(hi, lo) TOBN(hi, lo)
#define TOBN(hi, lo) (lo), (hi)
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
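// Illustrative note (added; not from the original source): with the
// definitions above, TOBN(0x00000001, 0x00000002) is the single limb
// 0x0000000100000002 on 64-bit targets, and BN_MONT_CTX_N0() pads it with a
// zero limb; on 32-bit targets the same TOBN() expands to the two limbs
// 0x00000002, 0x00000001, least-significant word first.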
// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|
// used with Montgomery reduction. Ideally this limit would be applied to all
// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
// values for other operations.
// #define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG))
// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to
// an |N0|.
//
// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
//
// |num| must be at least 4, at least on x86.
//
// In other forks, |bn_mul_mont| returns an |int| indicating whether it
// actually did the multiplication. All our implementations always do the
// multiplication, and forcing callers to deal with the possibility of it
// failing just leads to further problems.
OPENSSL_STATIC_ASSERT(sizeof(int) == sizeof(size_t) ||
(sizeof(int) == 4 && sizeof(size_t) == 8),
"int and size_t ABI mismatch");
#if defined(OPENSSL_X86_64)
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#elif defined(OPENSSL_AARCH64)
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
// No point in optimizing for P-256 because P-256 doesn't call into
// this on AArch64.
bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#elif defined(OPENSSL_ARM)
void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
// Approximate what `bn_mul_mont` did so that the NEON version is used for
// P-256 when practical.
if (num == 8) {
// XXX: This should not be accessing `neon_available` directly.
if (neon_available) {
bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
return;
}
}
bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#else
void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
static inline void bn_mul_mont_small(
BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
bn_mul_mont(rp, ap, bp, np, n0, num);
}
#endif
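// The helper below is an illustrative addition, not part of the original
// header: it sketches how |bn_mul_mont_small| is typically chained while
// staying in the Montgomery domain, here for squaring. The name is made up;
// the arguments follow the |bn_mul_mont| contract documented above.
static inline void bn_mont_sqr_small_example(BN_ULONG *rp, const BN_ULONG *ap,
                                             const BN_ULONG *np,
                                             const BN_ULONG *n0, size_t num) {
  // rp = ap * ap * R^-1 mod np. With |ap| already in Montgomery form, the
  // result is again in Montgomery form and can feed further multiplications
  // without converting in and out of the domain.
  bn_mul_mont_small(rp, ap, ap, np, n0, num);
}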
static inline void bn_umult_lohi(BN_ULONG *low_out, BN_ULONG *high_out,
BN_ULONG a, BN_ULONG b) {
#if defined(OPENSSL_X86_64) && defined(_MSC_VER) && !defined(__clang__)
*low_out = _umul128(a, b, high_out);
#else
BN_ULLONG result = (BN_ULLONG)a * b;
*low_out = (BN_ULONG)result;
*high_out = (BN_ULONG)(result >> BN_BITS2);
#endif
}
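// Added note (not from the original source): as a spot check, calling
// bn_umult_lohi() with both inputs equal to the all-ones word produces
// *low_out == 1 and *high_out == (BN_ULONG)-2, since
// (2^w - 1)^2 == (2^w - 2)*2^w + 1 for the word width w.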
#endif // OPENSSL_HEADER_BN_INTERNAL_H


@@ -0,0 +1,64 @@
// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "internal.h"
#include "../../internal.h"
#include "../../limbs/limbs.h"
#include "../../limbs/limbs.inl"
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
"BN_MONT_CTX_N0_LIMBS value is invalid");
OPENSSL_STATIC_ASSERT(
sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
"uint64_t is insufficient precision for n0");
int bn_from_montgomery_in_place(BN_ULONG r[], size_t num_r, BN_ULONG a[],
size_t num_a, const BN_ULONG n[],
size_t num_n,
const BN_ULONG n0_[BN_MONT_CTX_N0_LIMBS]) {
if (num_n == 0 || num_r != num_n || num_a != 2 * num_n) {
return 0;
}
// Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
// input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
// includes |carry| which is stored separately.
BN_ULONG n0 = n0_[0];
BN_ULONG carry = 0;
for (size_t i = 0; i < num_n; i++) {
BN_ULONG v = limbs_mul_add_limb(a + i, n, a[i] * n0, num_n);
v += carry + a[i + num_n];
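// Added note (not from the original source): the next two lines derive the
// carry out of the addition above in constant time. If the incoming |carry|
// was 0, the sum wrapped iff v < a[i + num_n]; if it was 1, it wrapped iff
// v <= a[i + num_n]. Both cases collapse to
// (carry | (v != a[i + num_n])) & (v <= a[i + num_n]).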
carry |= (v != a[i + num_n]);
carry &= (v <= a[i + num_n]);
a[i + num_n] = v;
}
// Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a|
// includes |carry| which is stored separately.
a += num_n;
  // |a| thus requires at most one additional subtraction of |n| to be reduced.
// Subtract |n| and select the answer in constant time.
BN_ULONG v = limbs_sub(r, a, n, num_n) - carry;
// |v| is one if |a| - |n| underflowed or zero if it did not. Note |v| cannot
// be -1. That would imply the subtraction did not fit in |num_n| words, and
// we know at most one subtraction is needed.
v = 0u - v;
for (size_t i = 0; i < num_n; i++) {
r[i] = constant_time_select_w(v, a[i], r[i]);
a[i] = 0;
}
return 1;
}
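// Illustrative use (a sketch; |num|, |n|, |n0|, and |MAX_LIMBS| are
// hypothetical caller-side names): reduce a double-width Montgomery product
// back down to |num| limbs.
//
//   BN_ULONG tmp[2 * MAX_LIMBS];  // filled with the 2*num-limb product a*b
//   BN_ULONG r[MAX_LIMBS];
//   if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, n, num, n0)) {
//     return 0;  // inconsistent lengths
//   }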

View File

@@ -0,0 +1,105 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "internal.h"
#include "../../internal.h"
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
"BN_MONT_CTX_N0_LIMBS value is invalid");
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
"uint64_t is insufficient precision for n0");
// LG_LITTLE_R is log_2(r).
#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
// bn_neg_inv_mod_r_u64 calculates -1/n mod r; i.e. it calculates |v|
// such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n|
// must be odd.
//
// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery
// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf).
// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and
// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000"
// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21).
//
// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion"
// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is
// constant-time with respect to |n|. We assume uint64_t additions,
// subtractions, shifts, and bitwise operations are all constant time, which
// may be a large leap of faith on 32-bit targets. We avoid division and
// multiplication, which tend to be the most problematic in terms of timing
// leaks.
//
// Most GCD implementations return values such that |u*r + v*n == 1|, so the
// caller would have to negate the resultant |v| for the purpose of Montgomery
// multiplication. This implementation does the negation implicitly by doing
// the computations as a difference instead of a sum.
uint64_t bn_neg_inv_mod_r_u64(uint64_t n) {
dev_assert_secret(n % 2 == 1);
// alpha == 2**(lg r - 1) == r / 2.
static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1);
const uint64_t beta = n;
uint64_t u = 1;
uint64_t v = 0;
// The invariant maintained from here on is:
// 2**(lg r - i) == u*2*alpha - v*beta.
for (size_t i = 0; i < LG_LITTLE_R; ++i) {
#if BN_BITS2 == 64 && defined(BN_ULLONG)
dev_assert_secret((BN_ULLONG)(1) << (LG_LITTLE_R - i) ==
((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif
// Delete a common factor of 2 in u and v if |u| is even. Otherwise, set
// |u = (u + beta) / 2| and |v = (v / 2) + alpha|.
uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0.
// The addition can overflow, so use Dietz's method for it.
//
// Dietz calculates (x+y)/2 by (x xor y)>>1 + x&y. This is valid for all
// (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values
// (embedded in 64 bits so that overflow can be ignored):
//
// (declare-fun x () (_ BitVec 64))
// (declare-fun y () (_ BitVec 64))
// (assert (let (
// (one (_ bv1 64))
// (thirtyTwo (_ bv32 64)))
// (and
// (bvult x (bvshl one thirtyTwo))
// (bvult y (bvshl one thirtyTwo))
// (not (=
// (bvadd (bvlshr (bvxor x y) one) (bvand x y))
// (bvlshr (bvadd x y) one)))
// )))
// (check-sat)
uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0.
u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd);
uint64_t alpha_if_u_is_odd = alpha & u_is_odd; /* Either |alpha| or 0. */
v = (v >> 1) + alpha_if_u_is_odd;
}
// The invariant now shows that u*r - v*n == 1 since r == 2 * alpha.
#if BN_BITS2 == 64 && defined(BN_ULLONG)
declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif
return v;
}
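// Illustrative check (a sketch, not part of the library): since r == 2**64,
// the result satisfies v * n == -1 (mod 2**64), so a caller could sanity-check
// the inversion using ordinary wrapping 64-bit arithmetic:
//
//   uint64_t v = bn_neg_inv_mod_r_u64(n);
//   dev_assert_secret(v * n == UINT64_MAX);  // the product wraps mod 2**64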

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,52 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "ecp_nistz.h"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconversion"
#endif
/* Fills |str| with the bytewise little-endian encoding of |scalar|, where
* |scalar| has |num_limbs| limbs. |str| is padded with zeros at the end up
* to |str_len| bytes. Actually, |str_len| must be exactly one byte more than
* needed to encode |num_limbs| losslessly, so that there is an extra byte at
* the end. The extra byte is useful because the caller will be breaking |str|
* up into windows of a number of bits (5 or 7) that isn't divisible by 8, and
* so it is useful for it to be able to read an extra zero byte. */
void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
const Limb scalar[],
size_t num_limbs) {
debug_assert_nonsecret(str_len == (num_limbs * sizeof(Limb)) + 1);
size_t i;
for (i = 0; i < num_limbs * sizeof(Limb); i += sizeof(Limb)) {
Limb d = scalar[i / sizeof(Limb)];
str[i + 0] = d & 0xff;
str[i + 1] = (d >> 8) & 0xff;
str[i + 2] = (d >> 16) & 0xff;
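    // Note: |d >>= 24| below intentionally updates |d| so that, for 64-bit
    // limbs, the branch that follows continues from bit 32 of the original
    // limb after one more |d >>= 8|.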
str[i + 3] = (d >>= 24) & 0xff;
if (sizeof(Limb) == 8) {
d >>= 8;
str[i + 4] = d & 0xff;
str[i + 5] = (d >> 8) & 0xff;
str[i + 6] = (d >> 16) & 0xff;
str[i + 7] = (d >> 24) & 0xff;
}
}
for (; i < str_len; i++) {
str[i] = 0;
}
}

View File

@@ -0,0 +1,274 @@
/* Copyright (c) 2015, Google Inc.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ_H
#define OPENSSL_HEADER_EC_ECP_NISTZ_H
#include <ring-core/base.h>
#include "../../limbs/limbs.h"
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// This function looks at `w + 1` scalar bits (`w` current, 1 adjacent less
// significant bit), and recodes them into a signed digit for use in fast point
// multiplication: the use of signed rather than unsigned digits means that
// fewer points need to be precomputed, given that point inversion is easy (a
// precomputed point dP makes -dP available as well).
//
// BACKGROUND:
//
// Signed digits for multiplication were introduced by Booth ("A signed binary
// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
// Booth's original encoding did not generally improve the density of nonzero
// digits over the binary representation, and was merely meant to simplify the
// handling of signed factors given in two's complement; but it has since been
// shown to be the basis of various signed-digit representations that do have
// further advantages, including the wNAF, using the following general
// approach:
//
// (1) Given a binary representation
//
// b_k ... b_2 b_1 b_0,
//
// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
// by using bit-wise subtraction as follows:
//
// b_k b_(k-1) ... b_2 b_1 b_0
// - b_k ... b_3 b_2 b_1 b_0
// -----------------------------------------
// s_(k+1) s_k ... s_3 s_2 s_1 s_0
//
// A left-shift followed by subtraction of the original value yields a new
// representation of the same value, using signed bits s_i = b_(i-1) - b_i.
// This representation from Booth's paper has since appeared in the
// literature under a variety of different names including "reversed binary
// form", "alternating greedy expansion", "mutual opposite form", and
// "sign-alternating {+-1}-representation".
//
// An interesting property is that among the nonzero bits, values 1 and -1
// strictly alternate.
//
// (2) Various window schemes can be applied to the Booth representation of
// integers: for example, right-to-left sliding windows yield the wNAF
// (a signed-digit encoding independently discovered by various researchers
// in the 1990s), and left-to-right sliding windows yield a left-to-right
// equivalent of the wNAF (independently discovered by various researchers
// around 2004).
//
// To prevent leaking information through side channels in point multiplication,
// we need to recode the given integer into a regular pattern: sliding windows
// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
// decades older: we'll be using the so-called "modified Booth encoding" due to
// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
// (1961), pp. 67-91), in a radix-2**w setting. That is, we always combine `w`
// signed bits into a signed digit, e.g. (for `w == 5`):
//
// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j)
//
// The sign-alternating property implies that the resulting digit values are
// integers from `-2**(w-1)` to `2**(w-1)`, e.g. -16 to 16 for `w == 5`.
//
// Of course, we don't actually need to compute the signed digits s_i as an
// intermediate step (that's just a nice way to see how this scheme relates
// to the wNAF): a direct computation obtains the recoded digit from the
// six bits b_(5j + 4) ... b_(5j - 1).
//
// This function takes those `w` bits as an integer (e.g. 0 .. 63), writing the
// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
// value, in the range 0 .. 2**(w-1)). Note that this integer essentially provides
// the input bits "shifted to the left" by one position: for example, the input
// to compute the least significant recoded digit, given that there's no bit
// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
//
// DOUBLING CASE:
//
// Point addition formulas for short Weierstrass curves are often incomplete.
// Edge cases such as P + P or P + ∞ must be handled separately. This
// complicates constant-time requirements. P + ∞ cannot be avoided (any window
// may be zero) and is handled with constant-time selects. P + P (where P is not
// ∞) usually is not. Instead, windowing strategies are chosen to avoid this
// case. Whether this happens depends on the group order.
//
// Let w be the window width (in this function, w = 5). The non-trivial doubling
// case in single-point scalar multiplication may occur if and only if the
// 2^(w-1) bit of the group order is zero.
//
// Note the above only holds if the scalar is fully reduced and the group order
// is a prime that is much larger than 2^w. It also only holds when windows
// are applied from most significant to least significant, doubling between each
// window. It does not apply to more complex table strategies such as
// |EC_nistz256_method|.
//
// PROOF:
//
// Let n be the group order. Let l be the number of bits needed to represent n.
// Assume there exists some 0 <= k < n such that signed w-bit windowed
// multiplication hits the doubling case.
//
// Windowed multiplication consists of iterating over groups of s_i (defined
// above based on k's binary representation) from most to least significant. At
// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant
// window), we:
//
// 1. Double the accumulator A, w times. Let A_i be the value of A at this
// point.
//
// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P
// corresponding to the window s_(i+w-1) ... s_i.
//
// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as
// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i.
// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is
// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i =
// 2^w * (a_(i+w) + t_(i+w)).
//
// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it
// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i.
// This is computed as:
//
// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1)
// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i
// --------------------------------------------
// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i
//
// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer
// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i.
//
// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x)
// = x - 2^(w-1)*b_(i+w-1) + b_(i-1)
//
// Or, using C notation for bit operations:
//
// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1
//
// Note b_(i-1) is added in left-shifted by one (or doubled) from its place.
// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed
// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C
// notation, this is:
//
// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w
//
// Observe that, while t_i may be positive or negative, a_i is bounded by
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
// all groups. That would imply the subsequent a_i is zero, which means all
// terms thus far were zero.)
//
// Returning to our doubling position, we have a_j = t_j (mod n). We now
// determine the value of a_j - t_j, which must be divisible by n. Our bounds on
// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w
// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if
// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n.
//
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
//
// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j
// <= k/2^j + 2^w - t_j
// < n/2^w + 2^w + 2^(w-1)
//
// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final
// addition may hit the doubling case.
//
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the
// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0.
// That is:
//
// - 2^w divides k_H
// - k_M is 0 or 2^(w-1)
// - 0 <= k_L < 2^(w-1)
//
// Divide n into n_H + n_M + n_L similarly. We thus have:
//
// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1
// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1))
// = k_L - k_M
//
// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w
// = (k>>w) << w + ((k>>(w-1)) & 1) << w
// = k_H + 2*k_M
//
// n = a_0 - t_0
// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M)
// = k_H + 3*k_M - k_L
//
// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be
// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H.
// Then,
//
// n_M + n_L = 3*(2^(w-1)) - k_L
// > 3*(2^(w-1)) - 2^(w-1)
// = 2^w
//
// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose
// k_H < n_H - 2*2^w. Then,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// < n_H - 2*2^w + 3*(2^(w-1)) - k_L
// n_M + n_L < -2^(w-1) - k_L
//
// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// = n_H - 2^w + 3*(2^(w-1)) - k_L
// n_M + n_L = 2^(w-1) - k_L
// <= 2^(w-1)
//
// Equality would mean 2^(w-1) divides n, which is impossible if n is prime.
// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition.
//
// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w,
// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point
// doubling in the final addition and is the only such scalar.
//
// COMMON CURVES:
//
// The group orders for common curves end in the following bit patterns:
//
// P-521: ...00001001; w = 4 is okay
// P-384: ...01110011; w = 2, 5, 6, 7 are okay
// P-256: ...01010001; w = 5, 7 are okay
// P-224: ...00111101; w = 3, 4, 5, 6 are okay
static inline void booth_recode(crypto_word_t *is_negative, crypto_word_t *digit,
crypto_word_t in, crypto_word_t w) {
debug_assert_nonsecret(w >= 2);
debug_assert_nonsecret(w <= 7);
// Set all bits of `s` to MSB(in), similar to |constant_time_msb_s|,
  // but with 'in' treated as a (`w+1`)-bit value.
crypto_word_t s = ~((in >> w) - 1);
crypto_word_t d;
d = ((crypto_word_t)1u << (w + 1)) - in - 1;
d = (d & s) | (in & ~s);
d = (d >> 1) + (d & 1);
*is_negative = constant_time_is_nonzero_w(s & 1);
*digit = d;
}
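// Worked examples (illustrative only), with w == 5 so |in| is the 6-bit value
// b_(i+4) ... b_i b_(i-1) and the recoded digit is x + b_(i-1) - 16*b_(i+4),
// where x is the middle four bits:
//
//   in == 0b001101 (13): s == 0, d recodes to 7, *is_negative == 0
//                        (x == 6, plus the trailing 1, minus 0).
//   in == 0b110110 (54): s == all-ones, d == 63 - 54 == 9 recodes to 5,
//                        *is_negative == 1 (x == 11, plus 0, minus 16 == -5).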
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif
void little_endian_bytes_from_scalar(uint8_t str[], size_t str_len,
const Limb scalar[],
size_t num_limbs);
#endif // OPENSSL_HEADER_EC_ECP_NISTZ_H

View File

@@ -0,0 +1,34 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#ifndef OPENSSL_HEADER_EC_ECP_NISTZ384_H
#define OPENSSL_HEADER_EC_ECP_NISTZ384_H
#include "../../limbs/limbs.h"
#define P384_LIMBS (384u / LIMB_BITS)
typedef struct {
Limb X[P384_LIMBS];
Limb Y[P384_LIMBS];
Limb Z[P384_LIMBS];
} P384_POINT;
typedef struct {
Limb X[P384_LIMBS];
Limb Y[P384_LIMBS];
} P384_POINT_AFFINE;
#endif // OPENSSL_HEADER_EC_ECP_NISTZ384_H

View File

@@ -0,0 +1,300 @@
/* Copyright (c) 2014, Intel Corporation.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
/* Developers and authors:
* Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center
* (2) University of Haifa
* Reference:
* Shay Gueron and Vlad Krasnov
* "Fast Prime Field Elliptic Curve Cryptography with 256 Bit Primes"
* http://eprint.iacr.org/2013/816 */
#include "ecp_nistz.h"
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
/* Point double: r = 2*a */
static void nistz384_point_double(P384_POINT *r, const P384_POINT *a) {
BN_ULONG S[P384_LIMBS];
BN_ULONG M[P384_LIMBS];
BN_ULONG Zsqr[P384_LIMBS];
BN_ULONG tmp0[P384_LIMBS];
const BN_ULONG *in_x = a->X;
const BN_ULONG *in_y = a->Y;
const BN_ULONG *in_z = a->Z;
BN_ULONG *res_x = r->X;
BN_ULONG *res_y = r->Y;
BN_ULONG *res_z = r->Z;
elem_mul_by_2(S, in_y);
elem_sqr_mont(Zsqr, in_z);
elem_sqr_mont(S, S);
elem_mul_mont(res_z, in_z, in_y);
elem_mul_by_2(res_z, res_z);
elem_add(M, in_x, Zsqr);
elem_sub(Zsqr, in_x, Zsqr);
elem_sqr_mont(res_y, S);
elem_div_by_2(res_y, res_y);
elem_mul_mont(M, M, Zsqr);
elem_mul_by_3(M, M);
elem_mul_mont(S, S, in_x);
elem_mul_by_2(tmp0, S);
elem_sqr_mont(res_x, M);
elem_sub(res_x, res_x, tmp0);
elem_sub(S, S, res_x);
elem_mul_mont(S, S, M);
elem_sub(res_y, S, res_y);
}
/* Point addition: r = a+b */
static void nistz384_point_add(P384_POINT *r, const P384_POINT *a,
const P384_POINT *b) {
BN_ULONG U2[P384_LIMBS], S2[P384_LIMBS];
BN_ULONG U1[P384_LIMBS], S1[P384_LIMBS];
BN_ULONG Z1sqr[P384_LIMBS];
BN_ULONG Z2sqr[P384_LIMBS];
BN_ULONG H[P384_LIMBS], R[P384_LIMBS];
BN_ULONG Hsqr[P384_LIMBS];
BN_ULONG Rsqr[P384_LIMBS];
BN_ULONG Hcub[P384_LIMBS];
BN_ULONG res_x[P384_LIMBS];
BN_ULONG res_y[P384_LIMBS];
BN_ULONG res_z[P384_LIMBS];
const BN_ULONG *in1_x = a->X;
const BN_ULONG *in1_y = a->Y;
const BN_ULONG *in1_z = a->Z;
const BN_ULONG *in2_x = b->X;
const BN_ULONG *in2_y = b->Y;
const BN_ULONG *in2_z = b->Z;
BN_ULONG in1infty = is_zero(a->Z);
BN_ULONG in2infty = is_zero(b->Z);
elem_sqr_mont(Z2sqr, in2_z); /* Z2^2 */
elem_sqr_mont(Z1sqr, in1_z); /* Z1^2 */
elem_mul_mont(S1, Z2sqr, in2_z); /* S1 = Z2^3 */
elem_mul_mont(S2, Z1sqr, in1_z); /* S2 = Z1^3 */
elem_mul_mont(S1, S1, in1_y); /* S1 = Y1*Z2^3 */
elem_mul_mont(S2, S2, in2_y); /* S2 = Y2*Z1^3 */
elem_sub(R, S2, S1); /* R = S2 - S1 */
elem_mul_mont(U1, in1_x, Z2sqr); /* U1 = X1*Z2^2 */
elem_mul_mont(U2, in2_x, Z1sqr); /* U2 = X2*Z1^2 */
elem_sub(H, U2, U1); /* H = U2 - U1 */
BN_ULONG is_exceptional = is_equal(U1, U2) & ~in1infty & ~in2infty;
if (is_exceptional) {
if (is_equal(S1, S2)) {
nistz384_point_double(r, a);
} else {
limbs_zero(r->X, P384_LIMBS);
limbs_zero(r->Y, P384_LIMBS);
limbs_zero(r->Z, P384_LIMBS);
}
return;
}
elem_sqr_mont(Rsqr, R); /* R^2 */
elem_mul_mont(res_z, H, in1_z); /* Z3 = H*Z1*Z2 */
elem_sqr_mont(Hsqr, H); /* H^2 */
elem_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
elem_mul_mont(Hcub, Hsqr, H); /* H^3 */
elem_mul_mont(U2, U1, Hsqr); /* U1*H^2 */
elem_mul_by_2(Hsqr, U2); /* 2*U1*H^2 */
elem_sub(res_x, Rsqr, Hsqr);
elem_sub(res_x, res_x, Hcub);
elem_sub(res_y, U2, res_x);
elem_mul_mont(S2, S1, Hcub);
elem_mul_mont(res_y, R, res_y);
elem_sub(res_y, res_y, S2);
copy_conditional(res_x, in2_x, in1infty);
copy_conditional(res_y, in2_y, in1infty);
copy_conditional(res_z, in2_z, in1infty);
copy_conditional(res_x, in1_x, in2infty);
copy_conditional(res_y, in1_y, in2infty);
copy_conditional(res_z, in1_z, in2infty);
limbs_copy(r->X, res_x, P384_LIMBS);
limbs_copy(r->Y, res_y, P384_LIMBS);
limbs_copy(r->Z, res_z, P384_LIMBS);
}
static void add_precomputed_w5(P384_POINT *r, crypto_word_t wvalue,
const P384_POINT table[16]) {
crypto_word_t recoded_is_negative;
crypto_word_t recoded;
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
alignas(64) P384_POINT h;
p384_point_select_w5(&h, table, recoded);
alignas(64) BN_ULONG tmp[P384_LIMBS];
p384_elem_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, recoded_is_negative);
nistz384_point_add(r, r, &h);
}
/* r = p * p_scalar */
static void nistz384_point_mul(P384_POINT *r,
const BN_ULONG p_scalar[P384_LIMBS],
const Limb p_x[P384_LIMBS],
const Limb p_y[P384_LIMBS]) {
static const size_t kWindowSize = 5;
static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
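  /* kMask selects w + 1 == 6 bits: the window's |w| scalar bits plus the Booth
   * bit just below the window. */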
uint8_t p_str[(P384_LIMBS * sizeof(Limb)) + 1];
little_endian_bytes_from_scalar(p_str, sizeof(p_str) / sizeof(p_str[0]),
p_scalar, P384_LIMBS);
/* A |P384_POINT| is (3 * 48) = 144 bytes, and the 64-byte alignment should
* add no more than 63 bytes of overhead. Thus, |table| should require
* ~2367 ((144 * 16) + 63) bytes of stack space. */
alignas(64) P384_POINT table[16];
/* table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
* not stored. All other values are actually stored with an offset of -1 in
* table. */
P384_POINT *row = table;
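  /* row[k - 1] will hold k*P: even multiples are formed by doubling k/2, odd
   * multiples by adding P to the preceding even multiple. */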
limbs_copy(row[1 - 1].X, p_x, P384_LIMBS);
limbs_copy(row[1 - 1].Y, p_y, P384_LIMBS);
limbs_copy(row[1 - 1].Z, ONE, P384_LIMBS);
nistz384_point_double(&row[2 - 1], &row[1 - 1]);
nistz384_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
nistz384_point_double(&row[4 - 1], &row[2 - 1]);
nistz384_point_double(&row[6 - 1], &row[3 - 1]);
nistz384_point_double(&row[8 - 1], &row[4 - 1]);
nistz384_point_double(&row[12 - 1], &row[6 - 1]);
nistz384_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
nistz384_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
nistz384_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
nistz384_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
nistz384_point_double(&row[14 - 1], &row[7 - 1]);
nistz384_point_double(&row[10 - 1], &row[5 - 1]);
nistz384_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
nistz384_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
nistz384_point_double(&row[16 - 1], &row[8 - 1]);
static const size_t START_INDEX = 384 - 4;
size_t index = START_INDEX;
  crypto_word_t recoded_is_negative;
crypto_word_t recoded;
crypto_word_t wvalue = p_str[(index - 1) / 8];
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
booth_recode(&recoded_is_negative, &recoded, wvalue, 5);
dev_assert_secret(!recoded_is_negative);
p384_point_select_w5(r, table, recoded);
while (index >= kWindowSize) {
if (index != START_INDEX) {
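      /* Read the six bits b_(index+4) ... b_index b_(index-1) of the scalar;
       * two adjacent bytes of the little-endian encoding cover any bit
       * alignment of the window. */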
size_t off = (index - 1) / 8;
wvalue = p_str[off] | p_str[off + 1] << 8;
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
add_precomputed_w5(r, wvalue, table);
}
index -= kWindowSize;
nistz384_point_double(r, r);
nistz384_point_double(r, r);
nistz384_point_double(r, r);
nistz384_point_double(r, r);
nistz384_point_double(r, r);
}
/* Final window */
wvalue = p_str[0];
wvalue = (wvalue << 1) & kMask;
add_precomputed_w5(r, wvalue, table);
}
void p384_point_double(Limb r[3][P384_LIMBS], const Limb a[3][P384_LIMBS])
{
P384_POINT t;
limbs_copy(t.X, a[0], P384_LIMBS);
limbs_copy(t.Y, a[1], P384_LIMBS);
limbs_copy(t.Z, a[2], P384_LIMBS);
nistz384_point_double(&t, &t);
limbs_copy(r[0], t.X, P384_LIMBS);
limbs_copy(r[1], t.Y, P384_LIMBS);
limbs_copy(r[2], t.Z, P384_LIMBS);
}
void p384_point_add(Limb r[3][P384_LIMBS],
const Limb a[3][P384_LIMBS],
const Limb b[3][P384_LIMBS])
{
P384_POINT t1;
limbs_copy(t1.X, a[0], P384_LIMBS);
limbs_copy(t1.Y, a[1], P384_LIMBS);
limbs_copy(t1.Z, a[2], P384_LIMBS);
P384_POINT t2;
limbs_copy(t2.X, b[0], P384_LIMBS);
limbs_copy(t2.Y, b[1], P384_LIMBS);
limbs_copy(t2.Z, b[2], P384_LIMBS);
nistz384_point_add(&t1, &t1, &t2);
limbs_copy(r[0], t1.X, P384_LIMBS);
limbs_copy(r[1], t1.Y, P384_LIMBS);
limbs_copy(r[2], t1.Z, P384_LIMBS);
}
void p384_point_mul(Limb r[3][P384_LIMBS], const BN_ULONG p_scalar[P384_LIMBS],
const Limb p_x[P384_LIMBS], const Limb p_y[P384_LIMBS]) {
alignas(64) P384_POINT acc;
nistz384_point_mul(&acc, p_scalar, p_x, p_y);
limbs_copy(r[0], acc.X, P384_LIMBS);
limbs_copy(r[1], acc.Y, P384_LIMBS);
limbs_copy(r[2], acc.Z, P384_LIMBS);
}
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif

View File

@@ -0,0 +1,54 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "./p256_shared.h"
#include "../../limbs/limbs.h"
#if !defined(OPENSSL_USE_NISTZ256)
typedef Limb ScalarMont[P256_LIMBS];
typedef Limb Scalar[P256_LIMBS];
#include "../bn/internal.h"
static const BN_ULONG N[P256_LIMBS] = {
#if defined(OPENSSL_64_BIT)
0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
#else
0xfc632551, 0xf3b9cac2, 0xa7179e84, 0xbce6faad, 0xffffffff, 0xffffffff, 0,
0xffffffff
#endif
};
static const BN_ULONG N_N0[] = {
BN_MONT_CTX_N0(0xccd1c8aa, 0xee00bc4f)
};
void p256_scalar_mul_mont(ScalarMont r, const ScalarMont a,
const ScalarMont b) {
/* XXX: Inefficient. TODO: optimize with dedicated multiplication routine. */
bn_mul_mont_small(r, a, b, N, N_N0, P256_LIMBS);
}
/* XXX: Inefficient. TODO: optimize with dedicated squaring routine. */
void p256_scalar_sqr_rep_mont(ScalarMont r, const ScalarMont a, Limb rep) {
dev_assert_secret(rep >= 1);
p256_scalar_mul_mont(r, a, a);
for (Limb i = 1; i < rep; ++i) {
p256_scalar_mul_mont(r, r, r);
}
}
#endif

View File

@@ -0,0 +1,246 @@
/* Copyright 2016 Brian Smith.
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "../../limbs/limbs.h"
#include "ecp_nistz384.h"
#include "../bn/internal.h"
#include "../../internal.h"
#include "../../limbs/limbs.inl"
/* XXX: Here we assume that the conversion from |Carry| to |Limb| is
* constant-time, but we haven't verified that assumption. TODO: Fix it so
* we don't need to make that assumption. */
typedef Limb Elem[P384_LIMBS];
typedef Limb ScalarMont[P384_LIMBS];
typedef Limb Scalar[P384_LIMBS];
static const BN_ULONG Q[P384_LIMBS] = {
#if defined(OPENSSL_64_BIT)
0xffffffff, 0xffffffff00000000, 0xfffffffffffffffe, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff
#else
0xffffffff, 0, 0, 0xffffffff, 0xfffffffe, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
#endif
};
static const BN_ULONG N[P384_LIMBS] = {
#if defined(OPENSSL_64_BIT)
0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, 0xffffffffffffffff,
0xffffffffffffffff, 0xffffffffffffffff
#else
0xccc52973, 0xecec196a, 0x48b0a77a, 0x581a0db2, 0xf4372ddf, 0xc7634d81,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
#endif
};
static const BN_ULONG ONE[P384_LIMBS] = {
#if defined(OPENSSL_64_BIT)
0xffffffff00000001, 0xffffffff, 1, 0, 0
#else
1, 0xffffffff, 0xffffffff, 0, 1, 0, 0, 0, 0, 0
#endif
};
static const Elem Q_PLUS_1_SHR_1 = {
#if defined(OPENSSL_64_BIT)
0x80000000, 0x7fffffff80000000, 0xffffffffffffffff, 0xffffffffffffffff,
0xffffffffffffffff, 0x7fffffffffffffff
#else
0x80000000, 0, 0x80000000, 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x7fffffff
#endif
};
static const BN_ULONG Q_N0[] = {
BN_MONT_CTX_N0(1, 1)
};
static const BN_ULONG N_N0[] = {
BN_MONT_CTX_N0(0x6ed46089, 0xe88fdc45)
};
/* XXX: MSVC for x86 warns when it fails to inline these functions, which it
 * should probably inline. */
#if defined(_MSC_VER) && !defined(__clang__) && defined(OPENSSL_X86)
#define INLINE_IF_POSSIBLE __forceinline
#else
#define INLINE_IF_POSSIBLE inline
#endif
static inline Limb is_equal(const Elem a, const Elem b) {
return LIMBS_equal(a, b, P384_LIMBS);
}
static inline Limb is_zero(const BN_ULONG a[P384_LIMBS]) {
return LIMBS_are_zero(a, P384_LIMBS);
}
static inline void copy_conditional(Elem r, const Elem a,
const Limb condition) {
for (size_t i = 0; i < P384_LIMBS; ++i) {
r[i] = constant_time_select_w(condition, a[i], r[i]);
}
}
static inline void elem_add(Elem r, const Elem a, const Elem b) {
LIMBS_add_mod(r, a, b, Q, P384_LIMBS);
}
static inline void elem_sub(Elem r, const Elem a, const Elem b) {
LIMBS_sub_mod(r, a, b, Q, P384_LIMBS);
}
static void elem_div_by_2(Elem r, const Elem a) {
/* Consider the case where `a` is even. Then we can shift `a` right one bit
* and the result will still be valid because we didn't lose any bits and so
* `(a >> 1) * 2 == a (mod q)`, which is the invariant we must satisfy.
*
* The remainder of this comment is considering the case where `a` is odd.
*
* Since `a` is odd, it isn't the case that `(a >> 1) * 2 == a (mod q)`
* because the lowest bit is lost during the shift. For example, consider:
*
* ```python
* q = 2**384 - 2**128 - 2**96 + 2**32 - 1
* a = 2**383
* two_a = a * 2 % q
* assert two_a == 0x100000000ffffffffffffffff00000001
* ```
*
* Notice there how `(2 * a) % q` wrapped around to a smaller odd value. When
* we divide `two_a` by two (mod q), we need to get the value `2**383`, which
* we obviously can't get with just a right shift.
*
* `q` is odd, and `a` is odd, so `a + q` is even. We could calculate
* `(a + q) >> 1` and then reduce it mod `q`. However, then we would have to
* keep track of an extra most significant bit. We can avoid that by instead
* calculating `(a >> 1) + ((q + 1) >> 1)`. The `1` in `q + 1` is the least
* significant bit of `a`. `q + 1` is even, which means it can be shifted
* without losing any bits. Since `q` is odd, `q - 1` is even, so the largest
* odd field element is `q - 2`. Thus we know that `a <= q - 2`. We know
* `(q + 1) >> 1` is `(q + 1) / 2` since (`q + 1`) is even. The value of
* `a >> 1` is `(a - 1)/2` since the shift will drop the least significant
* bit of `a`, which is 1. Thus:
*
* sum = ((q + 1) >> 1) + (a >> 1)
* sum = (q + 1)/2 + (a >> 1) (substituting (q + 1)/2)
* <= (q + 1)/2 + (q - 2 - 1)/2 (substituting a <= q - 2)
* <= (q + 1)/2 + (q - 3)/2 (simplifying)
* <= (q + 1 + q - 3)/2 (factoring out the common divisor)
* <= (2q - 2)/2 (simplifying)
* <= q - 1 (simplifying)
*
* Thus, no reduction of the sum mod `q` is necessary. */
Limb is_odd = constant_time_is_nonzero_w(a[0] & 1);
/* r = a >> 1. */
Limb carry = a[P384_LIMBS - 1] & 1;
r[P384_LIMBS - 1] = a[P384_LIMBS - 1] >> 1;
for (size_t i = 1; i < P384_LIMBS; ++i) {
Limb new_carry = a[P384_LIMBS - i - 1];
r[P384_LIMBS - i - 1] =
(a[P384_LIMBS - i - 1] >> 1) | (carry << (LIMB_BITS - 1));
carry = new_carry;
}
Elem adjusted;
BN_ULONG carry2 = limbs_add(adjusted, r, Q_PLUS_1_SHR_1, P384_LIMBS);
dev_assert_secret(carry2 == 0);
(void)carry2;
copy_conditional(r, adjusted, is_odd);
}
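/* Illustrative invariant (a sketch, not part of the library): for any fully
 * reduced |a|, doubling the halved value recovers |a|, since 2 is invertible
 * mod q:
 *
 *   Elem half, check;
 *   elem_div_by_2(half, a);
 *   elem_mul_by_2(check, half);
 *   dev_assert_secret(LIMBS_equal(check, a, P384_LIMBS));
 */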
static inline void elem_mul_mont(Elem r, const Elem a, const Elem b) {
/* XXX: Not (clearly) constant-time; inefficient.*/
bn_mul_mont_small(r, a, b, Q, Q_N0, P384_LIMBS);
}
static inline void elem_mul_by_2(Elem r, const Elem a) {
LIMBS_shl_mod(r, a, Q, P384_LIMBS);
}
static INLINE_IF_POSSIBLE void elem_mul_by_3(Elem r, const Elem a) {
/* XXX: inefficient. TODO: Replace with an integrated shift + add. */
Elem doubled;
elem_add(doubled, a, a);
elem_add(r, doubled, a);
}
static inline void elem_sqr_mont(Elem r, const Elem a) {
/* XXX: Inefficient. TODO: Add a dedicated squaring routine. */
elem_mul_mont(r, a, a);
}
void p384_elem_sub(Elem r, const Elem a, const Elem b) {
elem_sub(r, a, b);
}
void p384_elem_div_by_2(Elem r, const Elem a) {
elem_div_by_2(r, a);
}
void p384_elem_mul_mont(Elem r, const Elem a, const Elem b) {
elem_mul_mont(r, a, b);
}
void p384_elem_neg(Elem r, const Elem a) {
Limb is_zero = LIMBS_are_zero(a, P384_LIMBS);
Carry borrow = limbs_sub(r, Q, a, P384_LIMBS);
dev_assert_secret(borrow == 0);
(void)borrow;
for (size_t i = 0; i < P384_LIMBS; ++i) {
r[i] = constant_time_select_w(is_zero, 0, r[i]);
}
}
void p384_scalar_mul_mont(ScalarMont r, const ScalarMont a,
const ScalarMont b) {
/* XXX: Inefficient. TODO: Add dedicated multiplication routine. */
bn_mul_mont_small(r, a, b, N, N_N0, P384_LIMBS);
}
/* TODO(perf): Optimize this. */
static void p384_point_select_w5(P384_POINT *out,
const P384_POINT table[16], size_t index) {
Elem x; limbs_zero(x, P384_LIMBS);
Elem y; limbs_zero(y, P384_LIMBS);
Elem z; limbs_zero(z, P384_LIMBS);
// TODO: Rewrite in terms of |limbs_select|.
for (size_t i = 0; i < 16; ++i) {
crypto_word_t equal = constant_time_eq_w(index, (crypto_word_t)i + 1);
for (size_t j = 0; j < P384_LIMBS; ++j) {
x[j] = constant_time_select_w(equal, table[i].X[j], x[j]);
y[j] = constant_time_select_w(equal, table[i].Y[j], y[j]);
z[j] = constant_time_select_w(equal, table[i].Z[j], z[j]);
}
}
limbs_copy(out->X, x, P384_LIMBS);
limbs_copy(out->Y, y, P384_LIMBS);
limbs_copy(out->Z, z, P384_LIMBS);
}
#include "ecp_nistz384.inl"

File diff suppressed because it is too large

View File

@@ -0,0 +1,437 @@
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2014, Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// Reference:
// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
// 256 Bit Primes"
#include <ring-core/base.h>
#include "../../limbs/limbs.inl"
#include <stdint.h>
#include "p256-nistz.h"
#if defined(OPENSSL_USE_NISTZ256)
typedef P256_POINT_AFFINE PRECOMP256_ROW[64];
// One converted into the Montgomery domain
static const BN_ULONG ONE_MONT[P256_LIMBS] = {
TOBN(0x00000000, 0x00000001),
TOBN(0xffffffff, 0x00000000),
TOBN(0xffffffff, 0xffffffff),
TOBN(0x00000000, 0xfffffffe),
};
// Precomputed tables for the default generator
#include "p256-nistz-table.h"
// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in
// util.c for details
static crypto_word_t booth_recode_w5(crypto_word_t in) {
crypto_word_t s, d;
s = ~((in >> 5) - 1);
d = (1 << 6) - in - 1;
d = (d & s) | (in & ~s);
d = (d >> 1) + (d & 1);
return (d << 1) + (s & 1);
}
static crypto_word_t booth_recode_w7(crypto_word_t in) {
crypto_word_t s, d;
s = ~((in >> 7) - 1);
d = (1 << 8) - in - 1;
d = (d & s) | (in & ~s);
d = (d >> 1) + (d & 1);
return (d << 1) + (s & 1);
}
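// Both recoders pack the recoded digit's absolute value into the upper bits
// and its sign into bit 0, so callers use |wvalue >> 1| as the table index and
// |wvalue & 1| as the negation mask.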
// The `(P256_LIMBS == 8)` case is unreachable for 64-bit targets.
#if defined(OPENSSL_64_BIT) && defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunreachable-code"
#endif
// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is
// if |move| is zero.
//
// WARNING: this breaks the usual convention of constant-time functions
// returning masks.
static void copy_conditional(BN_ULONG dst[P256_LIMBS],
const BN_ULONG src[P256_LIMBS], BN_ULONG move) {
BN_ULONG mask1 = ((BN_ULONG)0) - move;
BN_ULONG mask2 = ~mask1;
dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
if (P256_LIMBS == 8) {
dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
}
}
#if defined(OPENSSL_64_BIT) && defined(__clang__)
#pragma GCC diagnostic pop
#endif
// is_not_zero returns one iff in != 0 and zero otherwise.
//
// WARNING: this breaks the usual convention of constant-time functions
// returning masks.
//
// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64)
// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f)
// )
//
// (declare-fun x () (_ BitVec 64))
//
// (assert (and (= x #x0000000000000000) (= (is_not_zero x)
// #x0000000000000001))) (check-sat)
//
// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x)
// #x0000000000000000))) (check-sat)
//
static BN_ULONG is_not_zero(BN_ULONG in) {
in |= (0 - in);
in >>= BN_BITS2 - 1;
return in;
}
#if defined(OPENSSL_X86_64)
// Dispatch between CPU variations. The "_adx" suffixed functions use MULX in
// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both
// capabilities.
void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]) {
if (adx_bmi2_available) {
ecp_nistz256_mul_mont_adx(res, a, b);
} else {
ecp_nistz256_mul_mont_nohw(res, a, b);
}
}
void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]) {
if (adx_bmi2_available) {
ecp_nistz256_sqr_mont_adx(res, a);
} else {
ecp_nistz256_sqr_mont_nohw(res, a);
}
}
void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]) {
if (adx_bmi2_available) {
ecp_nistz256_ord_mul_mont_adx(res, a, b);
} else {
ecp_nistz256_ord_mul_mont_nohw(res, a, b);
}
}
void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
BN_ULONG rep) {
if (adx_bmi2_available) {
ecp_nistz256_ord_sqr_mont_adx(res, a, rep);
} else {
ecp_nistz256_ord_sqr_mont_nohw(res, a, rep);
}
}
static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
int index) {
if (avx2_available) {
ecp_nistz256_select_w5_avx2(val, in_t, index);
} else {
ecp_nistz256_select_w5_nohw(val, in_t, index);
}
}
static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
const P256_POINT_AFFINE in_t[64],
int index) {
if (avx2_available) {
ecp_nistz256_select_w7_avx2(val, in_t, index);
} else {
ecp_nistz256_select_w7_nohw(val, in_t, index);
}
}
void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) {
if (adx_bmi2_available) {
ecp_nistz256_point_double_adx(r, a);
} else {
ecp_nistz256_point_double_nohw(r, a);
}
}
void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b) {
if (adx_bmi2_available) {
ecp_nistz256_point_add_adx(r, a, b);
} else {
ecp_nistz256_point_add_nohw(r, a, b);
}
}
void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b) {
if (adx_bmi2_available) {
ecp_nistz256_point_add_affine_adx(r, a, b);
} else {
ecp_nistz256_point_add_affine_nohw(r, a, b);
}
}
#endif // OPENSSL_X86_64
// r = p * p_scalar
static void ecp_nistz256_windowed_mul(P256_POINT *r,
const BN_ULONG p_scalar[P256_LIMBS],
const BN_ULONG p_x[P256_LIMBS],
const BN_ULONG p_y[P256_LIMBS]) {
debug_assert_nonsecret(r != NULL);
debug_assert_nonsecret(p_scalar != NULL);
debug_assert_nonsecret(p_x != NULL);
debug_assert_nonsecret(p_y != NULL);
static const size_t kWindowSize = 5;
static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1;
// A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should
// add no more than 63 bytes of overhead. Thus, |table| should require
// ~1599 ((96 * 16) + 63) bytes of stack space.
alignas(64) P256_POINT table[16];
P256_SCALAR_BYTES p_str;
p256_scalar_bytes_from_limbs(p_str, p_scalar);
// table[0] is implicitly (0,0,0) (the point at infinity), therefore it is
// not stored. All other values are actually stored with an offset of -1 in
// table.
P256_POINT *row = table;
limbs_copy(row[1 - 1].X, p_x, P256_LIMBS);
limbs_copy(row[1 - 1].Y, p_y, P256_LIMBS);
limbs_copy(row[1 - 1].Z, ONE_MONT, P256_LIMBS);
ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]);
ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]);
ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]);
ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]);
ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]);
ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]);
ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]);
ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]);
ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]);
ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]);
ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]);
ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]);
ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]);
ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]);
ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]);
BN_ULONG tmp[P256_LIMBS];
alignas(32) P256_POINT h;
size_t index = 255;
crypto_word_t wvalue = p_str[(index - 1) / 8];
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
ecp_nistz256_select_w5(r, table, (int)(booth_recode_w5(wvalue) >> 1));
while (index >= 5) {
if (index != 255) {
size_t off = (index - 1) / 8;
wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8;
wvalue = (wvalue >> ((index - 1) % 8)) & kMask;
wvalue = booth_recode_w5(wvalue);
ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1));
ecp_nistz256_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, (wvalue & 1));
ecp_nistz256_point_add(r, r, &h);
}
index -= kWindowSize;
ecp_nistz256_point_double(r, r);
ecp_nistz256_point_double(r, r);
ecp_nistz256_point_double(r, r);
ecp_nistz256_point_double(r, r);
ecp_nistz256_point_double(r, r);
}
// Final window
wvalue = p_str[0];
wvalue = (wvalue << 1) & kMask;
wvalue = booth_recode_w5(wvalue);
ecp_nistz256_select_w5(&h, table, (int)(wvalue >> 1));
ecp_nistz256_neg(tmp, h.Y);
copy_conditional(h.Y, tmp, wvalue & 1);
ecp_nistz256_point_add(r, r, &h);
}
static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) {
static const size_t kWindowSize = 7;
static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
*index = kWindowSize;
crypto_word_t wvalue = ((crypto_word_t)p_str[0] << 1) & kMask;
return booth_recode_w7(wvalue);
}
static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
static const size_t kWindowSize = 7;
static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1;
const size_t off = (*index - 1) / 8;
crypto_word_t wvalue =
(crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8;
wvalue = (wvalue >> ((*index - 1) % 8)) & kMask;
*index += kWindowSize;
return booth_recode_w7(wvalue);
}
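// calc_first_wvalue supplies the Booth bit b_(-1) == 0 implicitly by shifting
// |p_str[0]| left by one; calc_wvalue instead reads the eight bits
// b_(index+6) ... b_index b_(index-1) directly, so each later window carries
// the bit just below it.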
void p256_point_mul(Limb r[3][P256_LIMBS], const Limb p_scalar[P256_LIMBS],
const Limb p_x[P256_LIMBS],
const Limb p_y[P256_LIMBS]) {
alignas(32) P256_POINT out;
ecp_nistz256_windowed_mul(&out, p_scalar, p_x, p_y);
limbs_copy(r[0], out.X, P256_LIMBS);
limbs_copy(r[1], out.Y, P256_LIMBS);
limbs_copy(r[2], out.Z, P256_LIMBS);
}
void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
P256_SCALAR_BYTES p_str;
p256_scalar_bytes_from_limbs(p_str, scalar);
// First window
size_t index = 0;
crypto_word_t wvalue = calc_first_wvalue(&index, p_str);
alignas(32) P256_POINT_AFFINE t;
alignas(32) P256_POINT p;
ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], (int)(wvalue >> 1));
ecp_nistz256_neg(p.Z, t.Y);
copy_conditional(t.Y, p.Z, wvalue & 1);
// Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t|
// is infinity and |ONE| otherwise. |t| was computed from the table, so it
// is infinity iff |wvalue >> 1| is zero.
limbs_copy(p.X, t.X, P256_LIMBS);
limbs_copy(p.Y, t.Y, P256_LIMBS);
limbs_zero(p.Z, P256_LIMBS);
copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1));
for (int i = 1; i < 37; i++) {
wvalue = calc_wvalue(&index, p_str);
ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], (int)(wvalue >> 1));
alignas(32) BN_ULONG neg_Y[P256_LIMBS];
ecp_nistz256_neg(neg_Y, t.Y);
copy_conditional(t.Y, neg_Y, wvalue & 1);
// Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the
// same non-infinity point.
ecp_nistz256_point_add_affine(&p, &p, &t);
}
limbs_copy(r[0], p.X, P256_LIMBS);
limbs_copy(r[1], p.Y, P256_LIMBS);
limbs_copy(r[2], p.Z, P256_LIMBS);
}
void p256_point_mul_base_vartime(Limb r[3][P256_LIMBS],
const Limb g_scalar[P256_LIMBS]) {
alignas(32) P256_POINT p;
uint8_t p_str[33];
OPENSSL_memcpy(p_str, g_scalar, 32);
p_str[32] = 0;
// First window
size_t index = 0;
size_t wvalue = calc_first_wvalue(&index, p_str);
// Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p|
// is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so
// it is infinity iff |wvalue >> 1| is zero.
if ((wvalue >> 1) != 0) {
OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X,
sizeof(p.X));
OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y,
sizeof(p.Y));
OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z));
} else {
OPENSSL_memset(p.X, 0, sizeof(p.X));
OPENSSL_memset(p.Y, 0, sizeof(p.Y));
OPENSSL_memset(p.Z, 0, sizeof(p.Z));
}
if ((wvalue & 1) == 1) {
ecp_nistz256_neg(p.Y, p.Y);
}
for (int i = 1; i < 37; i++) {
wvalue = calc_wvalue(&index, p_str);
if ((wvalue >> 1) == 0) {
continue;
}
alignas(32) P256_POINT_AFFINE t;
OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1],
sizeof(t));
if ((wvalue & 1) == 1) {
ecp_nistz256_neg(t.Y, t.Y);
}
// Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are
// the same non-infinity point, so it is important that we compute the
// |g_scalar| term before the |p_scalar| term.
ecp_nistz256_point_add_affine(&p, &p, &t);
}
limbs_copy(r[0], p.X, P256_LIMBS);
limbs_copy(r[1], p.Y, P256_LIMBS);
limbs_copy(r[2], p.Z, P256_LIMBS);
}
#endif /* defined(OPENSSL_USE_NISTZ256) */

View File

@@ -0,0 +1,171 @@
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2014, Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// Reference:
// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
// 256 Bit Primes"
#ifndef OPENSSL_HEADER_EC_P256_X86_64_H
#define OPENSSL_HEADER_EC_P256_X86_64_H
#include <ring-core/base.h>
#include "p256_shared.h"
#include "../bn/internal.h"
#if defined(OPENSSL_USE_NISTZ256)
// ecp_nistz256_neg sets |res| to -|a| mod P.
void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
// ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
#else
void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
#endif
// ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
#else
void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS]);
#endif
// P-256 scalar operations.
//
// The following functions compute modulo N, where N is the order of P-256. They
// take fully-reduced inputs and give fully-reduced outputs.
// ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs
// are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
#else
void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS],
const BN_ULONG b[P256_LIMBS]);
#endif
// ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2^|rep|), i.e. it squares |a|
// |rep| times in succession, where inputs and outputs are in Montgomery form.
// That is, |res| is (|a| * 2^-256)^(2^|rep|) * 2^256 mod N.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS], BN_ULONG rep);
void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS], BN_ULONG rep);
#else
void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
const BN_ULONG a[P256_LIMBS], BN_ULONG rep);
#endif
// P-256 point operations.
//
// The following functions may be used in-place. All coordinates are in the
// Montgomery domain.
// A P256_POINT_AFFINE represents a P-256 point in affine coordinates. Infinity
// is encoded as (0, 0).
typedef struct {
BN_ULONG X[P256_LIMBS];
BN_ULONG Y[P256_LIMBS];
} P256_POINT_AFFINE;
// ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16
// and all zeros (the point at infinity) if |index| is 0. This is done in
// constant time.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16],
int index);
void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16],
int index);
#else
void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16],
int index);
#endif
// ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64
// and all zeros (the point at infinity) if |index| is 0. This is done in
// constant time.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val,
const P256_POINT_AFFINE in_t[64], int index);
void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val,
const P256_POINT_AFFINE in_t[64], int index);
#else
void ecp_nistz256_select_w7(P256_POINT_AFFINE *val,
const P256_POINT_AFFINE in_t[64], int index);
#endif
// ecp_nistz256_point_double sets |r| to |a| doubled.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a);
void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a);
#else
void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
#endif
// ecp_nistz256_point_add adds |a| to |b| and places the result in |r|.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b);
void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b);
#else
void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a,
const P256_POINT *b);
#endif
// ecp_nistz256_point_add_affine adds |a| to |b| and places the result in
// |r|. |a| and |b| must not represent the same point unless they are both
// infinity.
#if defined(OPENSSL_X86_64)
void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
#else
void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a,
const P256_POINT_AFFINE *b);
#endif
#endif /* defined(OPENSSL_USE_NISTZ256) */
#endif // OPENSSL_HEADER_EC_P256_X86_64_H

vendor/ring/crypto/fipsmodule/ec/p256.c

@@ -0,0 +1,539 @@
// Copyright 2020 The BoringSSL Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// An implementation of the NIST P-256 elliptic curve point multiplication.
// Field elements are kept in 256-bit Montgomery form, for both 64-bit and
// 32-bit platforms. Field operations are generated by Fiat, which lives in
// //third_party/fiat.
#include <ring-core/base.h>
#include "../../limbs/limbs.h"
#include "../../limbs/limbs.inl"
#include "p256_shared.h"
#include "../../internal.h"
#include "./util.h"
#if !defined(OPENSSL_USE_NISTZ256)
#if defined(_MSC_VER) && !defined(__clang__)
// '=': conversion from 'int64_t' to 'int32_t', possible loss of data
#pragma warning(disable: 4242)
// '=': conversion from 'int32_t' to 'uint8_t', possible loss of data
#pragma warning(disable: 4244)
// 'initializing': conversion from 'size_t' to 'fiat_p256_limb_t'
#pragma warning(disable: 4267)
#endif
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Winline"
#endif
#if defined(BORINGSSL_HAS_UINT128)
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include "../../../third_party/fiat/p256_64.h"
#elif defined(OPENSSL_64_BIT)
#include "../../../third_party/fiat/p256_64_msvc.h"
#else
#include "../../../third_party/fiat/p256_32.h"
#endif
// utility functions, handwritten
#if defined(OPENSSL_64_BIT)
#define FIAT_P256_NLIMBS 4
typedef uint64_t fiat_p256_limb_t;
typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS];
static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000,
0xffffffffffffffff, 0xfffffffe};
#else // 64BIT; else 32BIT
#define FIAT_P256_NLIMBS 8
typedef uint32_t fiat_p256_limb_t;
typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS];
static const fiat_p256_felem fiat_p256_one = {
0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0};
#endif // 64BIT
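// In both representations, |fiat_p256_one| is the field element one in
// Montgomery form, i.e. 2^256 mod p.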
static fiat_p256_limb_t fiat_p256_nz(
const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
fiat_p256_limb_t ret;
fiat_p256_nonzero(&ret, in1);
return ret;
}
static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) {
for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) {
out[i] = in1[i];
}
}
static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS],
fiat_p256_limb_t t,
const fiat_p256_limb_t z[FIAT_P256_NLIMBS],
const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) {
fiat_p256_selectznz(out, !!t, z, nz);
}
static void fiat_p256_from_words(fiat_p256_felem out,
const Limb in[32 / sizeof(BN_ULONG)]) {
// Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on
// 64-bit platforms without |uint128_t|, they are different. However, on
// little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same
// layout.
OPENSSL_memcpy(out, in, 32);
}
static void fiat_p256_to_words(Limb out[32 / sizeof(BN_ULONG)], const fiat_p256_felem in) {
// See |fiat_p256_from_words|.
OPENSSL_memcpy(out, in, 32);
}
// Group operations
// ----------------
//
// Building on top of the field operations we have the operations on the
// elliptic curve group itself. Points on the curve are represented in Jacobian
// coordinates.
//
// Both operations were transcribed to Coq and proven to correspond to naive
// implementations using Affine coordinates, for all suitable fields. In the
// Coq proofs, issues of constant-time execution and memory layout (aliasing)
// conventions were not considered. Specification of affine coordinates:
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Spec/WeierstrassCurve.v#L28>
// As a sanity check, a proof that these points form a commutative group:
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/AffineProofs.v#L33>
// fiat_p256_point_double calculates 2*(x_in, y_in, z_in)
//
// The method is taken from:
// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
//
// Coq transcription and correctness proof:
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
//
// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
// while x_out == y_in is not (maybe this works, but it's not tested).
static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out,
fiat_p256_felem z_out,
const fiat_p256_felem x_in,
const fiat_p256_felem y_in,
const fiat_p256_felem z_in) {
fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
// delta = z^2
fiat_p256_square(delta, z_in);
// gamma = y^2
fiat_p256_square(gamma, y_in);
// beta = x*gamma
fiat_p256_mul(beta, x_in, gamma);
// alpha = 3*(x-delta)*(x+delta)
fiat_p256_sub(ftmp, x_in, delta);
fiat_p256_add(ftmp2, x_in, delta);
fiat_p256_add(tmptmp, ftmp2, ftmp2);
fiat_p256_add(ftmp2, ftmp2, tmptmp);
fiat_p256_mul(alpha, ftmp, ftmp2);
// x' = alpha^2 - 8*beta
fiat_p256_square(x_out, alpha);
fiat_p256_add(fourbeta, beta, beta);
fiat_p256_add(fourbeta, fourbeta, fourbeta);
fiat_p256_add(tmptmp, fourbeta, fourbeta);
fiat_p256_sub(x_out, x_out, tmptmp);
// z' = (y + z)^2 - gamma - delta
fiat_p256_add(delta, gamma, delta);
fiat_p256_add(ftmp, y_in, z_in);
fiat_p256_square(z_out, ftmp);
fiat_p256_sub(z_out, z_out, delta);
// y' = alpha*(4*beta - x') - 8*gamma^2
fiat_p256_sub(y_out, fourbeta, x_out);
fiat_p256_add(gamma, gamma, gamma);
fiat_p256_square(gamma, gamma);
fiat_p256_mul(y_out, alpha, y_out);
fiat_p256_add(gamma, gamma, gamma);
fiat_p256_sub(y_out, y_out, gamma);
}
// fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2)
//
// The method is taken from:
// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
//
// Coq transcription and correctness proof:
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L135>
// <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L205>
//
// This function includes a branch for checking whether the two input points
// are equal (while neither is the point at infinity). This case never
// happens during single point multiplication, so there is no timing leak for
// ECDH or ECDSA signing.
static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
fiat_p256_felem z3, const fiat_p256_felem x1,
const fiat_p256_felem y1,
const fiat_p256_felem z1, const int mixed,
const fiat_p256_felem x2,
const fiat_p256_felem y2,
const fiat_p256_felem z2) {
fiat_p256_felem x_out, y_out, z_out;
fiat_p256_limb_t z1nz = fiat_p256_nz(z1);
fiat_p256_limb_t z2nz = fiat_p256_nz(z2);
// z1z1 = z1**2
fiat_p256_felem z1z1;
fiat_p256_square(z1z1, z1);
fiat_p256_felem u1, s1, two_z1z2;
if (!mixed) {
// z2z2 = z2**2
fiat_p256_felem z2z2;
fiat_p256_square(z2z2, z2);
// u1 = x1*z2z2
fiat_p256_mul(u1, x1, z2z2);
// two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
fiat_p256_add(two_z1z2, z1, z2);
fiat_p256_square(two_z1z2, two_z1z2);
fiat_p256_sub(two_z1z2, two_z1z2, z1z1);
fiat_p256_sub(two_z1z2, two_z1z2, z2z2);
// s1 = y1 * z2**3
fiat_p256_mul(s1, z2, z2z2);
fiat_p256_mul(s1, s1, y1);
} else {
// We'll assume z2 = 1 (special case z2 = 0 is handled later).
// u1 = x1*z2z2
fiat_p256_copy(u1, x1);
// two_z1z2 = 2z1z2
fiat_p256_add(two_z1z2, z1, z1);
// s1 = y1 * z2**3
fiat_p256_copy(s1, y1);
}
// u2 = x2*z1z1
fiat_p256_felem u2;
fiat_p256_mul(u2, x2, z1z1);
// h = u2 - u1
fiat_p256_felem h;
fiat_p256_sub(h, u2, u1);
fiat_p256_limb_t xneq = fiat_p256_nz(h);
// z_out = two_z1z2 * h
fiat_p256_mul(z_out, h, two_z1z2);
// z1z1z1 = z1 * z1z1
fiat_p256_felem z1z1z1;
fiat_p256_mul(z1z1z1, z1, z1z1);
// s2 = y2 * z1**3
fiat_p256_felem s2;
fiat_p256_mul(s2, y2, z1z1z1);
// r = (s2 - s1)*2
fiat_p256_felem r;
fiat_p256_sub(r, s2, s1);
fiat_p256_add(r, r, r);
fiat_p256_limb_t yneq = fiat_p256_nz(r);
fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) &
~constant_time_is_zero_w(z1nz) &
~constant_time_is_zero_w(z2nz);
if (constant_time_declassify_w(is_nontrivial_double)) {
fiat_p256_point_double(x3, y3, z3, x1, y1, z1);
return;
}
// I = (2h)**2
fiat_p256_felem i;
fiat_p256_add(i, h, h);
fiat_p256_square(i, i);
// J = h * I
fiat_p256_felem j;
fiat_p256_mul(j, h, i);
// V = U1 * I
fiat_p256_felem v;
fiat_p256_mul(v, u1, i);
// x_out = r**2 - J - 2V
fiat_p256_square(x_out, r);
fiat_p256_sub(x_out, x_out, j);
fiat_p256_sub(x_out, x_out, v);
fiat_p256_sub(x_out, x_out, v);
// y_out = r(V-x_out) - 2 * s1 * J
fiat_p256_sub(y_out, v, x_out);
fiat_p256_mul(y_out, y_out, r);
fiat_p256_felem s1j;
fiat_p256_mul(s1j, s1, j);
fiat_p256_sub(y_out, y_out, s1j);
fiat_p256_sub(y_out, y_out, s1j);
fiat_p256_cmovznz(x_out, z1nz, x2, x_out);
fiat_p256_cmovznz(x3, z2nz, x1, x_out);
fiat_p256_cmovznz(y_out, z1nz, y2, y_out);
fiat_p256_cmovznz(y3, z2nz, y1, y_out);
fiat_p256_cmovznz(z_out, z1nz, z2, z_out);
fiat_p256_cmovznz(z3, z2nz, z1, z_out);
}
#include "./p256_table.h"
// fiat_p256_select_point_affine selects the |idx-1|th point from a
// precomputation table and copies it to out. If |idx| is zero, the output is
// the point at infinity.
static void fiat_p256_select_point_affine(
const fiat_p256_limb_t idx, size_t size,
const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) {
OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
for (size_t i = 0; i < size; i++) {
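// |mismatch| below is zero exactly when i == idx - 1, so each iteration
// either copies table entry i into |out| or leaves |out| unchanged. Every
// entry is read regardless of |idx|, keeping the memory access pattern
// independent of the secret index.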
fiat_p256_limb_t mismatch = i ^ (idx - 1);
fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
}
fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one);
}
// fiat_p256_select_point selects the |idx|th point from a precomputation table
// and copies it to out.
static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size,
const fiat_p256_felem pre_comp[/*size*/][3],
fiat_p256_felem out[3]) {
OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
for (size_t i = 0; i < size; i++) {
fiat_p256_limb_t mismatch = i ^ idx;
fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]);
}
}
// fiat_p256_get_bit returns the |i|th bit in |in|.
static crypto_word_t fiat_p256_get_bit(const Limb in[P256_LIMBS], int i) {
if (i < 0 || i >= 256) {
return 0;
}
#if defined(OPENSSL_64_BIT)
OPENSSL_STATIC_ASSERT(sizeof(Limb) == 8, "BN_ULONG was not 64-bit");
return (in[i >> 6] >> (i & 63)) & 1;
#else
OPENSSL_STATIC_ASSERT(sizeof(Limb) == 4, "BN_ULONG was not 32-bit");
return (in[i >> 5] >> (i & 31)) & 1;
#endif
}
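// For example, on 64-bit platforms, fiat_p256_get_bit(in, 70) returns
// (in[1] >> 6) & 1.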
void p256_point_mul(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS],
const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) {
debug_assert_nonsecret(r != NULL);
debug_assert_nonsecret(scalar != NULL);
debug_assert_nonsecret(p_x != NULL);
debug_assert_nonsecret(p_y != NULL);
fiat_p256_felem p_pre_comp[17][3];
OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp));
// Precompute multiples.
fiat_p256_from_words(p_pre_comp[1][0], p_x);
fiat_p256_from_words(p_pre_comp[1][1], p_y);
fiat_p256_copy(p_pre_comp[1][2], fiat_p256_one);
for (size_t j = 2; j <= 16; ++j) {
if (j & 1) {
fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2],
p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2],
0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1],
p_pre_comp[j - 1][2]);
} else {
fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1],
p_pre_comp[j][2], p_pre_comp[j / 2][0],
p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]);
}
}
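// At this point, p_pre_comp[j] holds j*P in Jacobian coordinates for
// j = 1..16; p_pre_comp[0] remains all zeros, the point at infinity.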
// Set nq to the point at infinity.
fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3];
// Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round.
int skip = 1; // Save two point operations in the first round.
for (size_t i = 255; i < 256; i--) {
// double
if (!skip) {
fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
}
// do other additions every 5 doublings
if (i % 5 == 0) {
crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5;
bits |= fiat_p256_get_bit(scalar, i + 3) << 4;
bits |= fiat_p256_get_bit(scalar, i + 2) << 3;
bits |= fiat_p256_get_bit(scalar, i + 1) << 2;
bits |= fiat_p256_get_bit(scalar, i) << 1;
bits |= fiat_p256_get_bit(scalar, i - 1);
crypto_word_t sign, digit;
recode_scalar_bits(&sign, &digit, bits);
// select the point to add or subtract, in constant time.
fiat_p256_select_point((fiat_p256_limb_t)digit, 17,
RING_CORE_POINTLESS_ARRAY_CONST_CAST((const fiat_p256_felem(*)[3]))p_pre_comp,
tmp);
fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point.
fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp);
if (!skip) {
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
0 /* mixed */, tmp[0], tmp[1], tmp[2]);
} else {
fiat_p256_copy(nq[0], tmp[0]);
fiat_p256_copy(nq[1], tmp[1]);
fiat_p256_copy(nq[2], tmp[2]);
skip = 0;
}
}
}
fiat_p256_to_words(r[0], nq[0]);
fiat_p256_to_words(r[1], nq[1]);
fiat_p256_to_words(r[2], nq[2]);
}
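// p256_point_mul_base computes |scalar|*G using the precomputed table from
// p256_table.h. Each iteration i (counting 31 down to 0) consumes eight
// scalar bits, one per 32-bit column: bits i+32, i+96, i+160 and i+224 select
// an entry from fiat_p256_g_pre_comp[1], and bits i, i+64, i+128 and i+192
// select an entry from fiat_p256_g_pre_comp[0], with one doubling per
// iteration in between.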
void p256_point_mul_base(Limb r[3][P256_LIMBS], const Limb scalar[P256_LIMBS]) {
// Set nq to the point at infinity.
fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3];
int skip = 1; // Save two point operations in the first round.
for (size_t i = 31; i < 32; i--) {
if (!skip) {
fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
}
// First, look 32 bits upwards.
crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3;
bits |= fiat_p256_get_bit(scalar, i + 160) << 2;
bits |= fiat_p256_get_bit(scalar, i + 96) << 1;
bits |= fiat_p256_get_bit(scalar, i + 32);
// Select the point to add, in constant time.
fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
fiat_p256_g_pre_comp[1], tmp);
if (!skip) {
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2],
1 /* mixed */, tmp[0], tmp[1], tmp[2]);
} else {
fiat_p256_copy(nq[0], tmp[0]);
fiat_p256_copy(nq[1], tmp[1]);
fiat_p256_copy(nq[2], tmp[2]);
skip = 0;
}
// Second, look at the current position.
bits = fiat_p256_get_bit(scalar, i + 192) << 3;
bits |= fiat_p256_get_bit(scalar, i + 128) << 2;
bits |= fiat_p256_get_bit(scalar, i + 64) << 1;
bits |= fiat_p256_get_bit(scalar, i);
// Select the point to add, in constant time.
fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15,
fiat_p256_g_pre_comp[0], tmp);
fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */,
tmp[0], tmp[1], tmp[2]);
}
fiat_p256_to_words(r[0], nq[0]);
fiat_p256_to_words(r[1], nq[1]);
fiat_p256_to_words(r[2], nq[2]);
}
void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS],
const Limb b[P256_LIMBS]) {
fiat_p256_felem a_, b_;
fiat_p256_from_words(a_, a);
fiat_p256_from_words(b_, b);
fiat_p256_mul(a_, a_, b_);
fiat_p256_to_words(r, a_);
}
void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) {
fiat_p256_felem x;
fiat_p256_from_words(x, a);
fiat_p256_square(x, x);
fiat_p256_to_words(r, x);
}
void p256_point_add(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS],
const Limb b[3][P256_LIMBS]) {
fiat_p256_felem x1, y1, z1, x2, y2, z2;
fiat_p256_from_words(x1, a[0]);
fiat_p256_from_words(y1, a[1]);
fiat_p256_from_words(z1, a[2]);
fiat_p256_from_words(x2, b[0]);
fiat_p256_from_words(y2, b[1]);
fiat_p256_from_words(z2, b[2]);
fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2,
z2);
fiat_p256_to_words(r[0], x1);
fiat_p256_to_words(r[1], y1);
fiat_p256_to_words(r[2], z1);
}
void p256_point_double(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS]) {
fiat_p256_felem x, y, z;
fiat_p256_from_words(x, a[0]);
fiat_p256_from_words(y, a[1]);
fiat_p256_from_words(z, a[2]);
fiat_p256_point_double(x, y, z, x, y, z);
fiat_p256_to_words(r[0], x);
fiat_p256_to_words(r[1], y);
fiat_p256_to_words(r[2], z);
}
// For testing only.
void p256_point_add_affine(Limb r[3][P256_LIMBS], const Limb a[3][P256_LIMBS],
const Limb b[2][P256_LIMBS]) {
fiat_p256_felem x1, y1, z1, x2, y2;
fiat_p256_from_words(x1, a[0]);
fiat_p256_from_words(y1, a[1]);
fiat_p256_from_words(z1, a[2]);
fiat_p256_from_words(x2, b[0]);
fiat_p256_from_words(y2, b[1]);
fiat_p256_felem z2 = {0};
fiat_p256_cmovznz(z2, fiat_p256_nz(x2) & fiat_p256_nz(y2), z2, fiat_p256_one);
fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 1 /* mixed */, x2, y2, z2);
fiat_p256_to_words(r[0], x1);
fiat_p256_to_words(r[1], y1);
fiat_p256_to_words(r[2], z1);
}
#endif


@@ -0,0 +1,62 @@
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2014, Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// Reference:
// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
// 256 Bit Primes"
#ifndef OPENSSL_HEADER_EC_P256_SHARED_H
#define OPENSSL_HEADER_EC_P256_SHARED_H
#include "ring-core/base.h"
#include "../bn/internal.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
!defined(OPENSSL_SMALL)
# define OPENSSL_USE_NISTZ256
#endif
// P-256 field operations.
//
// An element mod P in P-256 is represented as a little-endian array of
// |P256_LIMBS| |BN_ULONG|s, spanning the full range of values.
//
// The following functions take fully-reduced inputs mod P and give
// fully-reduced outputs. They may be used in-place.
#define P256_LIMBS (256 / BN_BITS2)
// A P256_POINT represents a P-256 point in Jacobian coordinates.
typedef struct {
BN_ULONG X[P256_LIMBS];
BN_ULONG Y[P256_LIMBS];
BN_ULONG Z[P256_LIMBS];
} P256_POINT;
typedef unsigned char P256_SCALAR_BYTES[33];
static inline void p256_scalar_bytes_from_limbs(
P256_SCALAR_BYTES bytes_out, const BN_ULONG limbs[P256_LIMBS]) {
OPENSSL_memcpy(bytes_out, limbs, 32);
bytes_out[32] = 0;
}
#endif /* !defined(OPENSSL_USE_NISTZ256) */


@@ -0,0 +1,297 @@
// Copyright 2020 The BoringSSL Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is generated by make_tables.go.
// Base point pre computation
// --------------------------
//
// Two different sorts of precomputed tables are used in the following code.
// Each contain various points on the curve, where each point is three field
// elements (x, y, z).
//
// For the base point table, z is usually 1 (0 for the point at infinity).
// This table has 2 * 16 elements, starting with the following:
// index | bits | point
// ------+---------+------------------------------
// 0 | 0 0 0 0 | 0G
// 1 | 0 0 0 1 | 1G
// 2 | 0 0 1 0 | 2^64G
// 3 | 0 0 1 1 | (2^64 + 1)G
// 4 | 0 1 0 0 | 2^128G
// 5 | 0 1 0 1 | (2^128 + 1)G
// 6 | 0 1 1 0 | (2^128 + 2^64)G
// 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
// 8 | 1 0 0 0 | 2^192G
// 9 | 1 0 0 1 | (2^192 + 1)G
// 10 | 1 0 1 0 | (2^192 + 2^64)G
// 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
// 12 | 1 1 0 0 | (2^192 + 2^128)G
// 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
// 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
// 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
// followed by a copy of this with each element multiplied by 2^32.
//
// The reason for this is so that we can clock bits into four different
// locations when doing simple scalar multiplies against the base point,
// and then another four locations using the second 16 elements.
//
// Tables for other points have table[i] = iG for i in 0 .. 16.
// fiat_p256_g_pre_comp is the table of precomputed base points
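//
// Note that the all-zero index-0 entry (the point at infinity) is not stored,
// so each of the two sub-tables below has 15 rows; |fiat_p256_select_point_affine|
// maps a window value |idx| to row |idx - 1| and supplies z = 1 (in Montgomery
// form) implicitly. For example, in the second sub-table, index 5 (bits 0101)
// corresponds to (2^160 + 2^32)G.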
#if defined(OPENSSL_64_BIT)
static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = {
{{{0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510,
0x18905f76a53755c6},
{0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325,
0x8571ff1825885d85}},
{{0x4f922fc516a0d2bb, 0x0d5cc16c1a623499, 0x9241cf3a57c62c8b,
0x2f5e6961fd1b667f},
{0x5c15c70bf5a01797, 0x3d20b44d60956192, 0x04911b37071fdb52,
0xf648f9168d6f0f7b}},
{{0x9e566847e137bbbc, 0xe434469e8a6a0bec, 0xb1c4276179d73463,
0x5abe0285133d0015},
{0x92aa837cc04c7dab, 0x573d9f4c43260c07, 0x0c93156278e6cc37,
0x94bb725b6b6f7383}},
{{0x62a8c244bfe20925, 0x91c19ac38fdce867, 0x5a96a5d5dd387063,
0x61d587d421d324f6},
{0xe87673a2a37173ea, 0x2384800853778b65, 0x10f8441e05bab43e,
0xfa11fe124621efbe}},
{{0x1c891f2b2cb19ffd, 0x01ba8d5bb1923c23, 0xb6d03d678ac5ca8e,
0x586eb04c1f13bedc},
{0x0c35c6e527e8ed09, 0x1e81a33c1819ede2, 0x278fd6c056c652fa,
0x19d5ac0870864f11}},
{{0x62577734d2b533d5, 0x673b8af6a1bdddc0, 0x577e7c9aa79ec293,
0xbb6de651c3b266b1},
{0xe7e9303ab65259b3, 0xd6a0afd3d03a7480, 0xc5ac83d19b3cfc27,
0x60b4619a5d18b99b}},
{{0xbd6a38e11ae5aa1c, 0xb8b7652b49e73658, 0x0b130014ee5f87ed,
0x9d0f27b2aeebffcd},
{0xca9246317a730a55, 0x9c955b2fddbbc83a, 0x07c1dfe0ac019a71,
0x244a566d356ec48d}},
{{0x56f8410ef4f8b16a, 0x97241afec47b266a, 0x0a406b8e6d9c87c1,
0x803f3e02cd42ab1b},
{0x7f0309a804dbec69, 0xa83b85f73bbad05f, 0xc6097273ad8e197f,
0xc097440e5067adc1}},
{{0x846a56f2c379ab34, 0xa8ee068b841df8d1, 0x20314459176c68ef,
0xf1af32d5915f1f30},
{0x99c375315d75bd50, 0x837cffbaf72f67bc, 0x0613a41848d7723f,
0x23d0f130e2d41c8b}},
{{0xed93e225d5be5a2b, 0x6fe799835934f3c6, 0x4314092622626ffc,
0x50bbb4d97990216a},
{0x378191c6e57ec63e, 0x65422c40181dcdb2, 0x41a8099b0236e0f6,
0x2b10011801fe49c3}},
{{0xfc68b5c59b391593, 0xc385f5a2598270fc, 0x7144f3aad19adcbb,
0xdd55899983fbae0c},
{0x93b88b8e74b82ff4, 0xd2e03c4071e734c9, 0x9a7a9eaf43c0322a,
0xe6e4c551149d6041}},
{{0x5fe14bfe80ec21fe, 0xf6ce116ac255be82, 0x98bc5a072f4a5d67,
0xfad27148db7e63af},
{0x90c0b6ac29ab05b3, 0x37a9a83c4e251ae6, 0x0a7dc875c2aade7d,
0x77387de39f0e1a84}},
{{0x1e9ecc49a56c0dd7, 0xa5cffcd846086c74, 0x8f7a1408f505aece,
0xb37b85c0bef0c47e},
{0x3596b6e4cc0e6a8f, 0xfd6d4bbf6b388f23, 0xaba453fac39cef4e,
0x9c135ac8f9f628d5}},
{{0x0a1c729495c8f8be, 0x2961c4803bf362bf, 0x9e418403df63d4ac,
0xc109f9cb91ece900},
{0xc2d095d058945705, 0xb9083d96ddeb85c0, 0x84692b8d7a40449b,
0x9bc3344f2eee1ee1}},
{{0x0d5ae35642913074, 0x55491b2748a542b1, 0x469ca665b310732a,
0x29591d525f1a4cc1},
{0xe76f5b6bb84f983f, 0xbe7eef419f5f84e1, 0x1200d49680baa189,
0x6376551f18ef332c}}},
{{{0x202886024147519a, 0xd0981eac26b372f0, 0xa9d4a7caa785ebc8,
0xd953c50ddbdf58e9},
{0x9d6361ccfd590f8f, 0x72e9626b44e6c917, 0x7fd9611022eb64cf,
0x863ebb7e9eb288f3}},
{{0x4fe7ee31b0e63d34, 0xf4600572a9e54fab, 0xc0493334d5e7b5a4,
0x8589fb9206d54831},
{0xaa70f5cc6583553a, 0x0879094ae25649e5, 0xcc90450710044652,
0xebb0696d02541c4f}},
{{0xabbaa0c03b89da99, 0xa6f2d79eb8284022, 0x27847862b81c05e8,
0x337a4b5905e54d63},
{0x3c67500d21f7794a, 0x207005b77d6d7f61, 0x0a5a378104cfd6e8,
0x0d65e0d5f4c2fbd6}},
{{0xd433e50f6d3549cf, 0x6f33696ffacd665e, 0x695bfdacce11fcb4,
0x810ee252af7c9860},
{0x65450fe17159bb2c, 0xf7dfbebe758b357b, 0x2b057e74d69fea72,
0xd485717a92731745}},
{{0xce1f69bbe83f7669, 0x09f8ae8272877d6b, 0x9548ae543244278d,
0x207755dee3c2c19c},
{0x87bd61d96fef1945, 0x18813cefb12d28c3, 0x9fbcd1d672df64aa,
0x48dc5ee57154b00d}},
{{0xef0f469ef49a3154, 0x3e85a5956e2b2e9a, 0x45aaec1eaa924a9c,
0xaa12dfc8a09e4719},
{0x26f272274df69f1d, 0xe0e4c82ca2ff5e73, 0xb9d8ce73b7a9dd44,
0x6c036e73e48ca901}},
{{0xe1e421e1a47153f0, 0xb86c3b79920418c9, 0x93bdce87705d7672,
0xf25ae793cab79a77},
{0x1f3194a36d869d0c, 0x9d55c8824986c264, 0x49fb5ea3096e945e,
0x39b8e65313db0a3e}},
{{0xe3417bc035d0b34a, 0x440b386b8327c0a7, 0x8fb7262dac0362d1,
0x2c41114ce0cdf943},
{0x2ba5cef1ad95a0b1, 0xc09b37a867d54362, 0x26d6cdd201e486c9,
0x20477abf42ff9297}},
{{0x0f121b41bc0a67d2, 0x62d4760a444d248a, 0x0e044f1d659b4737,
0x08fde365250bb4a8},
{0xaceec3da848bf287, 0xc2a62182d3369d6e, 0x3582dfdc92449482,
0x2f7e2fd2565d6cd7}},
{{0x0a0122b5178a876b, 0x51ff96ff085104b4, 0x050b31ab14f29f76,
0x84abb28b5f87d4e6},
{0xd5ed439f8270790a, 0x2d6cb59d85e3f46b, 0x75f55c1b6c1e2212,
0xe5436f6717655640}},
{{0xc2965ecc9aeb596d, 0x01ea03e7023c92b4, 0x4704b4b62e013961,
0x0ca8fd3f905ea367},
{0x92523a42551b2b61, 0x1eb7a89c390fcd06, 0xe7f1d2be0392a63e,
0x96dca2644ddb0c33}},
{{0x231c210e15339848, 0xe87a28e870778c8d, 0x9d1de6616956e170,
0x4ac3c9382bb09c0b},
{0x19be05516998987d, 0x8b2376c4ae09f4d6, 0x1de0b7651a3f933d,
0x380d94c7e39705f4}},
{{0x3685954b8c31c31d, 0x68533d005bf21a0c, 0x0bd7626e75c79ec9,
0xca17754742c69d54},
{0xcc6edafff6d2dbb2, 0xfd0d8cbd174a9d18, 0x875e8793aa4578e8,
0xa976a7139cab2ce6}},
{{0xce37ab11b43ea1db, 0x0a7ff1a95259d292, 0x851b02218f84f186,
0xa7222beadefaad13},
{0xa2ac78ec2b0a9144, 0x5a024051f2fa59c5, 0x91d1eca56147ce38,
0xbe94d523bc2ac690}},
{{0x2d8daefd79ec1a0f, 0x3bbcd6fdceb39c97, 0xf5575ffc58f61a95,
0xdbd986c4adf7b420},
{0x81aa881415f39eb7, 0x6ee2fcf5b98d976c, 0x5465475dcf2f717d,
0x8e24d3c46860bbd0}}}};
#else
static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = {
{{{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b,
0xa53755c6, 0x18905f76},
{0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688,
0x25885d85, 0x8571ff18}},
{{0x16a0d2bb, 0x4f922fc5, 0x1a623499, 0x0d5cc16c, 0x57c62c8b, 0x9241cf3a,
0xfd1b667f, 0x2f5e6961},
{0xf5a01797, 0x5c15c70b, 0x60956192, 0x3d20b44d, 0x071fdb52, 0x04911b37,
0x8d6f0f7b, 0xf648f916}},
{{0xe137bbbc, 0x9e566847, 0x8a6a0bec, 0xe434469e, 0x79d73463, 0xb1c42761,
0x133d0015, 0x5abe0285},
{0xc04c7dab, 0x92aa837c, 0x43260c07, 0x573d9f4c, 0x78e6cc37, 0x0c931562,
0x6b6f7383, 0x94bb725b}},
{{0xbfe20925, 0x62a8c244, 0x8fdce867, 0x91c19ac3, 0xdd387063, 0x5a96a5d5,
0x21d324f6, 0x61d587d4},
{0xa37173ea, 0xe87673a2, 0x53778b65, 0x23848008, 0x05bab43e, 0x10f8441e,
0x4621efbe, 0xfa11fe12}},
{{0x2cb19ffd, 0x1c891f2b, 0xb1923c23, 0x01ba8d5b, 0x8ac5ca8e, 0xb6d03d67,
0x1f13bedc, 0x586eb04c},
{0x27e8ed09, 0x0c35c6e5, 0x1819ede2, 0x1e81a33c, 0x56c652fa, 0x278fd6c0,
0x70864f11, 0x19d5ac08}},
{{0xd2b533d5, 0x62577734, 0xa1bdddc0, 0x673b8af6, 0xa79ec293, 0x577e7c9a,
0xc3b266b1, 0xbb6de651},
{0xb65259b3, 0xe7e9303a, 0xd03a7480, 0xd6a0afd3, 0x9b3cfc27, 0xc5ac83d1,
0x5d18b99b, 0x60b4619a}},
{{0x1ae5aa1c, 0xbd6a38e1, 0x49e73658, 0xb8b7652b, 0xee5f87ed, 0x0b130014,
0xaeebffcd, 0x9d0f27b2},
{0x7a730a55, 0xca924631, 0xddbbc83a, 0x9c955b2f, 0xac019a71, 0x07c1dfe0,
0x356ec48d, 0x244a566d}},
{{0xf4f8b16a, 0x56f8410e, 0xc47b266a, 0x97241afe, 0x6d9c87c1, 0x0a406b8e,
0xcd42ab1b, 0x803f3e02},
{0x04dbec69, 0x7f0309a8, 0x3bbad05f, 0xa83b85f7, 0xad8e197f, 0xc6097273,
0x5067adc1, 0xc097440e}},
{{0xc379ab34, 0x846a56f2, 0x841df8d1, 0xa8ee068b, 0x176c68ef, 0x20314459,
0x915f1f30, 0xf1af32d5},
{0x5d75bd50, 0x99c37531, 0xf72f67bc, 0x837cffba, 0x48d7723f, 0x0613a418,
0xe2d41c8b, 0x23d0f130}},
{{0xd5be5a2b, 0xed93e225, 0x5934f3c6, 0x6fe79983, 0x22626ffc, 0x43140926,
0x7990216a, 0x50bbb4d9},
{0xe57ec63e, 0x378191c6, 0x181dcdb2, 0x65422c40, 0x0236e0f6, 0x41a8099b,
0x01fe49c3, 0x2b100118}},
{{0x9b391593, 0xfc68b5c5, 0x598270fc, 0xc385f5a2, 0xd19adcbb, 0x7144f3aa,
0x83fbae0c, 0xdd558999},
{0x74b82ff4, 0x93b88b8e, 0x71e734c9, 0xd2e03c40, 0x43c0322a, 0x9a7a9eaf,
0x149d6041, 0xe6e4c551}},
{{0x80ec21fe, 0x5fe14bfe, 0xc255be82, 0xf6ce116a, 0x2f4a5d67, 0x98bc5a07,
0xdb7e63af, 0xfad27148},
{0x29ab05b3, 0x90c0b6ac, 0x4e251ae6, 0x37a9a83c, 0xc2aade7d, 0x0a7dc875,
0x9f0e1a84, 0x77387de3}},
{{0xa56c0dd7, 0x1e9ecc49, 0x46086c74, 0xa5cffcd8, 0xf505aece, 0x8f7a1408,
0xbef0c47e, 0xb37b85c0},
{0xcc0e6a8f, 0x3596b6e4, 0x6b388f23, 0xfd6d4bbf, 0xc39cef4e, 0xaba453fa,
0xf9f628d5, 0x9c135ac8}},
{{0x95c8f8be, 0x0a1c7294, 0x3bf362bf, 0x2961c480, 0xdf63d4ac, 0x9e418403,
0x91ece900, 0xc109f9cb},
{0x58945705, 0xc2d095d0, 0xddeb85c0, 0xb9083d96, 0x7a40449b, 0x84692b8d,
0x2eee1ee1, 0x9bc3344f}},
{{0x42913074, 0x0d5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665,
0x5f1a4cc1, 0x29591d52},
{0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496,
0x18ef332c, 0x6376551f}}},
{{{0x4147519a, 0x20288602, 0x26b372f0, 0xd0981eac, 0xa785ebc8, 0xa9d4a7ca,
0xdbdf58e9, 0xd953c50d},
{0xfd590f8f, 0x9d6361cc, 0x44e6c917, 0x72e9626b, 0x22eb64cf, 0x7fd96110,
0x9eb288f3, 0x863ebb7e}},
{{0xb0e63d34, 0x4fe7ee31, 0xa9e54fab, 0xf4600572, 0xd5e7b5a4, 0xc0493334,
0x06d54831, 0x8589fb92},
{0x6583553a, 0xaa70f5cc, 0xe25649e5, 0x0879094a, 0x10044652, 0xcc904507,
0x02541c4f, 0xebb0696d}},
{{0x3b89da99, 0xabbaa0c0, 0xb8284022, 0xa6f2d79e, 0xb81c05e8, 0x27847862,
0x05e54d63, 0x337a4b59},
{0x21f7794a, 0x3c67500d, 0x7d6d7f61, 0x207005b7, 0x04cfd6e8, 0x0a5a3781,
0xf4c2fbd6, 0x0d65e0d5}},
{{0x6d3549cf, 0xd433e50f, 0xfacd665e, 0x6f33696f, 0xce11fcb4, 0x695bfdac,
0xaf7c9860, 0x810ee252},
{0x7159bb2c, 0x65450fe1, 0x758b357b, 0xf7dfbebe, 0xd69fea72, 0x2b057e74,
0x92731745, 0xd485717a}},
{{0xe83f7669, 0xce1f69bb, 0x72877d6b, 0x09f8ae82, 0x3244278d, 0x9548ae54,
0xe3c2c19c, 0x207755de},
{0x6fef1945, 0x87bd61d9, 0xb12d28c3, 0x18813cef, 0x72df64aa, 0x9fbcd1d6,
0x7154b00d, 0x48dc5ee5}},
{{0xf49a3154, 0xef0f469e, 0x6e2b2e9a, 0x3e85a595, 0xaa924a9c, 0x45aaec1e,
0xa09e4719, 0xaa12dfc8},
{0x4df69f1d, 0x26f27227, 0xa2ff5e73, 0xe0e4c82c, 0xb7a9dd44, 0xb9d8ce73,
0xe48ca901, 0x6c036e73}},
{{0xa47153f0, 0xe1e421e1, 0x920418c9, 0xb86c3b79, 0x705d7672, 0x93bdce87,
0xcab79a77, 0xf25ae793},
{0x6d869d0c, 0x1f3194a3, 0x4986c264, 0x9d55c882, 0x096e945e, 0x49fb5ea3,
0x13db0a3e, 0x39b8e653}},
{{0x35d0b34a, 0xe3417bc0, 0x8327c0a7, 0x440b386b, 0xac0362d1, 0x8fb7262d,
0xe0cdf943, 0x2c41114c},
{0xad95a0b1, 0x2ba5cef1, 0x67d54362, 0xc09b37a8, 0x01e486c9, 0x26d6cdd2,
0x42ff9297, 0x20477abf}},
{{0xbc0a67d2, 0x0f121b41, 0x444d248a, 0x62d4760a, 0x659b4737, 0x0e044f1d,
0x250bb4a8, 0x08fde365},
{0x848bf287, 0xaceec3da, 0xd3369d6e, 0xc2a62182, 0x92449482, 0x3582dfdc,
0x565d6cd7, 0x2f7e2fd2}},
{{0x178a876b, 0x0a0122b5, 0x085104b4, 0x51ff96ff, 0x14f29f76, 0x050b31ab,
0x5f87d4e6, 0x84abb28b},
{0x8270790a, 0xd5ed439f, 0x85e3f46b, 0x2d6cb59d, 0x6c1e2212, 0x75f55c1b,
0x17655640, 0xe5436f67}},
{{0x9aeb596d, 0xc2965ecc, 0x023c92b4, 0x01ea03e7, 0x2e013961, 0x4704b4b6,
0x905ea367, 0x0ca8fd3f},
{0x551b2b61, 0x92523a42, 0x390fcd06, 0x1eb7a89c, 0x0392a63e, 0xe7f1d2be,
0x4ddb0c33, 0x96dca264}},
{{0x15339848, 0x231c210e, 0x70778c8d, 0xe87a28e8, 0x6956e170, 0x9d1de661,
0x2bb09c0b, 0x4ac3c938},
{0x6998987d, 0x19be0551, 0xae09f4d6, 0x8b2376c4, 0x1a3f933d, 0x1de0b765,
0xe39705f4, 0x380d94c7}},
{{0x8c31c31d, 0x3685954b, 0x5bf21a0c, 0x68533d00, 0x75c79ec9, 0x0bd7626e,
0x42c69d54, 0xca177547},
{0xf6d2dbb2, 0xcc6edaff, 0x174a9d18, 0xfd0d8cbd, 0xaa4578e8, 0x875e8793,
0x9cab2ce6, 0xa976a713}},
{{0xb43ea1db, 0xce37ab11, 0x5259d292, 0x0a7ff1a9, 0x8f84f186, 0x851b0221,
0xdefaad13, 0xa7222bea},
{0x2b0a9144, 0xa2ac78ec, 0xf2fa59c5, 0x5a024051, 0x6147ce38, 0x91d1eca5,
0xbc2ac690, 0xbe94d523}},
{{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc,
0xadf7b420, 0xdbd986c4},
{0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d,
0x6860bbd0, 0x8e24d3c4}}}};
#endif

vendor/ring/crypto/fipsmodule/ec/util.h

@@ -0,0 +1,258 @@
// Copyright 2015 The BoringSSL Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ring-core/base.h>
#include "../../internal.h"
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconversion"
#pragma GCC diagnostic ignored "-Wsign-conversion"
#endif
// This function looks at 5+1 scalar bits (5 current, 1 adjacent less
// significant bit), and recodes them into a signed digit for use in fast point
// multiplication: the use of signed rather than unsigned digits means that
// fewer points need to be precomputed, given that point inversion is easy (a
// precomputed point dP makes -dP available as well).
//
// BACKGROUND:
//
// Signed digits for multiplication were introduced by Booth ("A signed binary
// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
// Booth's original encoding did not generally improve the density of nonzero
// digits over the binary representation, and was merely meant to simplify the
// handling of signed factors given in two's complement; but it has since been
// shown to be the basis of various signed-digit representations that do have
// further advantages, including the wNAF, using the following general
// approach:
//
// (1) Given a binary representation
//
// b_k ... b_2 b_1 b_0,
//
// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
// by using bit-wise subtraction as follows:
//
// b_k b_(k-1) ... b_2 b_1 b_0
// - b_k ... b_3 b_2 b_1 b_0
// -----------------------------------------
// s_(k+1) s_k ... s_3 s_2 s_1 s_0
//
// A left-shift followed by subtraction of the original value yields a new
// representation of the same value, using signed bits s_i = b_(i-1) - b_i.
// This representation from Booth's paper has since appeared in the
// literature under a variety of different names including "reversed binary
// form", "alternating greedy expansion", "mutual opposite form", and
// "sign-alternating {+-1}-representation".
//
// An interesting property is that among the nonzero bits, values 1 and -1
// strictly alternate.
//
// (2) Various window schemes can be applied to the Booth representation of
// integers: for example, right-to-left sliding windows yield the wNAF
// (a signed-digit encoding independently discovered by various researchers
// in the 1990s), and left-to-right sliding windows yield a left-to-right
// equivalent of the wNAF (independently discovered by various researchers
// around 2004).
//
// To prevent leaking information through side channels in point multiplication,
// we need to recode the given integer into a regular pattern: sliding windows
// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
// decades older: we'll be using the so-called "modified Booth encoding" due to
// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
// (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five
// signed bits into a signed digit:
//
// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j)
//
// The sign-alternating property implies that the resulting digit values are
// integers from -16 to 16.
//
// Of course, we don't actually need to compute the signed digits s_i as an
// intermediate step (that's just a nice way to see how this scheme relates
// to the wNAF): a direct computation obtains the recoded digit from the
// six bits b_(5j + 4) ... b_(5j - 1).
//
// This function takes those six bits as an integer (0 .. 63), writing the
// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
// value, in the range 0 .. 16). Note that this integer essentially provides
// the input bits "shifted to the left" by one position: for example, the input
// to compute the least significant recoded digit, given that there's no bit
// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0.
//
// DOUBLING CASE:
//
// Point addition formulas for short Weierstrass curves are often incomplete.
// Edge cases such as P + P or P + ∞ must be handled separately. This
// complicates constant-time requirements. P + ∞ cannot be avoided (any window
// may be zero) and is handled with constant-time selects. P + P (where P is not
// ∞) usually is not. Instead, windowing strategies are chosen to avoid this
// case. Whether this happens depends on the group order.
//
// Let w be the window width (in this function, w = 5). The non-trivial doubling
// case in single-point scalar multiplication may occur if and only if the
// 2^(w-1) bit of the group order is zero.
//
// Note the above only holds if the scalar is fully reduced and the group order
// is a prime that is much larger than 2^w. It also only holds when windows
// are applied from most significant to least significant, doubling between each
// window. It does not apply to more complex table strategies such as
// |EC_nistz256_method|.
//
// PROOF:
//
// Let n be the group order. Let l be the number of bits needed to represent n.
// Assume there exists some 0 <= k < n such that signed w-bit windowed
// multiplication hits the doubling case.
//
// Windowed multiplication consists of iterating over groups of s_i (defined
// above based on k's binary representation) from most to least significant. At
// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant
// window), we:
//
// 1. Double the accumulator A, w times. Let A_i be the value of A at this
// point.
//
// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P
// corresponding to the window s_(i+w-1) ... s_i.
//
// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as
// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i.
// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is
// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i =
// 2^w * (a_(i+w) + t_(i+w)).
//
// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it
// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i.
// This is computed as:
//
// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1)
// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i
// --------------------------------------------
// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i
//
// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer
// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i.
//
// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x)
// = x - 2^(w-1)*b_(i+w-1) + b_(i-1)
//
// Or, using C notation for bit operations:
//
// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1
//
// Note b_(i-1) is added in left-shifted by one (or doubled) from its place.
// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed
// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C
// notation, this is:
//
// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w
//
// Observe that, while t_i may be positive or negative, a_i is bounded by
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
// all groups. That would imply the subsequent a_i is zero, which means all
// terms thus far were zero.)
//
// Returning to our doubling position, we have a_j = t_j (mod n). We now
// determine the value of a_j - t_j, which must be divisible by n. Our bounds on
// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w
// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if
// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n.
//
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
//
// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j
// <= k/2^j + 2^w - t_j
// < n/2^w + 2^w + 2^(w-1)
//
// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final
// addition may hit the doubling case.
//
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the
// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0.
// That is:
//
// - 2^w divides k_H
// - k_M is 0 or 2^(w-1)
// - 0 <= k_L < 2^(w-1)
//
// Divide n into n_H + n_M + n_L similarly. We thus have:
//
// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1
// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1))
// = k_L - k_M
//
// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w
// = (k>>w) << w + ((k>>(w-1)) & 1) << w
// = k_H + 2*k_M
//
// n = a_0 - t_0
// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M)
// = k_H + 3*k_M - k_L
//
// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be
// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H.
// Then,
//
// n_M + n_L = 3*(2^(w-1)) - k_L
// > 3*(2^(w-1)) - 2^(w-1)
// = 2^w
//
// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose
// k_H < n_H - 2*2^w. Then,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// < n_H - 2*2^w + 3*(2^(w-1)) - k_L
// n_M + n_L < -2^(w-1) - k_L
//
// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus,
//
// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L
// = n_H - 2^w + 3*(2^(w-1)) - k_L
// n_M + n_L = 2^(w-1) - k_L
// <= 2^(w-1)
//
// Equality would mean 2^(w-1) divides n, which is impossible if n is prime.
// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition.
//
// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w,
// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point
// doubling in the final addition and is the only such scalar.
//
// COMMON CURVES:
//
// The group orders for common curves end in the following bit patterns:
//
// P-521: ...00001001; w = 4 is okay
// P-384: ...01110011; w = 2, 5, 6, 7 are okay
// P-256: ...01010001; w = 5, 7 are okay
// P-224: ...00111101; w = 3, 4, 5, 6 are okay
static inline void recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit,
crypto_word_t in) {
crypto_word_t s, d;
s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as
* 6-bit value */
d = (1 << 6) - in - 1;
d = (d & s) | (in & ~s);
d = (d >> 1) + (d & 1);
*sign = s & 1;
*digit = d;
}
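// Worked example: in = 44 = 0b101100, i.e. b_4 b_3 b_2 b_1 b_0 b_-1 =
// 1 0 1 1 0 0. Since in >= 32, s is all ones, so d = 63 - 44 = 19 and the
// final digit is (19 >> 1) + (19 & 1) = 10 with *sign = 1, i.e. the recoded
// digit is -10. This matches the signed-bit sum -16 + 8 + 0 - 2 + 0 = -10.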

File diff suppressed because it is too large


@@ -0,0 +1,625 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte [on
# single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
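# These are the FIPS 180-4 SHA-256 rotate/shift amounts:
# Sigma0(x) = ROR(x,2) ^ ROR(x,13) ^ ROR(x,22),
# Sigma1(x) = ROR(x,6) ^ ROR(x,11) ^ ROR(x,25),
# sigma0(x) = ROR(x,7) ^ ROR(x,18) ^ SHR(x,3),
# sigma1(x) = ROR(x,17) ^ ROR(x,19) ^ SHR(x,10).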
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
$code=<<___;
#ifdef __KERNEL__
# define __ARM_ARCH __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
.align 5
.global sha256_block_data_order_nohw
.type sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
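# For example, &vext_8($T0,@X[0],@X[1],4) emits "vext.8 q8,q0,q1,#4": the
# underscore becomes a dot and a bare numeric final argument gets a "#" prefix.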
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.LK256_shortcut_neon:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add_neon+4)
#else
.word K256-(.LK256_add_neon+8)
#endif
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
@ K256 is just at the boundary of being easily referenced by an ADR from
@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
@ not fit. By moving code around, we could make it fit, but this is too
@ fragile. For simplicity, just load the offset from
@ .LK256_shortcut_neon.
@
@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
@ support it. We might be able to emulate it with a macro, but Android's
@ did not work when I tried it.
@ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm
ldr $Ktbl,.LK256_shortcut_neon
.LK256_add_neon:
add $Ktbl,pc,$Ktbl
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
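# Copy this script's leading comments into the assembly output, skipping
# the '#!' line and rewriting '#' comments as '@' comments.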
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,649 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. As a side note, Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
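The LO/HI comment pairs inside BODY_00_15 above spell out how each 64-bit rotate of Sigma1 and Sigma0 is split across two 32-bit registers. A minimal C sketch of the Sigma1 case (rotates 14, 18 and 41; the 41-bit rotate crosses the word boundary, so the halves swap roles; illustrative only, not part of the generated code):

#include <stdint.h>

/* Sigma1(e) for SHA-512 computed on 32-bit halves, mirroring the
 * "LO/HI" expressions in the generated code above. */
static void sigma1_512_halves(uint32_t elo, uint32_t ehi,
                              uint32_t *lo, uint32_t *hi) {
  *lo  = (elo >> 14) ^ (ehi << 18);  /* ROTR 14, low half  */
  *hi  = (ehi >> 14) ^ (elo << 18);  /* ROTR 14, high half */
  *lo ^= (elo >> 18) ^ (ehi << 14);  /* ROTR 18 */
  *hi ^= (ehi >> 18) ^ (elo << 14);
  *lo ^= (ehi >> 9)  ^ (elo << 23);  /* ROTR 41 = 32+9: halves swap */
  *hi ^= (elo >> 9)  ^ (ehi << 23);
}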
$code=<<___;
#ifndef __KERNEL__
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code 32
#endif
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.global sha512_block_data_order_nohw
.type sha512_block_data_order_nohw,%function
sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
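The vbsl instructions in NEON_00_15 above compute Ch and Maj without extra logic ops: with the destination preloaded to e, vbsl selects f where e has ones and g where it has zeros, giving Ch(e,f,g); with the destination preloaded to a^b, it gives Maj(a,b,c). A minimal C check of the Maj form (illustrative only):

#include <stdint.h>

/* Bit-select with mask m = a ^ b: picks c where a and b disagree and b
 * where they agree, which is exactly the majority of a, b, c. */
static uint64_t maj_via_bsl(uint64_t a, uint64_t b, uint64_t c) {
  uint64_t m = a ^ b;
  return (m & c) | (~m & b);
}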
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
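NEON_16_79 above extends the message schedule two words at a time (the "2x-vectorized" comment). A minimal scalar C sketch of the step it vectorizes, assuming a 16-entry ring buffer X[] that holds the previous schedule words (illustrative only, not part of the generated code):

#include <stdint.h>

static uint64_t rotr64(uint64_t x, unsigned n) {
  return (x >> n) | (x << (64 - n));
}

/* W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]), with the
 * last 16 words kept in X[] so that W[i-16] lives at X[i % 16]. */
static uint64_t sha512_schedule(uint64_t X[16], unsigned i) {
  uint64_t w15 = X[(i + 1) % 16], w2 = X[(i + 14) % 16];
  uint64_t s0 = rotr64(w15, 1) ^ rotr64(w15, 8) ^ (w15 >> 7);
  uint64_t s1 = rotr64(w2, 19) ^ rotr64(w2, 61) ^ (w2 >> 6);
  X[i % 16] += s0 + X[(i + 9) % 16] + s1;
  return X[i % 16];
}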
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
print $code;
close STDOUT or die "error closing STDOUT: $!"; # enforce flush


@@ -0,0 +1,575 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#               SHA256-hw   SHA256(*)      SHA512
# Apple A7      1.97        10.5 (+33%)    6.73 (-1%(**))
# Cortex-A53    2.38        15.5 (+115%)   10.0 (+150%(***))
# Cortex-A57    2.31        11.6 (+86%)    7.51 (+260%(***))
# Denver        2.01        10.5 (+26%)    6.70 (+8%)
# X-Gene        -           20.0 (+100%)   12.8 (+300%(***))
# Mongoose      2.36        13.0 (+50%)    8.36 (+33%)
# Kryo          1.92        17.4 (+30%)    11.2 (+8%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are an
# indication of some compiler "pathology"; most notably, code
# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
my ($flavour, $output) = @ARGV;
if ($output =~ /sha512-armv8/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$func="sha${BITS}_block_data_order_nohw";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __AARCH64EB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-and-logical operations such as
# 'eor x,y,z,ror#n', they were found to negatively affect performance
# on Apple A7. The reason seems to be that they require even 'y' to
# be available earlier. This means that such a merged instruction is
# not necessarily the best choice on the critical path... On the other
# hand, Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical ones... See the (**) footnote above.
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
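The and/bic/orr triple in the rounds above computes Ch(e,f,g) without a select: (f & e) and (g & ~e) can never have overlapping bits, so their OR equals the canonical XOR form. A minimal C check (illustrative only):

#include <stdint.h>

/* (f & e) | (g & ~e) == (e & f) ^ (~e & g) for all inputs, since the
 * two terms are bitwise disjoint; this is what and/bic/orr compute. */
static uint64_t ch_via_bic(uint64_t e, uint64_t f, uint64_t g) {
  return (f & e) | (g & ~e);
}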
$code.=<<___;
#ifndef __KERNEL__
#endif
.text
.globl $func
.type $func,%function
.align 6
$func:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adrp $Ktbl,:pg_hi21:.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $func,.-$func
.section .rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 6
sha256_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adrp $Ktbl,:pg_hi21:.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}
if ($SZ==8) {
my $Ktbl="x3";
my @H = map("v$_.16b",(0..4));
my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
my @MSG=map("v$_.16b",(16..23));
my ($W0,$W1)=("v24.2d","v25.2d");
my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha512_block_data_order_hw
.type sha512_block_data_order_hw,%function
.align 6
sha512_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
ld1 {@MSG[4]-@MSG[7]},[$inp],#64
ld1.64 {@H[0]-@H[3]},[$ctx] // load context
adrp $Ktbl,:pg_hi21:.LK512
add $Ktbl,$Ktbl,:lo12:.LK512
rev64 @MSG[0],@MSG[0]
rev64 @MSG[1],@MSG[1]
rev64 @MSG[2],@MSG[2]
rev64 @MSG[3],@MSG[3]
rev64 @MSG[4],@MSG[4]
rev64 @MSG[5],@MSG[5]
rev64 @MSG[6],@MSG[6]
rev64 @MSG[7],@MSG[7]
b .Loop_hw
.align 4
.Loop_hw:
ld1.64 {$W0},[$Ktbl],#16
subs $num,$num,#1
sub x4,$inp,#128
orr $AB,@H[0],@H[0] // offload
orr $CD,@H[1],@H[1]
orr $EF,@H[2],@H[2]
orr $GH,@H[3],@H[3]
csel $inp,$inp,x4,ne // conditional rewind
___
for($i=0;$i<32;$i++) {
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1.64 {$W1},[$Ktbl],#16
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512su0 @MSG[0],@MSG[1]
ext $m9_10,@MSG[4],@MSG[5],#8
sha512h @H[3],$fg,$de
sha512su1 @MSG[0],@MSG[7],$m9_10
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1.64 {$W1},[$Ktbl],#16
___
$code.=<<___ if ($i==39);
sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind
___
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1 {@MSG[0]},[$inp],#16 // load next input
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512h @H[3],$fg,$de
rev64 @MSG[0],@MSG[0]
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
$code.=<<___;
add.i64 @H[0],@H[0],$AB // accumulate
add.i64 @H[1],@H[1],$CD
add.i64 @H[2],@H[2],$EF
add.i64 @H[3],@H[3],$GH
cbnz $num,.Loop_hw
st1.64 {@H[0]-@H[3]},[$ctx] // store context
ldr x29,[sp],#16
ret
.size sha512_block_data_order_hw,.-sha512_block_data_order_hw
#endif
___
}
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
{ my %opcode = (
"sha512h" => 0xce608000, "sha512h2" => 0xce608400,
"sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 );
sub unsha512 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
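The unsha256/unsha512 helpers above hand-encode the crypto instructions for assemblers that lack them: the base opcode is OR'd with Rd, Rn shifted left by 5 and Rm shifted left by 16, taken from the register numbers in the operand string. A small C check of one such encoding (illustrative only; this is the word the helper would emit for "sha256h v0,v1,v2"):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* sha256h v0,v1,v2: base opcode | Rd | Rn<<5 | Rm<<16 */
  uint32_t base = 0x5e004000;
  uint32_t inst = base | 0u | (1u << 5) | (2u << 16);
  printf(".inst 0x%08x\n", inst);  /* prints .inst 0x5e024020 */
  return 0;
}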
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.[ui]?8(\s)/$1/;
s/\.\w?64\b// and s/\.16b/\.2d/g or
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/\bext\b/ and s/\.2d/\.16b/g or
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large