cli/vendor/aws-lc-sys/aws-lc/crypto/fipsmodule/ec/ec_nistp.c

// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

// In this file we will implement elliptic curve point operations for
// NIST curves P-256, P-384, and P-521. The idea is to implement the operations
// in a generic way such that the code can be reused instead of having
// a separate implementation for each of the curves. We implement:
//   1. point addition,
//   2. point doubling,
//   3. scalar multiplication of a base point,
//   4. scalar multiplication of an arbitrary point,
//   5. scalar multiplication of a base and an arbitrary point.
//
// Matrix of what has been done so far:
//
// | op | P-521 | P-384 | P-256 |
// |----------------------------|
// | 1. |   x   |   x   |   x*  |
// | 2. |   x   |   x   |   x*  |
// | 3. |   x   |   x   |   x*  |
// | 4. |   x   |   x   |   x*  |
// | 5. |   x   |   x   |   x*  |
//  * For P-256, only the Fiat-crypto implementation in p256.c is replaced.

#include "ec_nistp.h"

// Some of the functions below need temporary field element variables.
// To avoid dynamic allocation we define the |ec_nistp_felem| type to have the
// maximum size possible (which is currently the P-521 curve). The values are
// hard-coded for the moment, this will be fixed when we migrate the whole P-521
// implementation to ec_nistp.c.
//
// Note that functions in this file only touch the first |felem_num_limbs|
// limbs (as reported by the |ec_nistp_meth| in use) of such a variable when
// they copy or select field elements themselves.
#if defined(EC_NISTP_USE_64BIT_LIMB)
// Limb count used when EC_NISTP_USE_64BIT_LIMB is defined.
#define FELEM_MAX_NUM_OF_LIMBS (9)
#else
// Limb count used otherwise.
#define FELEM_MAX_NUM_OF_LIMBS (19)
#endif
typedef ec_nistp_felem_limb ec_nistp_felem[FELEM_MAX_NUM_OF_LIMBS];

// Constant-time conditional copy of a |num_limbs|-limb value:
//   out = (t == 0) ? z : nz.
// Each limb is read from both sources before the matching limb of |out| is
// written, so |out| may be the same buffer as |z| or |nz|.
static void cmovznz(ec_nistp_felem_limb *out,
                    size_t num_limbs,
                    ec_nistp_felem_limb t,
                    const ec_nistp_felem_limb *z,
                    const ec_nistp_felem_limb *nz) {
  // Selection mask derived once from |t| and reused for every limb.
  const ec_nistp_felem_limb pick_z = constant_time_is_zero_w(t);
  size_t limb = 0;
  while (limb < num_limbs) {
    out[limb] = constant_time_select_w(pick_z, z[limb], nz[limb]);
    limb++;
  }
}

// Group operations
// ----------------
//
// Building on top of the field operations we have the operations on the
// elliptic curve group itself. Points on the curve are represented in Jacobian
// coordinates.
//
// ec_nistp_point_double calculates 2*(x_in, y_in, z_in)
//
// The input and the output are points in Jacobian coordinates. All field
// arithmetic is done through the callbacks of |ctx| (felem_add, felem_sub,
// felem_mul and felem_sqr).
//
// The method is based on:
//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
// for which there is a Coq transcription and correctness proof:
//   <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L93>
//   <https://github.com/mit-plv/fiat-crypto/blob/79f8b5f39ed609339f0233098dee1a3c4e6b3080/src/Curves/Weierstrass/Jacobian.v#L201>
//
// However, we slightly changed the computation for efficiency (see the full
// explanation within the function body), which makes the Coq proof above
// not applicable to our implementation.
// TODO(awslc): Write a Coq correctness proof for our version of the algorithm.
//
// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
// Mixing coordinates, e.g. x_out == y_in, is not supported: x_out is written
// before y_in is read for the last time (and this is not tested).
void ec_nistp_point_double(const ec_nistp_meth *ctx,
                           ec_nistp_felem_limb *x_out,
                           ec_nistp_felem_limb *y_out,
                           ec_nistp_felem_limb *z_out,
                           const ec_nistp_felem_limb *x_in,
                           const ec_nistp_felem_limb *y_in,
                           const ec_nistp_felem_limb *z_in) {
  ec_nistp_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta;
  // delta = z^2
  ctx->felem_sqr(delta, z_in);
  // gamma = y^2
  ctx->felem_sqr(gamma, y_in);
  // beta = x*gamma
  ctx->felem_mul(beta, x_in, gamma);

  // alpha = 3*(x-delta)*(x+delta)
  ctx->felem_sub(ftmp, x_in, delta);
  ctx->felem_add(ftmp2, x_in, delta);

  // ftmp2 = 3*(x+delta), computed with two additions: 2*ftmp2 + ftmp2.
  ctx->felem_add(tmptmp, ftmp2, ftmp2);
  ctx->felem_add(ftmp2, ftmp2, tmptmp);
  ctx->felem_mul(alpha, ftmp, ftmp2);

  // x' = alpha^2 - 8*beta
  // |x_in| is not read past this point, so writing |x_out| is safe even when
  // x_out == x_in. |fourbeta| is kept because y' needs it below.
  ctx->felem_sqr(x_out, alpha);
  ctx->felem_add(fourbeta, beta, beta);
  ctx->felem_add(fourbeta, fourbeta, fourbeta);
  ctx->felem_add(tmptmp, fourbeta, fourbeta);
  ctx->felem_sub(x_out, x_out, tmptmp);

  // z' = (y + z)^2 - gamma - delta
  // The following calculation differs from the Coq proof cited above.
  // The proof is for:
  //   add(delta, gamma, delta);
  //   add(ftmp, y_in, z_in);
  //   square(z_out, ftmp);
  //   sub(z_out, z_out, delta);
  // Our operations sequence is a bit more efficient because it saves us
  // a certain number of conditional moves.
  // This is the last read of |y_in| and |z_in|; |z_out| and |y_out| are only
  // written afterwards.
  ctx->felem_add(ftmp, y_in, z_in);
  ctx->felem_sqr(z_out, ftmp);
  ctx->felem_sub(z_out, z_out, gamma);
  ctx->felem_sub(z_out, z_out, delta);

  // y' = alpha*(4*beta - x') - 8*gamma^2
  // |gamma| is reused as scratch: 2*gamma -> 4*gamma^2 -> 8*gamma^2.
  ctx->felem_sub(y_out, fourbeta, x_out);
  ctx->felem_add(gamma, gamma, gamma);
  ctx->felem_sqr(gamma, gamma);
  ctx->felem_mul(y_out, alpha, y_out);
  ctx->felem_add(gamma, gamma, gamma);
  ctx->felem_sub(y_out, y_out, gamma);
}

// ec_nistp_point_add calculates (x1, y1, z1) + (x2, y2, z2)
//
// The method is taken from:
//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
//
// Coq transcription and correctness proof:
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
//
// When |mixed| is non-zero the formulas treat z2 as 1 and skip the
// corresponding multiplications; |z2| is still read to detect the point at
// infinity (z2 = 0). When either input has a zero z coordinate, the other
// input is returned unchanged.
//
// The sum is built in local temporaries and copied to (x3, y3, z3) only at the
// very end, so an output may alias the corresponding input (callers in this
// file pass x3 == x1, y3 == y1, z3 == z1).
//
// This function includes a branch for checking whether the two input points
// are equal, (while not equal to the point at infinity). This case should
// never happen during single point multiplication, so there is no timing leak
// for ECDH and ECDSA.
void ec_nistp_point_add(const ec_nistp_meth *ctx,
                        ec_nistp_felem_limb *x3,
                        ec_nistp_felem_limb *y3,
                        ec_nistp_felem_limb *z3,
                        const ec_nistp_felem_limb *x1,
                        const ec_nistp_felem_limb *y1,
                        const ec_nistp_felem_limb *z1,
                        const int mixed,
                        const ec_nistp_felem_limb *x2,
                        const ec_nistp_felem_limb *y2,
                        const ec_nistp_felem_limb *z2) {
  ec_nistp_felem x_out, y_out, z_out;

  // Remember whether each input is the point at infinity; these flags are
  // zero exactly when the final selection below picks the other input.
  ec_nistp_felem_limb z1nz = ctx->felem_nz(z1);
  ec_nistp_felem_limb z2nz = ctx->felem_nz(z2);

  // z1z1 = z1**2
  ec_nistp_felem z1z1;
  ctx->felem_sqr(z1z1, z1);

  ec_nistp_felem u1, s1, two_z1z2;
  if (!mixed) {
    // z2z2 = z2**2
    ec_nistp_felem z2z2;
    ctx->felem_sqr(z2z2, z2);

    // u1 = x1*z2z2
    ctx->felem_mul(u1, x1, z2z2);

    // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
    ctx->felem_add(two_z1z2, z1, z2);
    ctx->felem_sqr(two_z1z2, two_z1z2);
    ctx->felem_sub(two_z1z2, two_z1z2, z1z1);
    ctx->felem_sub(two_z1z2, two_z1z2, z2z2);

    // s1 = y1 * z2**3
    ctx->felem_mul(s1, z2, z2z2);
    ctx->felem_mul(s1, s1, y1);
  } else {
    // We'll assume z2 = 1 (special case z2 = 0 is handled later).

    // u1 = x1*z2z2
    OPENSSL_memcpy(u1, x1, ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb));
    // two_z1z2 = 2z1z2
    ctx->felem_add(two_z1z2, z1, z1);
    // s1 = y1 * z2**3
    OPENSSL_memcpy(s1, y1, ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb));
  }

  // u2 = x2*z1z1
  ec_nistp_felem u2;
  ctx->felem_mul(u2, x2, z1z1);

  // h = u2 - u1
  ec_nistp_felem h;
  ctx->felem_sub(h, u2, u1);

  // xneq is zero when h == 0.
  ec_nistp_felem_limb xneq = ctx->felem_nz(h);

  // z_out = two_z1z2 * h
  ctx->felem_mul(z_out, h, two_z1z2);

  // z1z1z1 = z1 * z1z1
  ec_nistp_felem z1z1z1;
  ctx->felem_mul(z1z1z1, z1, z1z1);

  // s2 = y2 * z1**3
  ec_nistp_felem s2;
  ctx->felem_mul(s2, y2, z1z1z1);

  // r = (s2 - s1)*2
  ec_nistp_felem r;
  ctx->felem_sub(r, s2, s1);
  ctx->felem_add(r, r, r);

  // yneq is zero when r == 0.
  ec_nistp_felem_limb yneq = ctx->felem_nz(r);

  // The addition formulas do not work when both inputs are the same finite
  // point (h == 0 and r == 0 with z1 != 0 and z2 != 0); fall back to doubling.
  // This case will never occur in the constant-time |ec_GFp_mont_mul|.
  ec_nistp_felem_limb is_nontrivial_double =
                                     constant_time_is_zero_w(xneq | yneq) &
                                    ~constant_time_is_zero_w(z1nz) &
                                    ~constant_time_is_zero_w(z2nz);
  if (constant_time_declassify_w(is_nontrivial_double)) {
    ec_nistp_point_double(ctx, x3, y3, z3, x1, y1, z1);
    return;
  }

  // I = (2h)**2
  ec_nistp_felem i;
  ctx->felem_add(i, h, h);
  ctx->felem_sqr(i, i);

  // J = h * I
  ec_nistp_felem j;
  ctx->felem_mul(j, h, i);

  // V = U1 * I
  ec_nistp_felem v;
  ctx->felem_mul(v, u1, i);

  // x_out = r**2 - J - 2V
  ctx->felem_sqr(x_out, r);
  ctx->felem_sub(x_out, x_out, j);
  ctx->felem_sub(x_out, x_out, v);
  ctx->felem_sub(x_out, x_out, v);

  // y_out = r(V-x_out) - 2 * s1 * J
  ctx->felem_sub(y_out, v, x_out);
  ctx->felem_mul(y_out, y_out, r);
  ec_nistp_felem s1j;
  ctx->felem_mul(s1j, s1, j);
  ctx->felem_sub(y_out, y_out, s1j);
  ctx->felem_sub(y_out, y_out, s1j);

  // Handle the points at infinity without branching:
  //   if z1 == 0 the result is (x2, y2, z2),
  //   if z2 == 0 the result is (x1, y1, z1),
  //   otherwise it is the sum computed above.
  // The second group of selects writes the final outputs.
  cmovznz(x_out, ctx->felem_num_limbs, z1nz, x2, x_out);
  cmovznz(y_out, ctx->felem_num_limbs, z1nz, y2, y_out);
  cmovznz(z_out, ctx->felem_num_limbs, z1nz, z2, z_out);
  cmovznz(x3, ctx->felem_num_limbs, z2nz, x1, x_out);
  cmovznz(y3, ctx->felem_num_limbs, z2nz, y1, y_out);
  cmovznz(z3, ctx->felem_num_limbs, z2nz, z1, z_out);
}

// Extracts bit number |i| of the scalar |in| and returns it as 0 or 1.
// No range check is done here: the caller must keep |i| within the scalar.
static int16_t get_bit(const EC_SCALAR *in, size_t i) {
// |in->words| holds BN_ULONGs, which are 8 bytes wide on 64-bit builds and
// 4 bytes wide otherwise; pick the word and the bit inside it accordingly.
#if defined(OPENSSL_64_BIT)
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) == 8, bn_ulong_not_eight_bytes);
  const size_t word_idx = i / 64;
  const size_t bit_idx = i % 64;
#else
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) == 4, bn_ulong_not_four_bytes);
  const size_t word_idx = i / 32;
  const size_t bit_idx = i % 32;
#endif
  return (int16_t)((in->words[word_idx] >> bit_idx) & 1);
}

// Integer division of |a| by |b| rounded up, for non-negative |a| and
// positive |b|. Every use of an argument is parenthesized so that compound
// expressions (e.g. DIV_AND_CEIL(n, w + 1)) expand correctly. |b| is expanded
// more than once, so the arguments must be free of side effects.
#define DIV_AND_CEIL(a, b) (((a) + (b) - 1) / (b))

// Compute "regular" wNAF representation of a scalar, see
// Joye, Tunstall, "Exponent Recoding and Regular Exponentiation Algorithms",
// AfricaCrypt 2009, Alg 6.
// It forces an odd scalar and outputs digits in
// {\pm 1, \pm 3, \pm 5, \pm 7, \pm 9, ...}
// i.e. signed odd digits with _no zeroes_ -- that makes it "regular".
static void scalar_rwnaf(int16_t *out, size_t window_size,
                         const EC_SCALAR *scalar, size_t scalar_bit_size) {
  assert(window_size < 14);

  // The assert above ensures this works correctly.
  const int16_t window_mask = (1 << (window_size + 1)) - 1;
  int16_t window = (int16_t)(scalar->words[0] & (BN_ULONG)window_mask);
  window |= 1;

  const size_t num_windows = DIV_AND_CEIL(scalar_bit_size, window_size);
  for (size_t i = 0; i < num_windows - 1; i++) {
    int16_t d = (window & window_mask) - (int16_t)(1 << window_size);
    out[i] = d;
    window = (window - d) >> window_size;
    for (size_t j = 1; j <= window_size; j++) {
      size_t idx = (i + 1) * window_size + j;
      if (idx < scalar_bit_size) {
        window |= get_bit(scalar, idx) << j;
      }
    }
  }
  out[num_windows - 1] = window;
}

// The window size for scalar multiplication is hard coded for now.
#define SCALAR_MUL_WINDOW_SIZE (5)
// Number of points in one table of odd multiples: 2^(w-1), i.e. 16 for w = 5.
#define SCALAR_MUL_TABLE_NUM_POINTS (1 << (SCALAR_MUL_WINDOW_SIZE - 1))

// To avoid dynamic allocation and freeing of memory in functions below
// we define maximum values of certain variables.
//
// The maximum number of limbs the table in |ec_nistp_scalar_mul| can have.
// Each point in the table has 3 coordinates that are field elements,
// and each field element has a defined maximum number of limbs.
#define SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS \
                (SCALAR_MUL_TABLE_NUM_POINTS * 3 * FELEM_MAX_NUM_OF_LIMBS)

// The maximum number of bits for a scalar.
#define SCALAR_MUL_MAX_SCALAR_BITS (521)

// Maximum number of windows (digits) for a scalar encoding which is
// determined by the maximum scalar bit size -- 521 bits in our case,
// giving ceil(521 / 5) = 105 digits.
#define SCALAR_MUL_MAX_NUM_WINDOWS \
                DIV_AND_CEIL(SCALAR_MUL_MAX_SCALAR_BITS, SCALAR_MUL_WINDOW_SIZE)

// Fills |table| with the odd multiples of the point P = (x_in, y_in, z_in):
//  table[i] <-- [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
// Every entry occupies 3 * ctx->felem_num_limbs limbs, laid out as x, y, z.
static void generate_table(const ec_nistp_meth *ctx,
                           ec_nistp_felem_limb *table,
                           const ec_nistp_felem_limb *x_in,
                           const ec_nistp_felem_limb *y_in,
                           const ec_nistp_felem_limb *z_in) {
  const size_t limbs = ctx->felem_num_limbs;
  const size_t coord_bytes = limbs * sizeof(ec_nistp_felem_limb);
  const size_t point_limbs = 3 * limbs;

  // The first entry is P itself.
  OPENSSL_memcpy(table, x_in, coord_bytes);
  OPENSSL_memcpy(table + limbs, y_in, coord_bytes);
  OPENSSL_memcpy(table + 2 * limbs, z_in, coord_bytes);

  // 2P is the step between consecutive entries; it is computed from the copy
  // of P that now lives in the table.
  ec_nistp_felem x_2p, y_2p, z_2p;
  ctx->point_dbl(x_2p, y_2p, z_2p, table, table + limbs, table + 2 * limbs);

  // Each following entry is the previous one plus 2P.
  ec_nistp_felem_limb *prev = table;
  for (size_t i = 1; i < SCALAR_MUL_TABLE_NUM_POINTS; i++) {
    ec_nistp_felem_limb *cur = prev + point_limbs;
    ctx->point_add(cur, cur + limbs, cur + 2 * limbs,
                   prev, prev + limbs, prev + 2 * limbs,
                   0, x_2p, y_2p, z_2p);
    prev = cur;
  }
}

// Copies entry number |idx| of |table| into |out| using the constant-time
// table selection helpers. The table is taken to hold
// SCALAR_MUL_TABLE_NUM_POINTS entries; an entry is (x, y, z) when |projective|
// is non-zero and (x, y) otherwise.
static inline void select_point_from_table(const ec_nistp_meth *ctx,
                                           ec_nistp_felem_limb *out,
                                           const ec_nistp_felem_limb *table,
                                           const size_t idx,
                                           const size_t projective) {
  const size_t coords_per_point = (projective != 0) ? 3 : 2;
  const size_t limbs_per_point = coords_per_point * ctx->felem_num_limbs;

  // The ifdef branching below is temporary. Using only constant_..._table_8
  // would be best for simplicity, but unfortunately, on x86 systems it is
  // significantly slower than constant_..._table_w.
#if defined(EC_NISTP_USE_64BIT_LIMB) && defined(OPENSSL_64_BIT)
  // Word-wise selection: the entry size is given in limbs.
  constant_time_select_entry_from_table_w(out, (crypto_word_t*) table, idx,
          SCALAR_MUL_TABLE_NUM_POINTS, limbs_per_point);
#else
  // Byte-wise selection: the entry size is given in bytes.
  const size_t bytes_per_point = limbs_per_point * sizeof(ec_nistp_felem_limb);
  constant_time_select_entry_from_table_8((uint8_t*)out, (uint8_t*)table,
          idx, SCALAR_MUL_TABLE_NUM_POINTS, bytes_per_point);
#endif
}

// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
// The product is computed with the use of a small table generated on-the-fly
// and the scalar recoded in the regular-wNAF representation.
//
// P = (x_in, y_in, z_in) and the result (x_out, y_out, z_out) are in Jacobian
// coordinates; |ctx| supplies the curve-specific field and point operations
// and the sizes (felem_num_limbs, felem_num_bits).
//
// The precomputed (on-the-fly) table |table| holds odd multiples of P:
//     [2i + 1]P for i in [0, SCALAR_MUL_TABLE_NUM_POINTS - 1].
// Computing the negation of a point P = (x, y, z) is relatively easy:
//     -P = (x, -y, z),
// so we may assume that for each point we have its negative as well.
//
// The scalar is recoded (regular-wNAF encoding) into signed digits as explained
// in |scalar_rwnaf| function. Namely, for a window size |w| we have:
//     scalar' = s_0 + s_1*2^w + s_2*2^(2*w) + ... + s_{m-1}*2^((m-1)*w),
// where digits s_i are in [\pm 1, \pm 3, ..., \pm (2^w-1)] and
// m = ceil(scalar_bit_size / w). Note that for an odd scalar we have that
// scalar = scalar', while in the case of an even scalar we have that
// scalar = scalar' - 1.
//
// The required product, [scalar]P, is computed by the following algorithm.
//     1. Initialize the accumulator with the point from |table|
//        corresponding to the most significant digit s_{m-1} of the scalar.
//     2. For digits s_i starting from s_{m-2} down to s_0:
//     3.   Double the accumulator w times. (note that doubling a point [a]P
//          w times results in [2^w*a]P).
//     4.   Read from |table| the point corresponding to abs(s_i),
//          negate it if s_i is negative, and add it to the accumulator.
//     5. Subtract P from the result if the scalar is even.
//
// Note: this function is constant-time.
void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
                         ec_nistp_felem_limb *x_out,
                         ec_nistp_felem_limb *y_out,
                         ec_nistp_felem_limb *z_out,
                         const ec_nistp_felem_limb *x_in,
                         const ec_nistp_felem_limb *y_in,
                         const ec_nistp_felem_limb *z_in,
                         const EC_SCALAR *scalar) {
  // Make sure that the max table size is large enough.
  assert(SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS >=
         SCALAR_MUL_TABLE_NUM_POINTS * ctx->felem_num_limbs * 3);

  // Table of multiples of P = (x_in, y_in, z_in).
  ec_nistp_felem_limb table[SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS];
  generate_table(ctx, table, x_in, y_in, z_in);

  // Regular-wNAF encoding of the scalar.
  int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
  scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);

  // We need two point accumulators, so we define them of maximum size
  // to avoid allocation, and just take pointers to individual coordinates.
  // (This cruft will disappear when we refactor point_add/dbl to work with
  // whole points instead of individual coordinates).
  // The coordinates are packed back to back using the actual limb count, which
  // matches the layout of the entries written by |select_point_from_table|.
  ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS];
  ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS];
  ec_nistp_felem_limb *x_res = &res[0];
  ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
  ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
  ec_nistp_felem_limb *x_tmp = &tmp[0];
  ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
  ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];

  // The actual number of windows (digits) of the scalar (denoted by m in the
  // description above the function).
  const size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);

  // Step 1. Initialize the accumulator (res) with the input point multiplied by
  // the most significant digit of the scalar s_{m-1} (note that this digit
  // can't be negative).
  // An odd digit d is stored at table index (d - 1) / 2, which equals d >> 1.
  int16_t idx = rwnaf[num_windows - 1];
  idx >>= 1;
  select_point_from_table(ctx, res, table, idx, 1);

  // Step 2. Process the remaining digits of the scalar (s_{m-2} to s_0).
  for (int i = num_windows - 2; i >= 0; i--) {
    // Step 3. Double the accumulator w times.
    for (size_t j = 0; j < SCALAR_MUL_WINDOW_SIZE; j++) {
      ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
    }

    // Step 4a. Compute abs(s_i).
    int16_t d = rwnaf[i];
    int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
    d = (d ^ -is_neg) + is_neg;     // d = abs(d)

    // Step 4b. Select from table the point corresponding to abs(s_i).
    idx = d >> 1;
    select_point_from_table(ctx, tmp, table, idx, 1);

    // Step 4c. Negate the point if s_i < 0.
    // The negated y coordinate is always computed and then selected with
    // |cmovznz|, so no branch depends on the sign of the digit.
    ec_nistp_felem ftmp;
    ctx->felem_neg(ftmp, y_tmp);

    cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);

    // Step 4d. Add the point to the accumulator.
    ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 0, x_tmp, y_tmp, z_tmp);
  }

  // Step 5a. Negate the input point P (we negate it in-place since we already
  // have it stored as the first entry in the table).
  ec_nistp_felem_limb *x_mp = &table[0];
  ec_nistp_felem_limb *y_mp = &table[ctx->felem_num_limbs];
  ec_nistp_felem_limb *z_mp = &table[ctx->felem_num_limbs * 2];
  ctx->felem_neg(y_mp, y_mp);

  // Step 5b. Subtract P from the accumulator.
  // This is done unconditionally; the result is only used for even scalars.
  ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 0, x_mp, y_mp, z_mp);

  // Step 5c. Select |res| or |res - P| based on parity of the scalar.
  // t == 0 (even scalar) picks |res - P|, t == 1 (odd scalar) picks |res|.
  ec_nistp_felem_limb t = scalar->words[0] & 1;
  cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
  cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
  cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
}

// Multiplication of the base point G of the curve with the given scalar.
// The product is computed with the Comb method using a precomputed table
// and the regular-wNAF scalar encoding.
//
// The precomputed table is read from |ctx->scalar_mul_base_table| and holds
// affine points (x, y); the result (x_out, y_out, z_out) is in Jacobian
// coordinates.
//
// While the algorithm is generic and works for different curves, window sizes,
// and scalar sizes, for clarity, we describe it by using the example of P-521.
//
// The precomputed table has 27 sub-tables each holding 16 points:
//
//      0 :       [1]G,       [3]G,  ...,       [31]G
//      1 :  [1*2^20]G,  [3*2^20]G,  ...,  [31*2^20]G
//                         ...
//      i : [1*2^20i]G, [3*2^20i]G,  ..., [31*2^20i]G
//                         ...
//     26 :   [2^520]G, [3*2^520]G,  ..., [31*2^520]G
// Computing the negation of a point P = (x, y) is relatively easy:
//     -P = (x, -y).
// So we may assume that for each sub-table we have 32 points instead of 16:
//     [\pm 1*2^20i]G, [\pm 3*2^20i]G, ..., [\pm 31*2^20i]G.
//
// The 521-bit |scalar| is recoded (regular-wNAF encoding) into 105 signed
// digits, each of length 5 bits, as explained in the |scalar_rwnaf| function.
// Namely,
//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_104*2^520,
// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
// scalar we have that scalar = scalar', while in the case of an even
// scalar we have that scalar = scalar' - 1.
//
// To compute the required product, [scalar]G, we may do the following.
// Group the recoded digits of the scalar in 4 groups:
//                                            |   corresponding multiples in
//                  digits                    |   the recoded representation
//   -------------------------------------------------------------------------
//   (0): {s_0, s_4,  s_8, ..., s_100, s_104} |  { 2^0, 2^20, ..., 2^500, 2^520}
//   (1): {s_1, s_5,  s_9, ..., s_101}        |  { 2^5, 2^25, ..., 2^505}
//   (2): {s_2, s_6, s_10, ..., s_102}        |  {2^10, 2^30, ..., 2^510}
//   (3): {s_3, s_7, s_11, ..., s_103}        |  {2^15, 2^35, ..., 2^515}
//        corresponding sub-table lookup      |  {  T0,   T1, ...,   T25,   T26}
//
// The group (0) digits correspond precisely to the multiples of G that are
// held in the 27 precomputed sub-tables, so we may simply read the appropriate
// points from the sub-tables and sum them all up (negating if needed, i.e., if
// a digit s_i is negative, we read the point corresponding to the abs(s_i) and
// negate it before adding it to the sum).
// The remaining three groups (1), (2), and (3), correspond to the multiples
// of G from the sub-tables multiplied additionally by 2^5, 2^10, and 2^15,
// respectively. Therefore, for these groups we may read the appropriate points
// from the table, double them 5, 10, or 15 times, respectively, and add them
// to the final result.
//
// To minimize the number of required doubling operations we process the digits
// of the scalar from left to right. In other words, the algorithm is:
//   1. For group (i) in this order (3, 2, 1, 0):
//   2.   Double the accumulator 5 times except in the first iteration.
//   3.   Read the points corresponding to the group (i) digits from the tables
//        and add them to an accumulator.
//   4. If the scalar is even subtract G from the accumulator.
//
// Note: this function is designed to be constant-time.
void ec_nistp_scalar_mul_base(const ec_nistp_meth *ctx,
                              ec_nistp_felem_limb *x_out,
                              ec_nistp_felem_limb *y_out,
                              ec_nistp_felem_limb *z_out,
                              const EC_SCALAR *scalar) {
  // Regular-wNAF encoding of the scalar.
  int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
  scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);
  size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);

  // We need two point accumulators, so we define them of maximum size
  // to avoid allocation, and just take pointers to individual coordinates.
  // (This cruft will disappear when we refactor point_add/dbl to work with
  // whole points instead of individual coordinates).
  // |res| starts out all-zero, i.e. with z = 0, which |ec_nistp_point_add|
  // treats as the point at infinity.
  ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
  ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
  ec_nistp_felem_limb *x_res = &res[0];
  ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
  ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
  ec_nistp_felem_limb *x_tmp = &tmp[0];
  ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
  ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];

  // Process the 4 groups of digits starting from group (3) down to group (0).
  for (int i = 3; i >= 0; i--) {
    // Double |res| 5 times in each iteration, except in the first one.
    for (size_t j = 0; i != 3 && j < SCALAR_MUL_WINDOW_SIZE; j++) {
      ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
    }

    // Process the digits in the current group from the most to the least
    // significant one.
    // |start_idx| is the largest digit index below |num_windows| that is
    // congruent to i modulo 4.
    size_t start_idx = ((num_windows - i - 1) / 4) * 4 + i;

    for (int j = start_idx; j >= 0; j -= 4) {
      // For each digit |d| in the current group read the corresponding point
      // from the table and add it to |res|. If |d| is negative, negate
      // the point before adding it to |res|.
      int16_t d = rwnaf[j];
      int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
      d = (d ^ -is_neg) + is_neg;     // d = abs(d)

      // An odd digit d is stored at index (d - 1) / 2, which equals d >> 1.
      int16_t idx = d >> 1;

      // Select the point to add, in constant time.
      // Digit j uses sub-table number j / 4.
      size_t point_num_limbs = 2 * ctx->felem_num_limbs;  // Affine points.
      size_t subtable_num_limbs = SCALAR_MUL_TABLE_NUM_POINTS * point_num_limbs;
      size_t table_idx = (j / 4) * subtable_num_limbs;
      const ec_nistp_felem_limb *table = &ctx->scalar_mul_base_table[table_idx];
      select_point_from_table(ctx, tmp, table, idx, 0);

      // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
      ec_nistp_felem ftmp;
      ctx->felem_neg(ftmp, y_tmp);

      // Keep y for a non-negative digit and take -y for a negative one.
      cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);

      // Add the point to the accumulator |res|.
      // This is a mixed addition: the table point is affine, so |felem_one| is
      // passed as its z coordinate.
      ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 1,
                     x_tmp, y_tmp, ctx->felem_one);
    }
  }

  // Conditionally subtract G if the scalar is even, in constant-time.
  // G is the first entry of the first sub-table; |res - G| is always computed
  // and the parity of the scalar only decides which value is copied out.
  const ec_nistp_felem_limb *x_mp = &ctx->scalar_mul_base_table[0];
  const ec_nistp_felem_limb *y_mp = &ctx->scalar_mul_base_table[ctx->felem_num_limbs];
  ec_nistp_felem ftmp;
  ctx->felem_neg(ftmp, y_mp);

  // Subtract G from the accumulator.
  ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 1, x_mp, ftmp, ctx->felem_one);

  // Select |res| or |res - G| based on parity of the scalar.
  // t == 0 (even scalar) picks |res - G|, t == 1 (odd scalar) picks |res|.
  ec_nistp_felem_limb t = scalar->words[0] & 1;
  cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
  cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
  cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
}

// Computes [g_scalar]G + [p_scalar]P, where G is the base point of the curve,
// and P is the given point (x_p, y_p, z_p) in Jacobian coordinates. The result
// is written to (x_out, y_out, z_out), also in Jacobian coordinates.
//
// Both scalar products are computed by the same "textbook" wNAF method,
// with w = 5 for g_scalar and w = 5 for p_scalar.
// For the base point G product we use the first sub-table of the precomputed
// table, while for P we generate the table on-the-fly. The tables hold the
// first 16 odd multiples of G or P:
//     g_pre_comp = {[1]G, [3]G, ..., [31]G},
//     p_pre_comp = {[1]P, [3]P, ..., [31]P}.
// Computing the negation of a point P = (x, y) is relatively easy:
//     -P = (x, -y).
// So we may assume that we also have the negatives of the points in the tables.
//
// The scalars are recoded by the textbook wNAF method to digits, where a digit
// is either a zero or an odd integer in [-31, 31]. The method guarantees that
// each non-zero digit is followed by at least four zeroes.
//
// The result [g_scalar]G + [p_scalar]P is computed by the following algorithm:
//     1. Initialize the accumulator with the point-at-infinity.
//     2. For i starting from the scalar bit size (521 for P-521) down to 0:
//     3.   Double the accumulator (doubling can be skipped while the
//          accumulator is equal to the point-at-infinity).
//     4.   Read from |p_pre_comp| the point corresponding to the i-th digit of
//          p_scalar, negate it if the digit is negative, and add it to the
//          accumulator.
//     5.   Read from |g_pre_comp| the point corresponding to the i-th digit of
//          g_scalar, negate it if the digit is negative, and add it to the
//          accumulator.
//
// The accumulator is kept directly in the output buffers and a flag records
// whether it is still the point-at-infinity. If no non-zero digit is ever
// encountered (both scalars are zero) the outputs are set to all-zero limbs,
// i.e. a point with z = 0, so the caller never sees unwritten memory.
//
// Note: this function is NOT constant-time.
void ec_nistp_scalar_mul_public(const ec_nistp_meth *ctx,
                                ec_nistp_felem_limb *x_out,
                                ec_nistp_felem_limb *y_out,
                                ec_nistp_felem_limb *z_out,
                                const EC_SCALAR *g_scalar,
                                const ec_nistp_felem_limb *x_p,
                                const ec_nistp_felem_limb *y_p,
                                const ec_nistp_felem_limb *z_p,
                                const EC_SCALAR *p_scalar) {

  const size_t felem_num_bytes = ctx->felem_num_limbs * sizeof(ec_nistp_felem_limb);

  // Table of multiples of P.
  ec_nistp_felem_limb p_table[SCALAR_MUL_TABLE_MAX_NUM_FELEM_LIMBS];
  generate_table(ctx, p_table, x_p, y_p, z_p);
  const size_t p_point_num_limbs = 3 * ctx->felem_num_limbs; // Projective.

  // Table of multiples of G.
  const ec_nistp_felem_limb *g_table = ctx->scalar_mul_base_table;
  const size_t g_point_num_limbs = 2 * ctx->felem_num_limbs; // Affine.

  // Recode the scalars.
  int8_t p_wnaf[SCALAR_MUL_MAX_SCALAR_BITS + 1] = {0};
  int8_t g_wnaf[SCALAR_MUL_MAX_SCALAR_BITS + 1] = {0};
  ec_compute_wNAF(p_wnaf, p_scalar, ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
  ec_compute_wNAF(g_wnaf, g_scalar, ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);

  // In the beginning res is set to point-at-infinity, so we set the flag.
  int16_t res_is_inf = 1;
  int16_t d, is_neg, idx;
  ec_nistp_felem ftmp;

  for (int i = ctx->felem_num_bits; i >= 0; i--) {

    // If |res| is point-at-infinity there is no point in doubling so skip it.
    if (!res_is_inf) {
      ctx->point_dbl(x_out, y_out, z_out, x_out, y_out, z_out);
    }

    // Process the p_scalar digit.
    d = p_wnaf[i];
    if (d != 0) {
      // An odd digit of absolute value a is stored at table index (a - 1) / 2.
      is_neg = d < 0 ? 1 : 0;
      idx = (is_neg) ? (-d - 1) >> 1 : (d - 1) >> 1;

      if (res_is_inf) {
        // If |res| is point-at-infinity there is no need to add the new point,
        // we can simply copy it.
        const size_t table_idx = idx * p_point_num_limbs;
        OPENSSL_memcpy(x_out, &p_table[table_idx], felem_num_bytes);
        OPENSSL_memcpy(y_out, &p_table[table_idx + ctx->felem_num_limbs], felem_num_bytes);
        OPENSSL_memcpy(z_out, &p_table[table_idx + ctx->felem_num_limbs * 2], felem_num_bytes);
        res_is_inf = 0;
      } else {
        // Otherwise, add to the accumulator either the point at position idx
        // in the table or its negation.
        const ec_nistp_felem_limb *y_tmp;
        y_tmp = &p_table[idx * p_point_num_limbs + ctx->felem_num_limbs];
        if (is_neg) {
          ctx->felem_neg(ftmp, y_tmp);
          y_tmp = ftmp;
        }
        ctx->point_add(x_out, y_out, z_out, x_out, y_out, z_out, 0,
                       &p_table[idx * p_point_num_limbs],
                       y_tmp,
                       &p_table[idx * p_point_num_limbs + ctx->felem_num_limbs * 2]);
      }
    }

    // Process the g_scalar digit.
    d = g_wnaf[i];
    if (d != 0) {
      is_neg = d < 0 ? 1 : 0;
      idx = (is_neg) ? (-d - 1) >> 1 : (d - 1) >> 1;

      if (res_is_inf) {
        // If |res| is point-at-infinity there is no need to add the new point,
        // we can simply copy it. The table point is affine, so z is set to one.
        const size_t table_idx = idx * g_point_num_limbs;
        OPENSSL_memcpy(x_out, &g_table[table_idx], felem_num_bytes);
        OPENSSL_memcpy(y_out, &g_table[table_idx + ctx->felem_num_limbs], felem_num_bytes);
        OPENSSL_memcpy(z_out, ctx->felem_one, felem_num_bytes);
        res_is_inf = 0;
      } else {
        // Otherwise, add to the accumulator either the point at position idx
        // in the table or its negation (mixed addition with z = one).
        const ec_nistp_felem_limb *y_tmp;
        y_tmp = &g_table[idx * g_point_num_limbs + ctx->felem_num_limbs];
        if (is_neg) {
          ctx->felem_neg(ftmp, y_tmp);
          y_tmp = ftmp;
        }
        ctx->point_add(x_out, y_out, z_out, x_out, y_out, z_out, 1,
                       &g_table[idx * g_point_num_limbs], y_tmp, ctx->felem_one);
      }
    }
  }

  // No non-zero digit was seen, so nothing has been written to the outputs
  // yet. Return the point-at-infinity explicitly (all limbs zero, hence
  // z = 0) instead of leaving the output buffers as the caller passed them.
  if (res_is_inf) {
    for (size_t k = 0; k < ctx->felem_num_limbs; k++) {
      x_out[k] = 0;
      y_out[k] = 0;
      z_out[k] = 0;
    }
  }
}

// Splits a point stored as one contiguous buffer |xyz_in| (x, then y, then z,
// each |num_limbs_per_coord| limbs long) into three separate coordinate
// buffers |x_out|, |y_out| and |z_out|.
void ec_nistp_point_to_coordinates(ec_nistp_felem_limb *x_out,
                                   ec_nistp_felem_limb *y_out,
                                   ec_nistp_felem_limb *z_out,
                                   const ec_nistp_felem_limb *xyz_in,
                                   size_t num_limbs_per_coord) {
  const size_t coord_bytes = sizeof(ec_nistp_felem_limb) * num_limbs_per_coord;
  const ec_nistp_felem_limb *src = xyz_in;

  OPENSSL_memcpy(x_out, src, coord_bytes);
  src += num_limbs_per_coord;
  OPENSSL_memcpy(y_out, src, coord_bytes);
  src += num_limbs_per_coord;
  OPENSSL_memcpy(z_out, src, coord_bytes);
}

// Packs the three coordinate buffers |x_in|, |y_in| and |z_in| (each
// |num_limbs_per_coord| limbs long) into one contiguous buffer |xyz_out|,
// in the order x, y, z.
void ec_nistp_coordinates_to_point(ec_nistp_felem_limb *xyz_out,
                                   const ec_nistp_felem_limb *x_in,
                                   const ec_nistp_felem_limb *y_in,
                                   const ec_nistp_felem_limb *z_in,
                                   size_t num_limbs_per_coord) {
  const size_t coord_bytes = sizeof(ec_nistp_felem_limb) * num_limbs_per_coord;
  ec_nistp_felem_limb *dst = xyz_out;

  OPENSSL_memcpy(dst, x_in, coord_bytes);
  dst += num_limbs_per_coord;
  OPENSSL_memcpy(dst, y_in, coord_bytes);
  dst += num_limbs_per_coord;
  OPENSSL_memcpy(dst, z_in, coord_bytes);
}