cli/vendor/aws-lc-sys/aws-lc/crypto/fipsmodule/ec/p384.c

// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC

#include <openssl/bn.h>
#include <openssl/ec.h>
#include <openssl/err.h>
#include <openssl/mem.h>

#include "../bn/internal.h"
#include "../cpucap/internal.h"
#include "../delocate.h"
#include "internal.h"
#include "ec_nistp.h"

#if !defined(OPENSSL_SMALL)

#if defined(EC_NISTP_USE_S2N_BIGNUM)
#  include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#else
#  if defined(EC_NISTP_USE_64BIT_LIMB)
#    include "../../../third_party/fiat/p384_64.h"
#  else
#    include "../../../third_party/fiat/p384_32.h"
#  endif
#endif

#if defined(EC_NISTP_USE_64BIT_LIMB)

// 64-bit limb configuration: a field element is an array of 6 limbs of
// 64 bits each (6 * 64 = 384 bits).
#define P384_NLIMBS (6)
typedef uint64_t p384_limb_t;
typedef uint64_t p384_felem[P384_NLIMBS];
// Read with the least significant limb first, these limbs spell out
// 2^128 + 2^96 - 2^32 + 1, which equals 2^384 mod p.
// NOTE(review): presumably this is the field element 1 in Montgomery form
// with R = 2^384 -- confirm against the field arithmetic backend.
static const p384_felem p384_felem_one = {
    0xffffffff00000001, 0xffffffff, 0x1, 0x0, 0x0, 0x0};

#else  // 64BIT; else 32BIT

// 32-bit limb configuration: a field element is an array of 12 limbs of
// 32 bits each (12 * 32 = 384 bits).
#define P384_NLIMBS (12)
typedef uint32_t p384_limb_t;
typedef uint32_t p384_felem[P384_NLIMBS];
// Same value as in the 64-bit configuration (2^128 + 2^96 - 2^32 + 1 when read
// with the least significant limb first), split into 32-bit limbs.
static const p384_felem p384_felem_one = {
    0x1, 0xffffffff, 0xffffffff, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};

#endif  // 64BIT

#if defined(EC_NISTP_USE_S2N_BIGNUM)

// s2n-bignum implementation of field arithmetic.
// Each p384_felem_* name used by the rest of this file is mapped onto the
// corresponding s2n-bignum routine; |out| receives the result.
#define p384_felem_add(out, in0, in1)   bignum_add_p384(out, in0, in1)
#define p384_felem_sub(out, in0, in1)   bignum_sub_p384(out, in0, in1)
#define p384_felem_opp(out, in0)        bignum_neg_p384(out, in0)
#define p384_felem_to_bytes(out, in0)   bignum_tolebytes_6(out, in0)
#define p384_felem_from_bytes(out, in0) bignum_fromlebytes_6(out, in0)
// The *_selector variants are declared outside this file.
// NOTE(review): presumably they choose between alternative implementations of
// the same operation at run time -- confirm in s2n-bignum_aws-lc.h.
#define p384_felem_to_mont(out, in0)    bignum_tomont_p384_selector(out, in0)
#define p384_felem_from_mont(out, in0)  bignum_deamont_p384_selector(out, in0)
#define p384_felem_mul(out, in0, in1)   bignum_montmul_p384_selector(out, in0, in1)
#define p384_felem_sqr(out, in0)        bignum_montsqr_p384_selector(out, in0)

// p384_felem_nz reports whether the field element |in1| is nonzero by
// returning the result of s2n-bignum's bignum_nonzero_6() on its limbs.
static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
  const p384_limb_t nonzero = bignum_nonzero_6(in1);
  return nonzero;
}

#else // EC_NISTP_USE_S2N_BIGNUM

// Fiat-crypto implementation of field arithmetic.
// Each p384_felem_* name used by the rest of this file is mapped onto the
// corresponding fiat_p384_* routine; |out| receives the result.
#define p384_felem_add(out, in0, in1)   fiat_p384_add(out, in0, in1)
#define p384_felem_sub(out, in0, in1)   fiat_p384_sub(out, in0, in1)
#define p384_felem_opp(out, in0)        fiat_p384_opp(out, in0)
#define p384_felem_mul(out, in0, in1)   fiat_p384_mul(out, in0, in1)
#define p384_felem_sqr(out, in0)        fiat_p384_square(out, in0)
#define p384_felem_to_mont(out, in0)    fiat_p384_to_montgomery(out, in0)
#define p384_felem_from_mont(out, in0)  fiat_p384_from_montgomery(out, in0)
#define p384_felem_to_bytes(out, in0)   fiat_p384_to_bytes(out, in0)
#define p384_felem_from_bytes(out, in0) fiat_p384_from_bytes(out, in0)

// p384_felem_nz reports whether the field element |in1| is nonzero. The
// value comes from fiat_p384_nonzero(), which delivers its answer through an
// output pointer; this adapter turns that into a return value.
static p384_limb_t p384_felem_nz(const p384_limb_t in1[P384_NLIMBS]) {
  p384_limb_t nz;
  fiat_p384_nonzero(&nz, in1);
  return nz;
}

#endif // EC_NISTP_USE_S2N_BIGNUM

// The wrapper functions are needed for FIPS static build.
// Otherwise, initializing ec_nistp_meth with pointers to s2n-bignum
// functions directly generates :got: references that are also thought
// to be local_target by the delocator.
//
// p384_felem_add_wrapper stores in |c| the result of the backend field
// addition p384_felem_add applied to |a| and |b|.
static inline void p384_felem_add_wrapper(ec_nistp_felem_limb *c,
                                          const ec_nistp_felem_limb *a,
                                          const ec_nistp_felem_limb *b) {
  p384_felem_add(c, a, b);
}

// p384_felem_sub_wrapper stores in |c| the result of the backend field
// subtraction p384_felem_sub applied to |a| and |b|. It exists so that a
// function defined in this file can be used as a method-table pointer.
static inline void p384_felem_sub_wrapper(ec_nistp_felem_limb *c,
                                          const ec_nistp_felem_limb *a,
                                          const ec_nistp_felem_limb *b) {
  p384_felem_sub(c, a, b);
}

// p384_felem_neg_wrapper stores in |c| the result of the backend field
// negation p384_felem_opp applied to |a|. It exists so that a function
// defined in this file can be used as a method-table pointer.
static inline void p384_felem_neg_wrapper(ec_nistp_felem_limb *c,
                                          const ec_nistp_felem_limb *a) {
  p384_felem_opp(c, a);
}

// p384_from_generic loads the words of the generic field element |in| into
// the internal limb representation |out|, going through a little-endian byte
// string.
static void p384_from_generic(p384_felem out, const EC_FELEM *in) {
#ifdef OPENSSL_BIG_ENDIAN
  // Serialize the words into a little-endian scratch buffer first.
  uint8_t le_scratch[P384_EC_FELEM_BYTES];
  bn_words_to_little_endian(le_scratch, P384_EC_FELEM_BYTES, in->words,
                            P384_EC_FELEM_WORDS);
  const uint8_t *le_bytes = le_scratch;
#else
  // The in-memory word array is read directly as the byte string.
  const uint8_t *le_bytes = (const uint8_t *)in->words;
#endif
  p384_felem_from_bytes(out, le_bytes);
}

// p384_to_generic stores the internal field element |in| into the word array
// of the generic field element |out|, going through a little-endian byte
// string.
static void p384_to_generic(EC_FELEM *out, const p384_felem in) {
  // 384 bits fill a whole number of |BN_ULONG|s, so writing 384 / 8 bytes
  // covers every word and leaves no trailing bytes that would need zeroing.
  OPENSSL_STATIC_ASSERT(
      384 / 8 == sizeof(BN_ULONG) * ((384 + BN_BITS2 - 1) / BN_BITS2),
      p384_felem_to_bytes_leaves_bytes_uninitialized);
#ifdef OPENSSL_BIG_ENDIAN
  // Serialize into a scratch buffer, then rebuild the words from it.
  uint8_t le_scratch[P384_EC_FELEM_BYTES];
  p384_felem_to_bytes(le_scratch, in);
  bn_little_endian_to_words(out->words, P384_EC_FELEM_WORDS, le_scratch,
                            P384_EC_FELEM_BYTES);
#else
  // Serialize straight into the memory of the word array.
  uint8_t *word_bytes = (uint8_t *)out->words;
  p384_felem_to_bytes(word_bytes, in);
#endif
}

// p384_from_scalar loads the words of the scalar |in| into the internal limb
// representation |out|, going through a little-endian byte string. It mirrors
// p384_from_generic but takes an EC_SCALAR.
static void p384_from_scalar(p384_felem out, const EC_SCALAR *in) {
#ifdef OPENSSL_BIG_ENDIAN
  // Serialize the words into a little-endian scratch buffer first.
  uint8_t le_scratch[P384_EC_FELEM_BYTES];
  bn_words_to_little_endian(le_scratch, P384_EC_FELEM_BYTES, in->words,
                            P384_EC_FELEM_WORDS);
  const uint8_t *le_bytes = le_scratch;
#else
  // The in-memory word array is read directly as the byte string.
  const uint8_t *le_bytes = (const uint8_t *)in->words;
#endif
  p384_felem_from_bytes(out, le_bytes);
}

#if !defined(EC_NISTP_USE_S2N_BIGNUM)
// p384_felem_sqr_repeat sets |out| = |in|^(2^count) by squaring |count| times
// in a row. |count| must be at least 1. |out| may be the same array as |in|.
static void p384_felem_sqr_repeat(p384_felem out, const p384_felem in,
                                  int count) {
  p384_felem_sqr(out, in);
  for (int i = 1; i < count; i++) {
    p384_felem_sqr(out, out);
  }
}
#endif

// p384_inv_square calculates |out| = |in|^{-2}
//
// Based on Fermat's Little Theorem:
//   a^p = a (mod p)
//   a^{p-1} = 1 (mod p)
//   a^{p-3} = a^{-2} (mod p)
// p = 2^384 - 2^128 - 2^96 + 2^32 - 1
// Hexadecimal representation of p − 3:
// p-3 = ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe
//       ffffffff 00000000 00000000 fffffffc
static void p384_inv_square(p384_felem out,
                            const p384_felem in) {
#if defined(EC_NISTP_USE_S2N_BIGNUM)
  // Square first, then hand the square to the s2n-bignum inversion routine.
  ec_nistp_felem_limb squared[P384_NLIMBS];
  p384_felem_sqr(squared, in);
  bignum_montinv_p384(out, squared);
#else
  // Addition chain from
  // https://briansmith.org/ecc-inversion-addition-chains-01#p384_field_inversion
  //
  // Naming: xN holds |in| raised to 2^N - 1, i.e. an exponent made of N one
  // bits. Each squaring doubles the exponent (shifts it left by one bit) and
  // each multiplication adds the exponent of the multiplier. The side comments
  // give the exponent reached after each step.
  p384_felem x2, x3, x6, x12, x15, x30, x60, x120;

  p384_felem_sqr_repeat(x2, in, 1);         // 2^2 - 2^1
  p384_felem_mul(x2, x2, in);               // 2^2 - 2^0

  p384_felem_sqr_repeat(x3, x2, 1);         // 2^3 - 2^1
  p384_felem_mul(x3, x3, in);               // 2^3 - 2^0

  p384_felem_sqr_repeat(x6, x3, 3);         // 2^6 - 2^3
  p384_felem_mul(x6, x6, x3);               // 2^6 - 2^0

  p384_felem_sqr_repeat(x12, x6, 6);        // 2^12 - 2^6
  p384_felem_mul(x12, x12, x6);             // 2^12 - 2^0

  p384_felem_sqr_repeat(x15, x12, 3);       // 2^15 - 2^3
  p384_felem_mul(x15, x15, x3);             // 2^15 - 2^0

  p384_felem_sqr_repeat(x30, x15, 15);      // 2^30 - 2^15
  p384_felem_mul(x30, x30, x15);            // 2^30 - 2^0

  p384_felem_sqr_repeat(x60, x30, 30);      // 2^60 - 2^30
  p384_felem_mul(x60, x60, x30);            // 2^60 - 2^0

  p384_felem_sqr_repeat(x120, x60, 60);     // 2^120 - 2^60
  p384_felem_mul(x120, x120, x60);          // 2^120 - 2^0

  p384_felem acc;
  p384_felem_sqr_repeat(acc, x120, 120);    // 2^240 - 2^120
  p384_felem_mul(acc, acc, x120);           // 2^240 - 2^0

  p384_felem_sqr_repeat(acc, acc, 15);      // 2^255 - 2^15
  p384_felem_mul(acc, acc, x15);            // 2^255 - 2^0

  // The shift is written (1 + 30), as in the reference above: 31 bits are
  // shifted in but only 30 ones (x30) are added, which leaves a single 0 bit
  // between the 255 leading ones and the next 30 ones:
  //   ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff fffffffe ffffffff
  // (the last two 1 bits of that pattern are appended by the next step).
  p384_felem_sqr_repeat(acc, acc, 1 + 30);  // 2^286 - 2^31
  p384_felem_mul(acc, acc, x30);            // 2^286 - 2^30 - 2^0

  p384_felem_sqr_repeat(acc, acc, 2);       // 2^288 - 2^32 - 2^2
  p384_felem_mul(acc, acc, x2);             // 2^288 - 2^32 - 2^0

  // Likewise (64 + 30) rather than 94: of the 94 bits shifted in, the upper 64
  // stay 0 and the lower 30 are set by adding x30, giving the tail
  //   00000000 00000000 fffffffc
  // (its two trailing 0 bits come from the final two squarings).
  p384_felem_sqr_repeat(acc, acc, 64 + 30);  // 2^382 - 2^126 - 2^94
  p384_felem_mul(acc, acc, x30);  // 2^382 - 2^126 - 2^94 + 2^30 - 2^0

  p384_felem_sqr(acc, acc);
  p384_felem_sqr(out, acc);  // 2^384 - 2^128 - 2^96 + 2^32 - 2^2 = p - 3
#endif
}

// p384_point_double doubles the point with coordinates (x_in, y_in, z_in) and
// writes the coordinates of the result to (x_out, y_out, z_out).
static void p384_point_double(p384_felem x_out,
                              p384_felem y_out,
                              p384_felem z_out,
                              const p384_felem x_in,
                              const p384_felem y_in,
                              const p384_felem z_in) {
#if defined(EC_NISTP_USE_S2N_BIGNUM)
  // The s2n-bignum routine works on one contiguous buffer of three field
  // elements per point, so pack the input, double it, and unpack the result.
  ec_nistp_felem_limb packed_in[P384_NLIMBS * 3];
  ec_nistp_felem_limb packed_out[P384_NLIMBS * 3];

  ec_nistp_coordinates_to_point(packed_in, x_in, y_in, z_in, P384_NLIMBS);
  p384_montjdouble_selector(packed_out, packed_in);
  ec_nistp_point_to_coordinates(x_out, y_out, z_out, packed_out, P384_NLIMBS);
#else
  // Otherwise use the shared doubling routine driven by the P-384 method
  // table.
  ec_nistp_point_double(p384_methods(), x_out, y_out, z_out,
                        x_in, y_in, z_in);
#endif
}

// p384_point_add calculates (x1, y1, z1) + (x2, y2, z2)
// and writes the result to (x3, y3, z3). The computation is delegated to the
// shared ec_nistp_point_add() routine, called with the P-384 method table.
//
// The method is taken from:
//   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl
// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
//
// |mixed| is forwarded unchanged to ec_nistp_point_add().
// NOTE(review): presumably a nonzero |mixed| selects the mixed-addition case
// described above and 0 means both inputs are Jacobian -- confirm in
// ec_nistp_point_add().
//
// Coq transcription and correctness proof:
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L467>
// <https://github.com/davidben/fiat-crypto/blob/c7b95f62b2a54b559522573310e9b487327d219a/src/Curves/Weierstrass/Jacobian.v#L544>
static void p384_point_add(p384_felem x3, p384_felem y3, p384_felem z3,
                           const p384_felem x1,
                           const p384_felem y1,
                           const p384_felem z1,
                           const int mixed,
                           const p384_felem x2,
                           const p384_felem y2,
                           const p384_felem z2) {
  ec_nistp_point_add(p384_methods(), x3, y3, z3, x1, y1, z1, mixed, x2, y2, z2);
}

#include "p384_table.h"

// p384_methods fills in the ec_nistp_meth table for P-384. The table is what
// this file passes to the shared ec_nistp_* routines (point add/double and the
// scalar multiplications): it carries the limb count and bit size of a field
// element, the field operations, the constant |p384_felem_one|, the point
// operations defined above and the precomputed table |p384_g_pre_comp| from
// p384_table.h.
#if defined(EC_NISTP_USE_S2N_BIGNUM)
// s2n-bignum variant: add/sub/neg go through the local wrapper functions,
// while mul/sqr point at the s2n-bignum selector functions.
DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
    out->felem_num_limbs = P384_NLIMBS;
    out->felem_num_bits = 384;
    out->felem_add = p384_felem_add_wrapper;
    out->felem_sub = p384_felem_sub_wrapper;
    out->felem_mul = bignum_montmul_p384_selector;
    out->felem_sqr = bignum_montsqr_p384_selector;
    out->felem_neg = p384_felem_neg_wrapper;
    out->felem_nz  = p384_felem_nz;
    out->felem_one = p384_felem_one;
    out->point_dbl = p384_point_double;
    out->point_add = p384_point_add;
    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
}
#else
// Fiat-crypto variant: the field operations point directly at the fiat_p384_*
// functions.
DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
    out->felem_num_limbs = P384_NLIMBS;
    out->felem_num_bits = 384;
    out->felem_add = fiat_p384_add;
    out->felem_sub = fiat_p384_sub;
    out->felem_mul = fiat_p384_mul;
    out->felem_sqr = fiat_p384_square;
    out->felem_neg = fiat_p384_opp;
    out->felem_nz  = p384_felem_nz;
    out->felem_one = p384_felem_one;
    out->point_dbl = p384_point_double;
    out->point_add = p384_point_add;
    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
}
#endif

// OPENSSL EC_METHOD FUNCTIONS

// Converts the Jacobian coordinates (X, Y, Z) of |point| to affine
// coordinates:
//   (X', Y') = (X/Z^2, Y/Z^3).
// Either of |x_out| and |y_out| may be NULL, in which case that coordinate is
// not computed. Returns 1 on success; returns 0 and queues
// EC_R_POINT_AT_INFINITY when |point| is the point at infinity.
static int ec_GFp_nistp384_point_get_affine_coordinates(
    const EC_GROUP *group, const EC_JACOBIAN *point,
    EC_FELEM *x_out, EC_FELEM *y_out) {
  if (constant_time_declassify_w(ec_GFp_simple_is_at_infinity(group, point))) {
    OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY);
    return 0;
  }

  // |z_inv_pow| starts out as Z^-2 and is later squared into Z^-4.
  p384_felem z, z_inv_pow;
  p384_from_generic(z, &point->Z);
  p384_inv_square(z_inv_pow, z);

  if (x_out != NULL) {
    p384_felem x_affine;
    p384_from_generic(x_affine, &point->X);
    p384_felem_mul(x_affine, x_affine, z_inv_pow);  // X * Z^-2
    p384_to_generic(x_out, x_affine);
  }

  if (y_out == NULL) {
    return 1;
  }

  // Y/Z^3 is obtained as Y * Z * Z^-4, which needs only the inverse square
  // computed above.
  p384_felem y_affine;
  p384_from_generic(y_affine, &point->Y);
  p384_felem_sqr(z_inv_pow, z_inv_pow);           // Z^-4
  p384_felem_mul(y_affine, y_affine, z);          // Y * Z
  p384_felem_mul(y_affine, y_affine, z_inv_pow);  // Y * Z^-3
  p384_to_generic(y_out, y_affine);

  return 1;
}

// Sets |r| = |a| + |b|. The operands are converted to the internal limb
// representation, added with p384_point_add() and converted back.
static void ec_GFp_nistp384_add(const EC_GROUP *group, EC_JACOBIAN *r,
                                const EC_JACOBIAN *a, const EC_JACOBIAN *b) {
  // Index 0, 1, 2 hold the X, Y, Z coordinate respectively.
  p384_felem lhs[3], rhs[3];

  p384_from_generic(lhs[0], &a->X);
  p384_from_generic(lhs[1], &a->Y);
  p384_from_generic(lhs[2], &a->Z);
  p384_from_generic(rhs[0], &b->X);
  p384_from_generic(rhs[1], &b->Y);
  p384_from_generic(rhs[2], &b->Z);

  // The sum overwrites |lhs|; both operands are Jacobian, hence mixed = 0.
  p384_point_add(lhs[0], lhs[1], lhs[2],
                 lhs[0], lhs[1], lhs[2],
                 0 /* both Jacobian */,
                 rhs[0], rhs[1], rhs[2]);

  p384_to_generic(&r->X, lhs[0]);
  p384_to_generic(&r->Y, lhs[1]);
  p384_to_generic(&r->Z, lhs[2]);
}

// Sets |r| = 2 * |a|. The operand is converted to the internal limb
// representation, doubled with p384_point_double() and converted back.
static void ec_GFp_nistp384_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
                                const EC_JACOBIAN *a) {
  // Index 0, 1, 2 hold the X, Y, Z coordinate respectively.
  p384_felem pt[3];

  p384_from_generic(pt[0], &a->X);
  p384_from_generic(pt[1], &a->Y);
  p384_from_generic(pt[2], &a->Z);

  // The doubled point overwrites |pt|.
  p384_point_double(pt[0], pt[1], pt[2], pt[0], pt[1], pt[2]);

  p384_to_generic(&r->X, pt[0]);
  p384_to_generic(&r->Y, pt[1]);
  p384_to_generic(&r->Z, pt[2]);
}

// The calls to from/to_generic are needed for the case
// when BORINGSSL_HAS_UINT128 is undefined, i.e. p384_32.h fiat code is used;
// while OPENSSL_64_BIT is defined, i.e. BN_ULONG is uint64_t
//
// ec_GFp_nistp384_mont_felem_to_bytes takes the field element |in| out of
// Montgomery form and writes it to |out| as a big-endian byte string whose
// length is the byte length of the field modulus. That length is stored in
// |*out_len|.
static void ec_GFp_nistp384_mont_felem_to_bytes(
  const EC_GROUP *group, uint8_t *out, size_t *out_len, const EC_FELEM *in) {

  size_t len = BN_num_bytes(&group->field.N);
  EC_FELEM felem_tmp;
  p384_felem tmp;
  p384_from_generic(tmp, in);
  p384_felem_from_mont(tmp, tmp);
  p384_to_generic(&felem_tmp, tmp);

  // |felem_tmp| is a field element, so the number of words to serialize is the
  // width of the field modulus (the same modulus |len| is derived from), not
  // the width of the group order.
  bn_words_to_big_endian(out, len, felem_tmp.words, group->field.N.width);

  *out_len = len;
}

// ec_GFp_nistp384_mont_felem_from_bytes parses the |len| bytes at |in| as a
// field element and stores it in |out| in Montgomery form. Returns 1 on
// success and 0 if ec_GFp_simple_felem_from_bytes() rejects the input.
static int ec_GFp_nistp384_mont_felem_from_bytes(
  const EC_GROUP *group, EC_FELEM *out, const uint8_t *in, size_t len) {

  // Decoding and validation are left to the generic routine (which calls
  // bn_cmp_words_consttime).
  EC_FELEM decoded;
  const int decode_ok = ec_GFp_simple_felem_from_bytes(group, &decoded, in, len);
  if (!decode_ok) {
    return 0;
  }

  // Move the decoded value into Montgomery form via the internal limb
  // representation.
  p384_felem limbs;
  p384_from_generic(limbs, &decoded);
  p384_felem_to_mont(limbs, limbs);
  p384_to_generic(out, limbs);
  return 1;
}

static int ec_GFp_nistp384_cmp_x_coordinate(const EC_GROUP *group,
                                            const EC_JACOBIAN *p,
                                            const EC_SCALAR *r) {
  if (ec_GFp_simple_is_at_infinity(group, p)) {
    return 0;
  }

  // We wish to compare X/Z^2 with r. This is equivalent to comparing X with
  // r*Z^2. Note that X and Z are represented in Montgomery form, while r is
  // not.
  p384_felem Z2_mont;
  p384_from_generic(Z2_mont, &p->Z);
  p384_felem_mul(Z2_mont, Z2_mont, Z2_mont);

  p384_felem r_Z2;
  p384_from_scalar(r_Z2, r);  // r < order < p, so this is valid.
  p384_felem_mul(r_Z2, r_Z2, Z2_mont);

  p384_felem X;
  p384_from_generic(X, &p->X);
  p384_felem_from_mont(X, X);

  if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
    return 1;
  }

  // During signing the x coefficient is reduced modulo the group order.
  // Therefore there is a small possibility, less than 2^189/2^384 = 1/2^195,
  // that group_order < p.x < p.
  // In that case, we need not only to compare against |r| but also to
  // compare against r+group_order.
  assert(group->field.N.width == group->order.N.width);
  EC_FELEM tmp;
  BN_ULONG carry =
      bn_add_words(tmp.words, r->words, group->order.N.d, group->field.N.width);
  if (carry == 0 &&
      bn_less_than_words(tmp.words, group->field.N.d, group->field.N.width)) {
    p384_from_generic(r_Z2, &tmp);
    p384_felem_mul(r_Z2, r_Z2, Z2_mont);
    if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) {
      return 1;
    }
  }

  return 0;
}

// Scalar multiplication of an arbitrary point: r = [scalar]P, where P is the
// point |p|.
static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                      const EC_JACOBIAN *p,
                                      const EC_SCALAR *scalar) {
  // Index 0, 1, 2 hold the X, Y, Z coordinate respectively.
  p384_felem input[3] = {{0}, {0}, {0}};
  p384_felem product[3] = {{0}, {0}, {0}};

  p384_from_generic(input[0], &p->X);
  p384_from_generic(input[1], &p->Y);
  p384_from_generic(input[2], &p->Z);

#if defined(EC_NISTP_USE_S2N_BIGNUM)
  // The s2n-bignum routine takes each point as one contiguous buffer, which
  // is how the three-element arrays are laid out.
  p384_montjscalarmul_selector((uint64_t*)product, scalar->words,
                               (uint64_t*)input);
#else
  ec_nistp_scalar_mul(p384_methods(), product[0], product[1], product[2],
                      input[0], input[1], input[2], scalar);
#endif

  p384_to_generic(&r->X, product[0]);
  p384_to_generic(&r->Y, product[1]);
  p384_to_generic(&r->Z, product[2]);
}

// Scalar multiplication of the P-384 base point: r = [scalar]G. The work is
// done by ec_nistp_scalar_mul_base() using the P-384 method table.
static void ec_GFp_nistp384_point_mul_base(const EC_GROUP *group,
                                           EC_JACOBIAN *r,
                                           const EC_SCALAR *scalar) {
  // Index 0, 1, 2 hold the X, Y, Z coordinate respectively.
  p384_felem product[3] = {{0}, {0}, {0}};

  ec_nistp_scalar_mul_base(p384_methods(),
                           product[0], product[1], product[2], scalar);

  p384_to_generic(&r->X, product[0]);
  p384_to_generic(&r->Y, product[1]);
  p384_to_generic(&r->Z, product[2]);
}

// Sets |r| = [g_scalar]G + [p_scalar]P, with G the base point of the P-384
// curve and P the given point |p|.
// Note: this function is NOT constant-time.
static void ec_GFp_nistp384_point_mul_public(const EC_GROUP *group,
                                             EC_JACOBIAN *r,
                                             const EC_SCALAR *g_scalar,
                                             const EC_JACOBIAN *p,
                                             const EC_SCALAR *p_scalar) {
  // Index 0, 1, 2 hold the X, Y, Z coordinate respectively.
  p384_felem input[3] = {{0}, {0}, {0}};
  p384_felem sum[3] = {{0}, {0}, {0}};

  p384_from_generic(input[0], &p->X);
  p384_from_generic(input[1], &p->Y);
  p384_from_generic(input[2], &p->Z);

  ec_nistp_scalar_mul_public(p384_methods(), sum[0], sum[1], sum[2], g_scalar,
                             input[0], input[1], input[2], p_scalar);

  p384_to_generic(&r->X, sum[0]);
  p384_to_generic(&r->Y, sum[1]);
  p384_to_generic(&r->Z, sum[2]);
}

// EC_GFp_nistp384_method fills in the EC_METHOD table for P-384.
// Entries named ec_GFp_nistp384_* are the P-384 specific implementations
// defined in this file; the remaining entries (ec_GFp_mont_* and ec_simple_*)
// are generic implementations defined elsewhere.
DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp384_method) {
  out->point_get_affine_coordinates =
      ec_GFp_nistp384_point_get_affine_coordinates;
  out->jacobian_to_affine_batch =
      ec_GFp_mont_jacobian_to_affine_batch;     // needed for TrustToken tests
  out->add = ec_GFp_nistp384_add;
  out->dbl = ec_GFp_nistp384_dbl;
  out->mul = ec_GFp_nistp384_point_mul;
  out->mul_base = ec_GFp_nistp384_point_mul_base;
  out->mul_public = ec_GFp_nistp384_point_mul_public;
  out->mul_batch = ec_GFp_mont_mul_batch;       // needed for TrustToken tests
  out->mul_public_batch = ec_GFp_mont_mul_public_batch;
  out->init_precomp = ec_GFp_mont_init_precomp; // needed for TrustToken tests
  out->mul_precomp = ec_GFp_mont_mul_precomp;   // needed for TrustToken tests
  out->felem_mul = ec_GFp_mont_felem_mul;
  out->felem_sqr = ec_GFp_mont_felem_sqr;
  out->felem_to_bytes = ec_GFp_nistp384_mont_felem_to_bytes;
  out->felem_from_bytes = ec_GFp_nistp384_mont_felem_from_bytes;
  out->felem_reduce = ec_GFp_mont_felem_reduce; // needed for ECTest.HashToCurve
  out->felem_exp = ec_GFp_mont_felem_exp;       // needed for ECTest.HashToCurve
  out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery;
  out->scalar_to_montgomery_inv_vartime =
      ec_simple_scalar_to_montgomery_inv_vartime;
  out->cmp_x_coordinate = ec_GFp_nistp384_cmp_x_coordinate;
}

// ----------------------------------------------------------------------------
//  Analysis of the doubling case occurrence in the Joye-Tunstall recoding:
//  p384_felem_mul_scalar_rwnaf()
// ----------------------------------------------------------------------------
//
// The JT scalar recoding is Algorithm 6: (Odd) Signed-Digit Recoding Algorithm in
// Joye, Tunstall, "Exponent Recoding and Regular Exponentiation Algorithms",
// AfricaCrypt 2009, available from
// https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.477.1245&rep=rep1&type=pdf
//
// We write the algorithm using variables similar to those used in the code and
// in the proof detailed in util.c (t_i in the algorithm below is d in
// p384_felem_mul_scalar_rwnaf()):
//
// Input: k: odd scalar, where k = (b_{l-1}, ..., b_1, b_0) in binary form,
//        w: window width
// Output: k = (t_{h-1}, ..., t_1, t_0)
//         with t_i \in {\pm 1, \pm 3, ..., \pm (2^{w} - 1)},
//         h = ceil(l/w), i.e. t_i are positive and negative odd digits
//         whose absolute value is at most (2^{w} - 1).
// i := 0
// j := 0
// while (k > 2^w):
//   window := (b_{j+w}, ..., b_j)   # (w+1)-bit window in k where
//                                   # the least significant bit is b_j
//   t_i := window - 2^w
//   k := k - t_i
//   k := k / 2^w          # k >> w
//   i += 1
//   j += w
// t_{h-1} := k
//
// Note that if b_{j+w} = 0, t_i will be negative;
// otherwise, if b_{j+w} = 1, t_i will be positive.
//
// The algorithm recodes the least (w+1) bits into a (odd) digit in the range
// [-(2^{w}-1), (2^{w}-1)] by subtracting 2^w from that digit and adding it back
// to the remaining bits of the scalar. This ensures that, after the w-bit right
// shift, the next least significant bit is 1, i.e. next digit is odd.
//
// In the following we will show that the non-trivial doubling case in
// single-point left-to-right windowed (or m-ary, m = 2^w) scalar multiplication
// may occur if and only if bit w of the group order (the bit of weight 2^w)
// is 1.  This only holds if the scalar is fully reduced and the group order is
// a prime that is much larger than 2^{w+1}.
//
// PROOF:
//
// Let n be the group order. Let l be the number of bits needed to represent n.
// Assume there exists some 0 <= k < n such that signed w-bit windowed
// multiplication hits the doubling case.
//
// Windowed multiplication consists of iterating over the digits t_i defined
// above by the algorithm from most to least significant. At iteration i
// (for i = ..., 3w, 2w, w, 0, starting from the most significant window), we:
//
//  1. Double the accumulator A, w times. Let A_i be the value of A at this
//     point, and it corresponds to a value [a_i]P.
//
//  2. Set A to T_i + A_i, where T_i is a precomputed multiple of P, [t_i]P

// From the algorithm steps we can see that the current digit
// t_i = (b_{w+i} ... b_i) - 2^w, b_i = 1     => -2^w < t_i < 2^w, t_i: odd
// which can also be written using C notation as
// t_i = [(k >> i) & ((1<<(w+1)) - 1)] - (1 << w)     -- (1)
//
// and the accumulator value
// a_i = b_{l-1} ... b_{i+w+1} 1
// when written as C notation
// a_i = ((k >> (i+w+1)) << (w+1)) + (1 << w)         -- (2)
//
// Similarly to the recoding in util.c, a_i is bounded by
// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up
// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for
// all groups. That would imply the subsequent a_i is zero, which means all
// terms thus far were zero.)
//
// Let j be the index such that A_j = T_j ≠ ∞. We have a_j = t_j (mod n). We now
// determine the value of a_j - t_j, which must be divisible by n.
// Our bounds on a_j and t_j imply a_j - t_j is 0 or n.
// If it is 0, a_j = t_j. However, 2^w divides a_j and -2^w < t_j < 2^w, so this
// can only happen if a_j = t_j = 0, which is a trivial doubling.
// Therefore, a_j - t_j = n.
//
// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then,
//
//   n = a_j - t_j = ((k >> (j+w+1)) << (w+1)) + (1 << w) - t_j
//                 = k/2^j + 2^w - t_j
//                 < n/2^w + 2^w + 2^w-1
//
// n is much larger than 2^{w+1}, so this is impossible. Thus, j = 0: only the
// final addition may hit the doubling case.
//
// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L
// such that k_H is the contribution from b_(l-1) .. b_{w+1},
// k_M is the contribution from b_w,
// and k_L is the contribution from b_(w-1) ... b_0.
// That is:
//
// - 2^{w+1} divides k_H
// - k_M is 0 or 2^w
// - 0 <= k_L < 2^w
//
// Divide n into n_H + n_M + n_L similarly.
// From (1) and (2), we have
//
// t_0 = (k_M + k_L) - 2^w
// a_0 = k_H + 2^w
//
// We try to find t_0 and a_0 such that
//
//               n = a_0 - t_0
// n_H + n_M + n_L = k_H + 2^w - (k_M + k_L - 2^w)
//                 = k_H + 2^{w+1} - (k_M + k_L)
//
// We know that k_H <= n_H.
//
// If k_H < n_H, then k_H <= n_H - 2^{w+1} (Note that 2^{w+1} divides both k_H
// and n_H). Then we would have
//
// n_H + n_M + n_L <= n_H - 2^{w+1} + 2^{w+1} - (k_M + k_L)
//       n_M + n_L <= - (k_M + k_L)
//
// Contradiction. Thus,
//
//          k_H = n_H                    -- (3)
// => n_M + n_L = 2^{w+1} - (k_M + k_L)  -- (4)
//
// We also have n > k; hence,
// n_M + n_L > k_M + k_L                 -- (5)
//
// For (3), (4) and (5) to hold,
// n_M = 2^w, k_M = 0.
//
// Otherwise, if n_M = 0 and k_M = 0
// n_L = 2^{w+1} - k_L
// n_L >= 2^{w+1} - (2^w - 1)
// n_L >= 2^w + 1
// Contradiction since n_L < 2^w.
//
// And if n_M = 0 and k_M = 2^w, (5) would not hold.
//
// Since n_M = 2^w, n_L >= 1, k_L >= 1, from (4) we have
//  k_M + k_L = 2^{w+1} - (n_M + n_L)
//           <= 2^{w+1} - 2^w - 1
//           <= 2^{w} - 1
// => k_M = 0
//
// Putting this together, from the group order of the curve, n, we can construct
// the scalar, k, that would incur a doubling in the last iteration as:
//
// if n_M = 2^w,
// k_H = n_H and
// k_M + k_L = 2^{w+1} - (n_M + n_L)
//
// COMMON CURVES:
//
// The group orders for common curves end in the following bit patterns:
//
//   P-521: ...00001001; w = 4, 5, 6, 7 are okay
//   P-384: ...01110011; w = 2, 3, 7    are okay
//   P-256: ...01010001; w = 2, 3, 5, 7 are okay
//
//
// CAN DOUBLING OCCUR IN RIGHT-TO-LEFT ALGORITHMS OR COMB ALGORITHMS?
//
// This question was answered empirically for P-384 group order n, w = 5,
// by asking:
// Is there a value d_76, such that
// - d_{76} * 2^{380} - a_{76} = n and
// - d_{76} * 2^{380} + a_{76} = k < n ?
//
// Setting
// d_76 = 0xf
// a_76 = (d_76 << 380) - n =
// -0xfffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973
// k = a_76 + (d_76 << 380) =
// 0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
//
// n-k =
// 0x1fffffffffffffffffffffffffffffffffffffffffffffff8ec69b03e86e5bbeb0341b6491614ef5d9d832d5998a52e6
// => 0 < k < n
//
// -(1<<380)-a_76 = -0x389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
// => -2^380 < a_76 < 2^380
//
// This shows that such a k value exists.
//
// This resulted in modifying the comb algorithm used in
// ec_GFp_nistp384_point_mul_base() to proceed in a left-to-right fashion in
// order to add the least significant digit in the last iteration.
//
// We can probably construct values of k that would incur doubling for whenever
// any of the higher digits, t_{j-1}, (down to the middle digit, roughly) is
// added last. This is because the upper half of the group order of P-384 is all
// 1s, therefore we can find a value k < n, having a 0 at the (j*w)th bit which
// would become 1 in the recoding of t_j (being the least significant bit in
// t_j) and making t_{j-1} a negative digit. Hence, the difference between the
// accumulator value containing all digits and t_{j-1} * 2^{(j-1)*w} can be n.
//
// This was tested as follows in Python for j = 75, i.e. the second last digit:
// let that digit's value be the smallest possible value for w = 5, i.e. -31
//  d_75 = -31
// # assuming the accumulator contains the most significant digit d_76
//  a = n + (d_75 << 375); hex(a)
// '0xf07fffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
//  hex(n)
// '0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
//  k = a + (d_75 << 375); hex(k)
// '0xe0ffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973L'
//
// # Checks
//  hex(n-k)
// '0x1f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000L'
//  hex(n - (a - (d_75 << 375)))
// '0x0L'
// => n > k
//    and a value k was found such that when adding d_75 last, the difference
//    between the accumulator a and (d_75 << 375) is n
//
//
// ----------------------------------------------------------------------------
//  Python code showing the doubling case occurrence in the Joye-Tunstall
//  recoding:
// ----------------------------------------------------------------------------
//
// from array import *
//
// # P-384 group order
// n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFC7634D81F4372DDF581A0DB248B0A77AECEC196ACCC52973
//
// # k value that causes a doubling case in left-to-right reconstruction
// k = 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc5294d
// # k value that causes a doubling case in right-to-left reconstruction
// k_r2l = 0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
//
//
// def recode(k, w):
//     rec = array('i', [])
//     while k > (2 ** w):
//         window = k & ((2 ** (w + 1)) - 1)
//         d = window - (2 ** w)
//         k = k - d
//         k = (k >> w)
//         rec.append(d)
//     rec.append(k)
//     return rec
//
// # Rebuild k from the recoded scalar proceeding from left to right
// def rebuild_l2r(rec, w):
//     l = rec.buffer_info()[1]  # length of the recoded scalar array
//     # initialise accumulator
//     a = rec[l-1]
//     # for i from l-2 downto 0
//     for i in range(l-2,-1,-1):
//         a = (a << w)
//         if (a - rec[i]) == n:
//             print("L2R Doubling case: ")
//             print("    i =", i, " digit =", hex(rec[i]))
//             print("    a =", hex(a))
//         a += rec[i]
//     return a
//
// # Rebuild k from the recoded scalar proceeding from right to left
// def rebuild_r2l(rec, w):
//     l = rec.buffer_info()[1]  # length of the recoded scalar array
//     # initialise accumulator
//     a = rec[0]
//     # for i from 1 to l-1
//     for i in range(1,l):
//         shifted_d = rec[i] << (w*i)
//         if (shifted_d - a) == n:
//             print("R2L Doubling case: ")
//             print("    i =", i, " digit =", hex(rec[i]))
//             print("    a =", hex(a))
//         a += shifted_d
//     return a
//
// def test_recode():
//     w = 5
//
//     # Left-to-right recoding of k which causes a doubling case
//     assert k < n
//     print("k = ", hex(k))
//     # recode k
//     rec_k = recode(k,w)
//     # print(rec_k)
//     print("Digits of the recoded scalar:")
//     for a in rec_k:
//         print(hex(a), end=', ')
//     print()
//     # rebuild k
//     out_k = rebuild_l2r(rec_k, w)
//     if out_k != k:
//         print("ERROR: rebuilt value is different from recoded value")
//     print()
//
//     # Right-to-left recoding of k_r2l which causes a doubling case
//     assert k_r2l < n
//     print("k = ", hex(k_r2l))
//     # recode k_r2l
//     rec_k_r2l = recode(k_r2l,w)
//     # print(rec_k_r2l)
//     print("Digits of the recoded scalar:")
//     for a in rec_k_r2l:
//         print(hex(a), end=', ')
//     print()
//     # rebuild k_r2l
//     out_k_r2l = rebuild_r2l(rec_k_r2l, w)
//     if out_k_r2l != k_r2l:
//         print("ERROR: rebuilt R2L value is different from recoded value")
//     print()
//
// test_recode()
// '''
// Output:
// -------
// k =  0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc5294d
// Digits of the recoded scalar:
// -0x13, -0x15, -0x15, -0x15, -0x13, 0x7, 0xb, 0xd, -0x7, 0x1, 0x1b, -0x7, 0xf, 0x1d, -0x3, -0xb, 0x11, -0x1b, -0xd, 0x5, -0x5, -0x19, 0x9, -0x1d, -0x7, 0x1b, 0x17, -0x5, 0x13, -0x5, -0xf, 0x1f, -0x1f, 0xd, -0xd, -0x19, 0x17, 0x3, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0xf,
// L2R Doubling case:
//     i = 0  digit = -0x13
//     a = 0xffffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52960
//
// k =  0xe00000000000000000000000000000000000000000000000389cb27e0bc8d220a7e5f24db74f58851313e695333ad68d
// Digits of the recoded scalar:
// -0x13, 0x15, 0x15, 0x15, 0x13, -0x7, -0xb, -0xd, 0x7, -0x1, -0x1b, 0x7, -0xf, -0x1d, 0x3, 0xb, -0x11, 0x1b, 0xd, -0x5, 0x5, 0x19, -0x9, 0x1d, 0x7, -0x1b, -0x17, 0x5, -0x13, 0x5, 0xf, -0x1f, 0x1f, -0xd, 0xd, 0x19, -0x17, -0x3, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, -0x1f, 0xf,
// R2L Doubling case:
//     i = 76  digit = 0xf
//     a = -0xfffffffffffffffffffffffffffffffffffffffffffffffc7634d81f4372ddf581a0db248b0a77aecec196accc52973
// '''
//
#endif // !defined(OPENSSL_SMALL)