chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,257 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
// BN_add computes r = a + b, handling all four sign combinations by
// reducing them to an unsigned add or an unsigned subtract of magnitudes.
// Returns one on success and zero on allocation failure.
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  const int a_neg = a->neg;
  //  a +  b    a+b
  //  a + -b    a-b
  // -a +  b    b-a
  // -a + -b  -(a+b)
  if (!(a_neg ^ b->neg)) {
    // Same sign: |r| = |a| + |b| and r keeps the shared sign. The sign is
    // assigned unconditionally, matching the original control flow.
    int ok = BN_uadd(r, a, b);
    r->neg = a_neg;
    return ok;
  }
  // Opposite signs: the result is a difference of magnitudes. Arrange for
  // the subtraction to be a - b with a the non-negative operand.
  if (a_neg) {
    const BIGNUM *swap = a;
    a = b;
    b = swap;
  }
  if (BN_ucmp(a, b) < 0) {
    // |a| < |b|, so a - b is negative: compute |b| - |a| and negate.
    if (!BN_usub(r, b, a)) {
      return 0;
    }
    r->neg = 1;
  } else {
    if (!BN_usub(r, a, b)) {
      return 0;
    }
    r->neg = 0;
  }
  return 1;
}
// bn_uadd_consttime sets r = |a| + |b| without minimizing the result's
// width. Only the (public) widths of the inputs influence timing, not the
// word values themselves.
int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  // Normalize so |a| is the wider operand; widths are public.
  if (a->width < b->width) {
    const BIGNUM *swap = a;
    a = b;
    b = swap;
  }
  const int max_width = a->width;
  const int min_width = b->width;
  if (!bn_wexpand(r, max_width + 1)) {
    return 0;
  }
  r->width = max_width + 1;
  // Add the overlapping words, then propagate the carry through the
  // remaining high words of |a|.
  BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min_width);
  for (int j = min_width; j < max_width; j++) {
    r->d[j] = CRYPTO_addc_w(a->d[j], 0, carry, &carry);
  }
  // The final carry becomes the (possibly zero) extra top word.
  r->d[max_width] = carry;
  return 1;
}
// BN_uadd sets r = |a| + |b| and normalizes r to its minimal width.
// Returns one on success and zero on allocation failure.
int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  if (bn_uadd_consttime(r, a, b)) {
    bn_set_minimal_width(r);
    return 1;
  }
  return 0;
}
// BN_add_word adds the single word w to a in place. Returns one on success
// and zero on allocation failure.
int BN_add_word(BIGNUM *a, BN_ULONG w) {
  // Adding zero is a no-op.
  if (w == 0) {
    return 1;
  }
  // a == 0: the result is simply w.
  if (BN_is_zero(a)) {
    return BN_set_word(a, w);
  }
  // a < 0: compute |a| - w via BN_sub_word (which may itself flip the
  // sign when w > |a|), then negate unless the result is zero.
  if (a->neg) {
    a->neg = 0;
    int ok = BN_sub_word(a, w);
    if (!BN_is_zero(a)) {
      a->neg = !a->neg;
    }
    return ok;
  }
  // Positive a: ripple the carry word-by-word until it is absorbed or we
  // run off the top of a.
  int i = 0;
  while (w != 0 && i < a->width) {
    BN_ULONG sum = a->d[i] + w;
    a->d[i] = sum;
    // The carry out is one exactly when the addition wrapped.
    w = (w > sum) ? 1 : 0;
    i++;
  }
  // A carry survived past the top word; grow a by one word to hold it.
  if (w != 0 && i == a->width) {
    if (!bn_wexpand(a, a->width + 1)) {
      return 0;
    }
    a->width++;
    a->d[i] = w;
  }
  return 1;
}
// BN_sub computes r = a - b, reducing signed subtraction to unsigned
// operations on magnitudes. Returns one on success and zero on error
// (allocation failure).
int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  //  a -  b    a-b
  //  a - -b    a+b
  // -a -  b  -(a+b)
  // -a - -b    b-a
  int use_add = 0;
  int add_sign = 0;
  if (a->neg) {
    if (b->neg) {
      // -a - -b == b - a: swap operands and use the subtraction path.
      const BIGNUM *swap = a;
      a = b;
      b = swap;
    } else {
      // -a - b == -(a + b).
      use_add = 1;
      add_sign = 1;
    }
  } else if (b->neg) {
    // a - -b == a + b.
    use_add = 1;
    add_sign = 0;
  }
  if (use_add) {
    if (!BN_uadd(r, a, b)) {
      return 0;
    }
    r->neg = add_sign;
    return 1;
  }
  // Subtraction path: compute |a| - |b|, negating when |a| < |b|.
  if (BN_ucmp(a, b) < 0) {
    if (!BN_usub(r, b, a)) {
      return 0;
    }
    r->neg = 1;
  } else {
    if (!BN_usub(r, a, b)) {
      return 0;
    }
    r->neg = 0;
  }
  return 1;
}
// bn_usub_consttime sets r = |a| - |b| without minimizing the result's
// width. It fails with BN_R_ARG2_LT_ARG3 if |a| < |b|. Only the (public)
// widths influence timing, not the word values.
int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
// |b| may have more words than |a| given non-minimal inputs, but all words
// beyond |a->width| must then be zero.
int b_width = b->width;
if (b_width > a->width) {
if (!bn_fits_in_words(b, a->width)) {
OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3);
return 0;
}
// All of |b|'s extra words are zero, so only the low words matter.
b_width = a->width;
}
if (!bn_wexpand(r, a->width)) {
return 0;
}
// Subtract the overlapping words, then propagate the borrow through the
// remaining high words of |a|.
BN_ULONG borrow = bn_sub_words(r->d, a->d, b->d, b_width);
for (int i = b_width; i < a->width; i++) {
r->d[i] = CRYPTO_subc_w(a->d[i], 0, borrow, &borrow);
}
// A borrow out of the top word means |a| < |b|, which is an error here.
if (borrow) {
OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3);
return 0;
}
r->width = a->width;
r->neg = 0;
return 1;
}
// BN_usub sets r = |a| - |b| and normalizes r to its minimal width.
// Fails (leaving an error on the queue) if |a| < |b|.
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  if (bn_usub_consttime(r, a, b)) {
    bn_set_minimal_width(r);
    return 1;
  }
  return 0;
}
// BN_sub_word subtracts the single word w from a in place, flipping the
// sign when the result crosses zero. Returns one on success and zero on
// error.
int BN_sub_word(BIGNUM *a, BN_ULONG w) {
int i;
// degenerate case: w is zero
if (!w) {
return 1;
}
// degenerate case: a is zero, so the result is exactly -w
if (BN_is_zero(a)) {
i = BN_set_word(a, w);
if (i != 0) {
BN_set_negative(a, 1);
}
return i;
}
// handle 'a' when negative: -|a| - w == -(|a| + w)
if (a->neg) {
a->neg = 0;
i = BN_add_word(a, w);
a->neg = 1;
return i;
}
// Single-word |a| smaller than w: the difference flips sign.
if ((bn_minimal_width(a) == 1) && (a->d[0] < w)) {
a->d[0] = w - a->d[0];
a->neg = 1;
return 1;
}
i = 0;
// Ripple the borrow upward. When a->d[i] < w the subtraction wraps
// (well-defined for unsigned BN_ULONG) and the borrow (w = 1) moves to the
// next word. Termination relies on |a| >= w at this point, so some higher
// word is nonzero and the loop must hit the >= case.
for (;;) {
if (a->d[i] >= w) {
a->d[i] -= w;
break;
} else {
a->d[i] -= w;
i++;
w = 1;
}
}
// Drop a top word that became zero to keep the representation minimal.
if ((a->d[i] == 0) && (i == (a->width - 1))) {
a->width--;
}
return 1;
}

View File

@@ -0,0 +1,733 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
# The first two arguments should always be the flavour and output file path.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# For a real flavour, pipe the generated source through arm-xlate.pl so it
# is translated for the target assembler; a "void" (or empty) flavour writes
# the template to the output file verbatim.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Register allocation for the integer-only bn_mul_mont_nohw code below.
# These Perl variables are interpolated into the $code template; several
# physical registers are reused for different values in different phases
# (e.g. r2 holds the bp argument, then b[i], and finally rp).
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global bn_mul_mont_nohw
.type bn_mul_mont_nohw,%function
.align 5
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global bn_mul8x_mont_neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
# Post-process the accumulated assembly line by line: evaluate any
# backquoted Perl expressions, map the q<N>#lo/q<N>#hi pseudo-register
# notation onto the underlying d registers, and (when neither applied)
# rewrite return sequences so the result can still assemble for ARMv4.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,574 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for the x86 (i386) bignum primitive routines. Each sub below
# emits one assembly function via the perlasm x86asm.pl framework; this
# top-level driver wires up the output file and emits them all.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# SSE2 fast paths are always generated; they are gated at runtime on the
# CPUID bit read from OPENSSL_ia32cap_P.
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# Emits bn_mul_add_words(r, a, num, w): r[i] += a[i]*w with carry
# propagation, returning the final carry word. Generates an SSE2 path
# (8-way unrolled plus a 1-word loop, selected at runtime via CPUID bit 26)
# followed by a classic mul/adc fallback.
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
# 8-way unrolled SSE2 body: each iteration consumes a[0..7]/r[0..7] and
# carries a 64-bit running sum in mm1 whose high half is the carry.
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
# One-word-at-a-time SSE2 loop for the remaining (num mod 8) words.
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul/adc.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
# Fully unrolled tail for the remaining 1..7 words.
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
# Emits bn_mul_words(r, a, num, w): r[i] = a[i]*w with carry propagation,
# returning the final carry word. Same dual SSE2/integer structure as
# bn_mul_add_words, but without adding in the previous r[i].
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul/adc.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
# Fully unrolled tail for the remaining 1..7 words.
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
# Emits bn_sqr_words(r, a, num): r[2*i], r[2*i+1] = low/high halves of
# a[i]*a[i]. No cross-word carries are needed, so each word is independent.
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
# Fully unrolled tail for the remaining 1..7 words.
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
# Emits bn_div_words(h, l, d): returns the 32-bit quotient of the 64-bit
# value h:l (edx:eax) divided by d, using a single DIV instruction.
# NOTE(review): DIV raises #DE when the quotient does not fit in 32 bits —
# presumably callers guarantee h < d; verify against the callers.
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
# Emits bn_add_words(r, a, b, num): r[i] = a[i] + b[i] with carry
# propagation, returning the final carry (in eax, which is also $c).
# 8-way unrolled main loop plus a fully unrolled 1..7-word tail.
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
# The incoming carry word $c is materialized into the CPU carry flag by
# the add/mov/adc sequence, then combined with a[i] + b[i].
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
# Emits bn_sub_words(r, a, b, num): r[i] = a[i] - b[i] with borrow
# propagation, returning the final borrow (in eax, which is also $c).
# Structurally the mirror of bn_add_words with sub in place of add.
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
# The incoming borrow word $c is materialized into the CPU carry flag by
# the sub/mov/adc sequence, then combined with a[i] - b[i].
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env perl
# Copyright (c) 2023, Google Inc.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for the AArch64 implementations of bn_add_words and
# bn_sub_words (multi-word add/sub with carry/borrow propagation).  The
# assembly is written once below and piped through arm-xlate.pl so the
# same text serves every output flavour.
use strict;
my $flavour = shift;
my $output = shift;
# A single file-looking argument means only the output path was given.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Locate arm-xlate.pl next to this script or in the shared perlasm tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Route everything we print through the translator.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;
# x0-x3 carry the C arguments; x4-x8 are scratch registers.
my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");
# NOTE: the heredoc below is emitted verbatim (post-xlate); its contents,
# including the '#' comments, are part of the generated file and must not
# be edited casually.
my $code = <<____;
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Ladd_tail
.Ladd_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
adcs $a0, $a0, $b0
adcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Ladd_loop
.Ladd_tail:
cbz $num, .Ladd_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
adcs $a0, $a0, $b0
str $a0, [$rp], #8
.Ladd_exit:
cset x0, cs
ret
.cfi_endproc
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Lsub_tail
.Lsub_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
sbcs $a0, $a0, $b0
sbcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Lsub_loop
.Lsub_tail:
cbz $num, .Lsub_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
sbcs $a0, $a0, $b0
str $a0, [$rp], #8
.Lsub_exit:
cset x0, cc
ret
.cfi_endproc
.size bn_sub_words,.-bn_sub_words
____
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for x86 Comba-style multiplication and squaring routines
# (bn_mul_comba4/8, bn_sqr_comba4/8) via the perlasm x86 framework.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Make x86asm.pl (and its flavour backends) reachable from this script's dir.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# Emit the four exported functions; the second argument is the fixed
# operand length in 32-bit words.
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# Emit one Comba multiply step: (c2,c1,c0) += a[$ai]*b[$bi], interleaved
# with the loads/stores needed to pipeline the surrounding column loop.
# $pos selects the variant:
#   -1 : eax/edx already hold a[$ai]/b[$bi]; no extra loads or stores
#    0 : additionally pre-load the next operands a[$na], b[$nb]
#    1 : column complete - store r[$i] and pre-load operands for the next one
#    2 : final column - store r[$i] only
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
###
&adc($c2,0);
# when pos > 0 this step finishes a result column, so store the word
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
}
# Emit one Comba squaring step for a diagonal term:
# (c2,c1,c0) += a[$ai]*a[$bi].  Used when the term appears only once in
# the square (i.e. $ai == $bi for the actual multiply that is emitted as
# a `mul eax`).  $pos and $na/$nb have the same meaning as in mul_add_c.
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
# Square when the indices coincide, otherwise a plain multiply.
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# when pos > 0 this step finishes a result column, so store the word
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
# Emit one Comba squaring step for an off-diagonal term, which occurs
# twice in the square: (c2,c1,c0) += 2*a[$ai]*a[$bi].  The 64-bit product
# in edx:eax is doubled in place first (any overflow of the doubling is
# caught into $c2 via the adc), then accumulated.
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
# Double the product: edx:eax = 2*a[ai]*a[bi], overflow bit into $c2.
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
# Emit an x86 Comba (product-scanning) multiplication r = a*b for a fixed
# operand size of $num 32-bit words.  The 2*$num-1 result columns are
# produced in order; within column $i, all partial products a[ai]*b[bi]
# with ai+bi == $i are accumulated into the rotating register triple
# ($c0,$c1,$c2).  ($as,$ae,$bs,$be) track the moving start/end of the
# diagonal as $i advances.
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word of a
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first word of b
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
# $v: 0 = mid-column step, 1 = last step of a column,
# 2 = last step of the final column.
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
# ($na,$nb): indices of the operands to pre-load for the next step.
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
# Rotate the accumulator triple for the next column.
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
# Advance the diagonal window: widen while $i < $num-1, then shrink.
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
# Emit an x86 Comba (product-scanning) squaring r = a*a for a fixed
# operand size of $num 32-bit words.  Like bn_mul_comba, but each column
# only walks half the diagonal: off-diagonal terms are emitted once via
# sqr_add_c2 (which doubles the product), diagonal terms via sqr_add_c.
sub bn_sqr_comba
{
local($name,$num)=@_;
# NOTE(review): @_ still holds ($name,$num) here, so $r/$a briefly alias
# them; both are unconditionally overwritten with register names below,
# so this behaves as a plain local declaration.
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
# $v: non-zero once the index pair crosses the diagonal midpoint,
# meaning this is the last step emitted for column $i.
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
# ($na,$nb): operand indices to pre-load for the following step.
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
# Diagonal term counted once; off-diagonal term counted twice.
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
# Rotate the accumulator triple and move on to the next column.
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
# Advance the diagonal window: widen while $i < $num-1, then shrink.
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}

View File

@@ -0,0 +1,698 @@
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov.
# Special thanks to Ilya Albrekht for his valuable hints.
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# |         | OpenSSL 3.0.0-alpha9 | this         | Unit        |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659            | 1 015 625    | cycles/sign |
# |         | 611                  | 1280 / +109% | sign/s      |
# |---------+----------------------+--------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Assume AVX512IFMA support until one of the probes below says otherwise.
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Probe the GNU assembler version (>= 2.26 understands AVX512IFMA).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
# Probe nasm on Windows builds (>= 2.11.8 required).
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
# Probe clang/LLVM (and the differently-numbered Apple clang) versions.
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23)));
# Registers mapping for normalization.
my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26)));
# Append to $code one iteration of the 52x20 almost-Montgomery
# multiplication: fold b[i] into the vector accumulators $_R0..$_R2
# (five ymm registers = 20 x 52-bit limbs) plus the scalar low-limb
# accumulator $_acc, using AVX512IFMA vpmadd52{l,h}uq.
sub amm52x20_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
# xmm view of $_R0, used to extract its low qword with vmovq.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $zero, $_R2
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[bcd]x
# Append to $code the carry-normalization sequence: fold $_acc back into
# the vector accumulators and propagate all inter-limb carries so every
# qword of $_R0..$_R2 holds a normalized 52-bit limb.  Carry-propagation
# masks are merged into bytes so a scalar add-with-carry chain can ripple
# them across all 20 limbs at once.
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of R0..R2
# Save them to LSB of QWs in T0..T2
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
# "Shift left" T0..T2 by 1 QW
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..R2 QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
# Sum R0..R2 with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
# Now handle carry bits from this addition
# Get mask of QWs which 52-bit parts overflow...
vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt)
vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r14d # k1
kmovb %k2, %r13d # k1h
kmovb %k3, %r12d # k2
kmovb %k4, %r11d # k2h
kmovb %k5, %r10d # k3
# ...or saturated
vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq
vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r9d # k4
kmovb %k2, %r8d # k4h
kmovb %k3, %ebx # k5
kmovb %k4, %ecx # k5h
kmovb %k5, %edx # k6
# Get mask of QWs where carries shall be propagated to.
# Merge 4-bit masks to 8-bit values to use add with carry.
shl \$4, %r13b
or %r13b, %r14b
shl \$4, %r11b
or %r11b, %r12b
add %r14b, %r14b
adc %r12b, %r12b
adc %r10b, %r10b
shl \$4, %r8b
or %r8b,%r9b
shl \$4, %cl
or %cl, %bl
add %r9b, %r14b
adc %bl, %r12b
adc %dl, %r10b
xor %r9b, %r14b
xor %bl, %r12b
xor %dl, %r10b
kmovb %r14d, %k1
shr \$4, %r14b
kmovb %r14d, %k2
kmovb %r12d, %k3
shr \$4, %r12b
kmovb %r12d, %k4
kmovb %r10d, %k5
# Add carries according to the obtained mask
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x20_x1_ifma256
.type rsaz_amm52x20_x1_ifma256,\@function,5
.align 32
rsaz_amm52x20_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x1_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 20 digits unrolled by 4
mov \$5, $iter
.align 32
.Lloop5:
___
foreach my $idx (0..3) {
&amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop5
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
# const BN_ULONG m[2][20],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x20_x2_ifma256
.type rsaz_amm52x20_x2_ifma256,\@function,5
.align 32
rsaz_amm52x20_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x2_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$20, $iter
.align 32
.Lloop20:
___
&amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
# 20*8 = offset of the next dimension in two-dimension array
&amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop20
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
&amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R0_1, `5*32`($res)
vmovdqu64 $R0_1h, `6*32`($res)
vmovdqu64 $R1_1, `7*32`($res)
vmovdqu64 $R1_1h, `8*32`($res)
vmovdqu64 $R2_1, `9*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x2_ifma256, .-rsaz_amm52x20_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x20_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x20_win5
.type extract_multiplier_2x20_win5,\@abi-omnipotent
extract_multiplier_2x20_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
foreach (0..9) {
my $mask = $_<5?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*20*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x20_win5, .-extract_multiplier_2x20_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_def_handler,\@abi-omnipotent
.align 16
rsaz_def_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_def_handler,.-rsaz_def_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x20_x1_ifma256
.rva .LSEH_end_rsaz_amm52x20_x1_ifma256
.rva .LSEH_info_rsaz_amm52x20_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x20_x2_ifma256
.rva .LSEH_end_rsaz_amm52x20_x2_ifma256
.rva .LSEH_info_rsaz_amm52x20_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x20_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x1_ifma256_body,.Lrsaz_amm52x20_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x20_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x2_ifma256_body,.Lrsaz_amm52x20_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x20_x1_ifma256
.globl rsaz_amm52x20_x2_ifma256
.globl extract_multiplier_2x20_win5
.type rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
rsaz_amm52x20_x1_ifma256:
rsaz_amm52x20_x2_ifma256:
extract_multiplier_2x20_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,854 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637 | 2 866 593 | cycles/sign |
# | | 203.2 | 453.5 / +123% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
# Emits one digit-step of Almost Montgomery Multiplication for a 30-digit
# (radix-2^52) operand held in eight ymm accumulators: multiplies the whole
# operand |a| by the 64-bit digit b[i], derives the reduction multiplier
# yi = (acc * k0) & mask52, folds in yi * |m|, then shifts the accumulator
# chain right by one qword.  The emitted text clobbers %r10, %r12, %r13
# and %rdx (mulx implicit source).
sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
# _acc - GPR holding the scalar low-qword accumulator;
# _R0.._R3h - eight ymm accumulators (4 digits each = 30 digits + pad);
# _k0 - Montgomery constant k0 (register or memory operand string).
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
# xmm alias of R0 so vmovq can extract its low qword.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $zero, $_R3h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
# Emits carry propagation over the eight ymm accumulators (the scalar
# accumulator $_acc is first merged into R0[0]):
#  1) the 12 high bits of each 52-bit digit are extracted, shifted one
#     digit left (valignq) and added back;
#  2) per-digit masks of "overflowed" (> 2^52-1, vpcmpuq op 6) and
#     "saturated" (== 2^52-1, vpcmpuq op 0) digits are collected into
#     byte GPRs, and the addb/adcb byte chain plus xor ripples all
#     resulting carries across the 30 digits in a few scalar ops;
#  3) masked vpsubq/vpandq apply the carries and re-mask to 52 bits.
# Uses %r8-14,%e[abcd]x plus mask registers %k1-%k7.
sub amm52x30_x1_norm {
# $_acc - scalar accumulator GPR; $_R0..$_R3h - the eight ymm accumulators.
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
shr \$4,%r11b
kmovb %r11d,%k1
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x30_x1_ifma256
.type rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
rsaz_amm52x30_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 30 digits unrolled by 4
mov \$7, $iter
.align 32
.Lloop7:
___
# 7 loop iterations x 4 unrolled digit-steps = 28 of the 30 b-digits.
foreach my $idx (0..3) {
&amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop7
___
# Remaining 2 digits (30 = 7*4 + 2) are processed outside the loop.
&amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
&amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
# Final carry propagation back to normalized radix-2^52 form.
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
# const BN_ULONG a[2][32],
# const BN_ULONG b[2][32],
# const BN_ULONG m[2][32],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x30_x2_ifma256
.type rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
rsaz_amm52x30_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$30, $iter
.align 32
.Lloop30:
___
&amm52x30_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
# 32*8 = offset of the next dimension in two-dimension array
&amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop30
___
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
&amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R0_1, `8*32`($res)
vmovdqu64 $R0_1h, `9*32`($res)
vmovdqu64 $R1_1, `10*32`($res)
vmovdqu64 $R1_1h, `11*32`($res)
vmovdqu64 $R2_1, `12*32`($res)
vmovdqu64 $R2_1h, `13*32`($res)
vmovdqu64 $R3_1, `14*32`($res)
vmovdqu64 $R3_1h, `15*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x2_ifma256, .-rsaz_amm52x30_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x30_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# Emits a table-extraction routine: the loop below touches every one of the
# 2^5 table entries and selects the two wanted ones with mask-register
# blends, so the memory access pattern does not depend on the (secret)
# indices red_table_idx1/red_table_idx2.
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));
# 16 ymm destination registers: t0..t7 collect entry idx1, t8..t15 entry idx2.
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
# xmm alias of t0 for the cheap vpxor zeroing below.
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x30_win5
.type extract_multiplier_2x30_win5,\@abi-omnipotent
extract_multiplier_2x30_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*32*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
# Zero the remaining 15 destination registers.
foreach (1..15) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
# Blend each 32-byte slice of the current entry into its destination when
# the corresponding index matches (first 8 regs -> idx1, last 8 -> idx2).
foreach (0..15) {
my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*32*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..15) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x30_win5, .-extract_multiplier_2x30_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x30_x1_ifma256
.rva .LSEH_end_rsaz_amm52x30_x1_ifma256
.rva .LSEH_info_rsaz_amm52x30_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x30_x2_ifma256
.rva .LSEH_end_rsaz_amm52x30_x2_ifma256
.rva .LSEH_info_rsaz_amm52x30_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x30_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x1_ifma256_body,.Lrsaz_amm52x30_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x30_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x2_ifma256_body,.Lrsaz_amm52x30_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x30_x1_ifma256
.globl rsaz_amm52x30_x2_ifma256
.globl extract_multiplier_2x30_win5
.type rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
rsaz_amm52x30_x1_ifma256:
rsaz_amm52x30_x2_ifma256:
extract_multiplier_2x30_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,915 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa4096 | 14 301 430 | 5 813 953 | cycles/sign |
# | | 90.9 | 223.6 / +146% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# CLI contract: <flavour> <output-path>.  Die early if either is missing.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Windows x64 ABI is selected by the flavour (nasm/masm/mingw64) or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Assume AVX512-IFMA toolchain support; the probes below clear it if too old.
$avx512ifma=1;
# Locate the x86_64 perlasm translator next to this script or in perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# GNU assembler needs >= 2.26 for the AVX512-IFMA mnemonics.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
# NASM needs >= 2.11.8 (or any >= 2.12).
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
# clang/LLVM needs >= 7.0 (Apple clang >= 10.0.1).
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
# Pipe everything we print through the perlasm translator into $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29)));
# Emits one digit-step of Almost Montgomery Multiplication for a 40-digit
# (radix-2^52) operand held in ten ymm accumulators: multiplies the whole
# operand |a| by the 64-bit digit b[i], derives the reduction multiplier
# yi = (acc * k0) & mask52, folds in yi * |m|, then shifts the accumulator
# chain right by one qword.  The emitted text clobbers %r10, %r12, %r13
# and %rdx (mulx implicit source).
sub amm52x40_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
# _acc - GPR holding the scalar low-qword accumulator;
# _R0.._R4h - ten ymm accumulators (4 digits each = 40 digits);
# _k0 - Montgomery constant k0 (register or memory operand string).
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_;
# xmm alias of R0 so vmovq can extract its low qword.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $_R4, $_R3h
valignq \$1, $_R4, $_R4h, $_R4
valignq \$1, $_R4h, $zero, $_R4h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
# Emits carry propagation over the ten ymm accumulators (the scalar
# accumulator $_acc is first merged into R0[0]):
#  1) the 12 high bits of each 52-bit digit are extracted, shifted one
#     digit left (valignq) and added back;
#  2) per-digit masks of "overflowed" (> 2^52-1, vpcmpuq op 6) and
#     "saturated" (== 2^52-1, vpcmpuq op 0) digits are collected into
#     byte GPRs, and the addb/adcb byte chain plus xor ripples all
#     resulting carries across the 40 digits in a few scalar ops;
#  3) masked vpsubq/vpandq apply the carries and re-mask to 52 bits.
# Uses %r8-14,%e[abcd]x plus mask registers %k1-%k7 (reloaded for R3h/R4/R4h).
sub amm52x40_x1_norm {
# $_acc - scalar accumulator GPR; $_R0..$_R4h - the ten ymm accumulators.
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
vpsrlq \$52, $_R4, $T4
vpsrlq \$52, $_R4h, $T4h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T4, $T4h, $T4h
valignq \$3, $T3h, $T4, $T4
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
vpaddq $T4, $_R4, $_R4
vpaddq $T4h, $_R4h, $_R4h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
vpcmpuq \$6,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%r10d
kmovb %k2,%r9d
shl \$4,%r9b
or %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
vpcmpuq \$0,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%ebx
kmovb %k2,%eax
shl \$4,%al
or %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
xor %bl,%r10b
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
shr \$4,%r11b
kmovb %r11d,%k1
kmovb %r10d,%k2
shr \$4,%r10b
kmovb %r10d,%k3
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpsubq .Lmask52x4(%rip), $_R4, ${_R4}{%k2}
vpsubq .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
___
}
# ---------------------------------------------------------------------------
# Emit rsaz_amm52x40_x1_ifma256 -- Almost Montgomery Multiplication on
# 40x52-bit-digit operands using AVX-512 IFMA256.  Declared \@function,5,
# so it takes five arguments; presumably (res, a, b, m, k0), mirroring the
# x2 variant documented below -- confirm against the C header.
# The whole body sits under a cpp guard so assemblers too old for AVX-512
# fall through to the ud2 stubs emitted at the end of the file.
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x40_x1_ifma256
.type rsaz_amm52x40_x1_ifma256,\@function,5
.align 32
rsaz_amm52x40_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
# Win64 ABI treats xmm6-xmm15 as callee-saved: spill them to a scratch area.
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x1_ifma256_body:
___
# Zero the ten zmm accumulators holding the 40-digit redundant result, plus
# the scalar accumulator, then loop over b's 40 digits (10 iterations x 4
# unrolled digits per iteration).
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 40 digits unrolled by 4
mov \$10, $iter
.align 32
.Lloop10:
___
# Each amm52x40_x1 call consumes one 64-bit digit of b (offsets 0..3*8).
foreach my $idx (0..3) {
&amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop10
___
# Final carry propagation / normalization of the 52-bit redundant digits.
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
# Win64: restore the spilled xmm registers before popping the GPRs.
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
# 4 x 52-bit mask constant used by the normalization helpers above.
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
# const BN_ULONG a[2][40],
# const BN_ULONG b[2][40],
# const BN_ULONG m[2][40],
# const BN_ULONG k0[2]);
###############################################################################
# Two independent 40-digit AMMs interleaved in one pass (e.g. the p and q
# halves of an RSA CRT exponentiation), sharing the loop over b's digits.
$code.=<<___;
.text
.globl rsaz_amm52x40_x2_ifma256
.type rsaz_amm52x40_x2_ifma256,\@function,5
.align 32
rsaz_amm52x40_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
# Win64: spill callee-saved xmm6-xmm15.
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x2_ifma256_body:
___
# Zero both sets of accumulators (suffix _0 for the first operand pair,
# _1 for the second), then iterate over all 40 digits of b, one per pass.
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
vmovdqa64 $zero, $R4_1
vmovdqa64 $zero, $R4_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$40, $iter
.align 32
.Lloop40:
___
# One digit of each of the two independent multiplications per iteration;
# the second uses k0[1] and the second plane of the 2-D arrays.
&amm52x40_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)");
# 40*8 = offset of the next dimension in two-dimension array
&amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop40
___
# Normalize both results independently.
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
&amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vmovdqu64 $R0_1, `10*32`($res)
vmovdqu64 $R0_1h, `11*32`($res)
vmovdqu64 $R1_1, `12*32`($res)
vmovdqu64 $R1_1h, `13*32`($res)
vmovdqu64 $R2_1, `14*32`($res)
vmovdqu64 $R2_1h, `15*32`($res)
vmovdqu64 $R3_1, `16*32`($res)
vmovdqu64 $R3_1h, `17*32`($res)
vmovdqu64 $R4_1, `18*32`($res)
vmovdqu64 $R4_1h, `19*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x2_ifma256, .-rsaz_amm52x40_x2_ifma256
___
# Closes the lexical scope opened earlier in the file (outside this excerpt)
# that declared the register-name variables used above.
}
###############################################################################
# void extract_multiplier_2x40_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
# Constant-time table lookup for the 5-bit fixed-window exponentiation:
# touches every one of the 2^5 table entries and blends in the matching one
# under a mask, so memory access pattern is independent of the secret index.
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
# Emits one full sweep of the table: for each entry, compare the broadcast
# target index against $cur_idx and blend the entry into t0..t9 when equal.
# $_offset selects which of the two 40-digit planes is read; it also keys
# the loop label (.Lloop_0 / .Lloop_320) so the two sweeps don't collide.
# Precondition: %rax holds the end-of-table address (set by the caller).
sub get_table_value_consttime() {
my ($_idx,$_offset) = @_;
$code.=<<___;
vpxorq $cur_idx, $cur_idx, $cur_idx
.align 32
.Lloop_$_offset:
vpcmpq \$0, $cur_idx, $_idx, %k1 # mask of (idx == cur_idx)
___
foreach (0..9) {
$code.=<<___;
vmovdqu64 `$_offset+${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{%k1} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*40*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop_$_offset
___
}
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x40_win5
.type extract_multiplier_2x40_win5,\@abi-omnipotent
extract_multiplier_2x40_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*40*8`($red_tbl), %rax # holds end of the tbl
# backup red_tbl address
movq $red_tbl, %r10
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
___
# Propagate the zeroed t0 into the remaining nine scratch registers.
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
# First sweep: plane 0 of the selected entry, stored at out[0..39].
&get_table_value_consttime($idx1, 0);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(0+$_)*32`($out) \n";
}
# Rewind the table pointer (the sweep advanced it to the end).
$code.="movq %r10, $red_tbl \n";
# Second sweep: plane 1 (offset 40*8), possibly a different index.
&get_table_value_consttime($idx2, 40*8);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(10+$_)*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x40_win5, .-extract_multiplier_2x40_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
# Win64 structured-exception-handling unwind handler for the two AMM
# routines above: restores the saved xmm6-15 and GPRs from the known frame
# layout so the unwinder can walk past a fault inside the function body.
# The .pdata/.xdata tables at the end register it for both functions.
# On non-Windows targets only the matching #endif for the assembler-version
# guard is emitted (the whole heredoc below is emitted verbatim, including
# its trailing #endif).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x40_x1_ifma256
.rva .LSEH_end_rsaz_amm52x40_x1_ifma256
.rva .LSEH_info_rsaz_amm52x40_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x40_x2_ifma256
.rva .LSEH_end_rsaz_amm52x40_x2_ifma256
.rva .LSEH_info_rsaz_amm52x40_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x40_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x1_ifma256_body,.Lrsaz_amm52x40_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x40_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x2_ifma256_body,.Lrsaz_amm52x40_x2_ifma256_epilogue
#endif
___
# Non-Win64: just close the assembler-capability guard.
} else {
$code.="#endif";
}
# The triple braces close scopes opened earlier in the file (outside this
# excerpt).  When the build-time assembler cannot encode AVX-512 IFMA, emit
# trapping (ud2) stubs under the same three symbol names so the file still
# links; the dispatcher is expected never to call them on such builds.
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x40_x1_ifma256
.globl rsaz_amm52x40_x2_ifma256
.globl extract_multiplier_2x40_win5
.type rsaz_amm52x40_x1_ifma256,\@abi-omnipotent
rsaz_amm52x40_x1_ifma256:
rsaz_amm52x40_x2_ifma256:
extract_multiplier_2x40_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
}}}
# Expand the `...` arithmetic placeholders accumulated in $code, then write
# the finished assembly to stdout (redirected to the output file).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,628 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Locate the shared perlasm framework relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# SSE2 path is compiled in unconditionally; runtime CPUID check selects it.
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
# bn_mul_mont(rp, ap, bp, np, n0, num): i386 Montgomery multiplication.
# Returns 0 (in eax) for num < 4 so the caller falls back to generic code.
# Note $ap/$tp and $rp/$bp alias the same registers -- the input pointers
# are consumed before the temporaries are needed.
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
# SSE2 path: selected at runtime when CPUID bit 26 (SSE2) is set.  Works in
# the MMX register bank using pmuludq (32x32->64 multiply), keeping two
# 64-bit carry chains ($car0 for a*b, $car1 for the reduction by np).
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
# First pass (i=0): tp = ap*bp[0] with on-the-fly reduction.
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
# Last iteration of the first pass, then store the top two words.
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
# Outer loop (i=1..num): tp = (tp + ap*bp[i]) with reduction.
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
# Integer-only path for non-SSE2 CPUs.  The disabled if(0) arm documents a
# previous decision to bail out here instead; the else arm implements full
# Montgomery multiplication plus a dedicated squaring routine (bn_sqr_mont)
# used when ap == bp and num is even.
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
# Dispatch: if num is odd or ap != bp, do the general multiply;
# otherwise fall through to the squaring-specialized code.
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
# mull: first pass, tp = ap*bp[0] (no tp to accumulate yet).
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
# 1stmadd: outer iterations, tp += ap*bp[i].
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
# 2ndmadd: reduction pass, tp = (tp + np*m) >> 32.
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
# Dedicated squaring: exploits symmetry ap[i]*ap[j] == ap[j]*ap[i] by
# doubling cross products ($sbit carries the bit shifted out by the *2).
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
# 3rdmadd: reduction pass for the squaring path, two words per iteration.
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
# Common tail: conditionally subtract np from tp and copy the result to rp
# without a data-dependent branch (mask eax/edx selects tp vs tp-np), then
# restore the saved stack pointer and return 1 in eax.
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
# eax = borrow mask (keep difference), edx = its complement (keep tp).
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
View File

@@ -0,0 +1,534 @@
/* x86_64 BIGNUM accelerator version 0.1, December 2002.
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL project.
* SPDX-License-Identifier: Apache-2.0
*
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features own ABI which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference.
* Win64 implements different ABI for AMD64, different from Linux.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For the reference. IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
#include <openssl/bn.h>
// TODO(davidben): Get this file working on MSVC x64.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    (defined(__GNUC__) || defined(__clang__))
#include "../internal.h"
#undef mul
#undef mul_add
// "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
// mul_add: r += a * word + carry, with (carry) receiving the new high word.
// Implemented as a 64x64->128 mulq followed by two carry-propagating adds.
#define mul_add(r, a, word, carry)                                           \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc");   \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low)                                                       \
            : "cc");                                                         \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+m"(r), "+d"(high)                                            \
            : "r"(carry)                                                     \
            : "cc");                                                         \
    (carry) = high;                                                          \
  } while (0)
// mul: r = low(a * word) + carry; (carry) receives the new high word.
#define mul(r, a, word, carry)                                               \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc");   \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low)                                                       \
            : "cc");                                                         \
    (r) = (carry);                                                           \
    (carry) = high;                                                          \
  } while (0)
#undef sqr
// sqr: (r0, r1) = low and high words of a*a.
#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
// bn_mul_add_words computes rp[i] += ap[i] * w for |num| words and returns
// the final carry word. The main loop is unrolled four-wide; the tail handles
// the remaining zero to three words.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;
  if (num == 0) {
    return (c1);
  }
  // |num & ~3| is non-zero while at least four words remain.
  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  // Handle the remaining 1-3 words.
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }
  return c1;
}
// bn_mul_words computes rp[i] = ap[i] * w (overwriting, not accumulating) for
// |num| words and returns the final carry word. Unrolled four-wide like
// |bn_mul_add_words|.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  BN_ULONG c1 = 0;
  if (num == 0) {
    return c1;
  }
  // |num & ~3| is non-zero while at least four words remain.
  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  // Handle the remaining 1-3 words.
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}
// bn_sqr_words writes, for each of the |n| input words, the two-word square:
// (r[2i+1], r[2i]) = a[i] * a[i]. No cross products a[i]*a[j] are computed
// here; |r| must have room for 2*|n| words.
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  if (n == 0) {
    return;
  }
  // Four squares per iteration while at least four words remain.
  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  // Handle the remaining 1-3 words.
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0) {
      return;
    }
    sqr(r[2], r[3], a[1]);
    if (--n == 0) {
      return;
    }
    sqr(r[4], r[5], a[2]);
  }
}
// bn_add_words sets rp[i] = ap[i] + bp[i] for |n| words, chaining the carry
// across the whole array with ADCQ, and returns the final carry (0 or 1).
// LEA and DEC update the index and counter because neither clobbers CF, which
// must survive between loop iterations; the trailing SBBQ materializes CF as
// 0 or all-ones, masked to one bit on return.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;
  if (n == 0) {
    return 0;
  }
  __asm__ volatile (
      " subq %0,%0 \n" // clear carry
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " adcq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " dec %1 \n"
      " jnz 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");
  return ret & 1;
}
// bn_sub_words sets rp[i] = ap[i] - bp[i] for |n| words, chaining the borrow
// across the array with SBBQ, and returns the final borrow (0 or 1). As in
// |bn_add_words|, LEA/DEC are used because they do not clobber CF, which
// carries the borrow between iterations.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;
  if (n == 0) {
    return 0;
  }
  __asm__ volatile (
      " subq %0,%0 \n" // clear borrow
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " sbbq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " dec %1 \n"
      " jnz 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");
  return ret & 1;
}
// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0)
// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
// Keep in mind that carrying into high part of multiplication result can not
// overflow, because it cannot be all-ones.
// (c2,c1,c0) += a * b: one MULQ, then a three-word ripple add.
#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
// (c2,c1,c0) += a[i]^2.
#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
// (c2,c1,c0) += 2 * a * b: the product is computed once and added twice.
#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
// bn_mul_comba8 computes the 16-word product r = a * b for 8-word inputs with
// the Comba (column-wise) method: each stanza accumulates every partial
// product a[i]*b[j] with i + j == k into a three-word accumulator, stores the
// low word in r[k], and then rotates the accumulator roles (the cleared
// variable becomes the new top word for the next column).
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the remaining top word is column 15.
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_mul_comba4 computes the 8-word product r = a * b for 4-word inputs using
// the same column-wise Comba scheme as |bn_mul_comba8|.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the remaining top word is column 7.
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
// bn_sqr_comba8 computes the 16-word square r = a^2 of an 8-word input with
// the Comba method. Diagonal terms a[i]^2 use |sqr_add_c|; off-diagonal pairs
// a[i]*a[j] (i > j) appear twice in the square and use |sqr_add_c2|, which
// adds the product twice.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the remaining top word is column 15.
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_sqr_comba4 computes the 8-word square r = a^2 of a 4-word input, using
// the same diagonal/off-diagonal Comba scheme as |bn_sqr_comba8|.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the remaining top word is column 7.
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2
#endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,407 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <limits.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../delocate.h"
// BN_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|. It is
// sized so byte and bit counts of a |BIGNUM| always fit in |int|, with room to
// spare.
#define BN_MAX_WORDS (INT_MAX / (4 * BN_BITS2))
// BN_new heap-allocates a zero-valued |BIGNUM|, or returns NULL on allocation
// failure. |BN_FLG_MALLOCED| records that |BN_free| must also release the
// structure itself.
BIGNUM *BN_new(void) {
  BIGNUM *ret = OPENSSL_zalloc(sizeof(BIGNUM));
  if (ret != NULL) {
    ret->flags = BN_FLG_MALLOCED;
  }
  return ret;
}
// BN_secure_new behaves identically to |BN_new|.
BIGNUM *BN_secure_new(void) {
  return BN_new();
}
// BN_init zeroes every field of |bn|, leaving a valid zero-valued BIGNUM with
// no flags set and no allocated limbs.
void BN_init(BIGNUM *bn) {
  OPENSSL_memset(bn, 0, sizeof(*bn));
}
// BN_free releases |bn|'s limb array unless it is static data, and releases
// the structure itself only when it was heap-allocated by |BN_new|. For
// stack- or embedded BIGNUMs, |d| is reset so the value cannot dangle.
void BN_free(BIGNUM *bn) {
  if (bn == NULL) {
    return;
  }
  const int flags = bn->flags;
  if (!(flags & BN_FLG_STATIC_DATA)) {
    OPENSSL_free(bn->d);
  }
  if (flags & BN_FLG_MALLOCED) {
    OPENSSL_free(bn);
  } else {
    bn->d = NULL;
  }
}
// BN_clear_free is equivalent to |BN_free|; both take the same release path.
void BN_clear_free(BIGNUM *bn) {
  BN_free(bn);
}
// BN_dup returns a freshly allocated copy of |src|, NULL if |src| is NULL or
// on allocation failure. The caller owns the result.
BIGNUM *BN_dup(const BIGNUM *src) {
  if (src == NULL) {
    return NULL;
  }
  BIGNUM *copy = BN_new();
  if (copy != NULL && BN_copy(copy, src) == NULL) {
    BN_free(copy);
    copy = NULL;
  }
  return copy;
}
// BN_copy copies |src|'s value (words, width, and sign — not flags) into
// |dest| and returns |dest|, or NULL on allocation failure.
BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) {
  if (src == dest) {
    return dest;
  }
  if (!bn_wexpand(dest, src->width)) {
    return NULL;
  }
  dest->width = src->width;
  dest->neg = src->neg;
  OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->width);
  return dest;
}
// BN_clear wipes the entire allocation (all |dmax| words, not just the active
// ones) and resets the value to zero.
void BN_clear(BIGNUM *bn) {
  if (bn->d != NULL) {
    OPENSSL_memset(bn->d, 0, (size_t)bn->dmax * sizeof(bn->d[0]));
  }
  bn->width = 0;
  bn->neg = 0;
}
// BN_value_one returns a shared, read-only BIGNUM with value one. The limb
// array is static const; |BN_FLG_STATIC_DATA| prevents freeing or resizing.
DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) {
  static const BN_ULONG kOneLimbs[1] = { 1 };
  out->d = (BN_ULONG*) kOneLimbs;
  out->width = 1;
  out->dmax = 1;
  out->neg = 0;
  out->flags = BN_FLG_STATIC_DATA;
}
// BN_num_bits_word returns the minimum number of bits needed to represent the
// value in |l|. This is a branchless binary search: each stage tests whether
// the upper half of the remaining window is non-zero and, via an all-ones /
// all-zeros mask, both adds the half-width to the count and selects which half
// to continue with. Do not "simplify" this into a loop or builtin — the exact
// form is what keeps it constant-time.
unsigned BN_num_bits_word(BN_ULONG l) {
  // |BN_num_bits| is often called on RSA prime factors. These have public bit
  // lengths, but all bits beyond the high bit are secret, so count bits in
  // constant time.
  BN_ULONG x, mask;
  // A non-zero input needs at least one bit.
  int bits = (l != 0);
#if BN_BITS2 > 32
  // Look at the upper half of |x|. |x| is at most 64 bits long.
  x = l >> 32;
  // Set |mask| to all ones if |x| (the top 32 bits of |l|) is non-zero and all
  // all zeros otherwise.
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  // If |x| is non-zero, the lower half is included in the bit count in full,
  // and we count the upper half. Otherwise, we count the lower half.
  bits += 32 & mask;
  l ^= (x ^ l) & mask;  // |l| is |x| if |mask| and remains |l| otherwise.
#endif
  // The remaining blocks are analogous iterations at lower powers of two.
  x = l >> 16;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 16 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 8;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 8 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 4;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 4 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 2;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 2 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 1;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 1 & mask;
  return bits;
}
// |BN_num_bits| and |BN_num_bytes| return |int| in OpenSSL but |unsigned|
// here, so swapping OpenSSL for AWS-LC is, in theory, not type-safe. In
// practice the result always fits in an |int|: |BN_MAX_WORDS| caps a
// |BIGNUM|'s bit count well below |INT_MAX|, and if the bit count fits in an
// |int|, so do the byte and word counts.
// BN_num_bits returns the number of significant bits in |bn|: all full words
// below the top non-zero word, plus that word's bit length. Zero has zero
// bits.
unsigned BN_num_bits(const BIGNUM *bn) {
  const int top = bn_minimal_width(bn);
  return top == 0 ? 0
                  : (top - 1) * BN_BITS2 + BN_num_bits_word(bn->d[top - 1]);
}
// BN_num_bytes returns the number of bytes needed to hold |bn|, i.e. the bit
// count rounded up to a whole byte.
unsigned BN_num_bytes(const BIGNUM *bn) {
  return (BN_num_bits(bn) + 7) >> 3;
}
// BN_get_minimal_width exposes |bn_minimal_width| publicly. It returns |int|
// rather than |unsigned| because ibmtpm compares the result directly against
// signed values.
int BN_get_minimal_width(const BIGNUM *bn) {
  return bn_minimal_width(bn);
}
// BN_zero sets |bn| to the canonical zero: empty width and non-negative sign.
void BN_zero(BIGNUM *bn) {
  bn->width = 0;
  bn->neg = 0;
}
// BN_one sets |bn| to one. Returns one on success, zero on allocation
// failure.
int BN_one(BIGNUM *bn) {
  return BN_set_word(bn, 1);
}
// BN_set_word sets |bn| to the single-word value |value|. Returns one on
// success, zero on allocation failure.
int BN_set_word(BIGNUM *bn, BN_ULONG value) {
  if (value == 0) {
    // Zero is represented with width zero, not a zero word.
    BN_zero(bn);
    return 1;
  }
  if (!bn_wexpand(bn, 1)) {
    return 0;
  }
  bn->d[0] = value;
  bn->width = 1;
  bn->neg = 0;
  return 1;
}
// BN_set_u64 sets |bn| to the 64-bit value |value|. Returns one on success,
// zero on allocation failure.
int BN_set_u64(BIGNUM *bn, uint64_t value) {
#if BN_BITS2 == 64
  // One word holds the whole value.
  return BN_set_word(bn, value);
#elif BN_BITS2 == 32
  if (value <= BN_MASK2) {
    return BN_set_word(bn, (BN_ULONG)value);
  }
  if (!bn_wexpand(bn, 2)) {
    return 0;
  }
  bn->d[0] = (BN_ULONG)value;          // low 32 bits
  bn->d[1] = (BN_ULONG)(value >> 32);  // high 32 bits
  bn->width = 2;
  bn->neg = 0;
  return 1;
#else
#error "BN_BITS2 must be 32 or 64."
#endif
}
// bn_set_words replaces |bn|'s value with the |num| little-endian words at
// |words|, clearing the sign. Returns one on success, zero on failure.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
  if (!bn_wexpand(bn, num)) {
    return 0;
  }
  // memmove rather than memcpy in case |words| aliases |bn->d|.
  OPENSSL_memmove(bn->d, words, num * sizeof(BN_ULONG));
  bn->neg = 0;
  // |bn_wexpand| already verified that |num| fits in an int.
  bn->width = (int)num;
  return 1;
}
// bn_set_static_words points |bn| at a caller-owned, immutable word array.
// Any heap buffer |bn| previously owned is released, and the BIGNUM is marked
// static so it will never be freed or resized.
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
  if (!(bn->flags & BN_FLG_STATIC_DATA)) {
    OPENSSL_free(bn->d);
  }
  assert(num <= BN_MAX_WORDS);
  bn->d = (BN_ULONG *)words;
  bn->width = (int)num;
  bn->dmax = (int)num;
  bn->neg = 0;
  bn->flags |= BN_FLG_STATIC_DATA;
}
// bn_fits_in_words returns one if |bn|'s value fits in |num| words, i.e. all
// words at index |num| and above are zero. The words are OR-accumulated and
// tested once, avoiding data-dependent branches.
int bn_fits_in_words(const BIGNUM *bn, size_t num) {
  BN_ULONG acc = 0;
  for (size_t i = num; i < (size_t)bn->width; i++) {
    acc |= bn->d[i];
  }
  return acc == 0;
}
// bn_copy_words writes |bn|'s value into exactly |num| little-endian words at
// |out|, zero-padding the top. Fails on negative input or if the value does
// not fit in |num| words.
int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) {
  if (bn->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  size_t copy = (size_t)bn->width;
  if (copy > num) {
    // Truncation is only allowed when every dropped word is zero.
    if (!bn_fits_in_words(bn, num)) {
      OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
      return 0;
    }
    copy = num;
  }
  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num);
  OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * copy);
  return 1;
}
// BN_is_negative returns one if |bn| is negative and zero otherwise.
int BN_is_negative(const BIGNUM *bn) {
  return bn->neg ? 1 : 0;
}
// BN_set_negative makes |bn| negative iff |sign| is non-zero and |bn| is
// non-zero; zero is always kept non-negative.
void BN_set_negative(BIGNUM *bn, int sign) {
  bn->neg = (sign != 0 && !BN_is_zero(bn)) ? 1 : 0;
}
// bn_wexpand grows |bn|'s allocation to at least |words| words, preserving
// the value. Returns one on success; fails (with an error pushed) if |words|
// exceeds |BN_MAX_WORDS|, if |bn| is backed by static data, or on OOM.
int bn_wexpand(BIGNUM *bn, size_t words) {
  if (words <= (size_t)bn->dmax) {
    // Already large enough.
    return 1;
  }
  if (words > BN_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  if (bn->flags & BN_FLG_STATIC_DATA) {
    OPENSSL_PUT_ERROR(BN, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
    return 0;
  }
  // Allocate zeroed storage, migrate the active words, and release the old
  // buffer.
  BN_ULONG *fresh = OPENSSL_calloc(words, sizeof(BN_ULONG));
  if (fresh == NULL) {
    return 0;
  }
  OPENSSL_memcpy(fresh, bn->d, sizeof(BN_ULONG) * bn->width);
  OPENSSL_free(bn->d);
  bn->d = fresh;
  bn->dmax = (int)words;
  return 1;
}
// bn_expand grows |bn| to hold at least |bits| bits, rejecting a |bits| so
// large that rounding up to a whole word would overflow.
int bn_expand(BIGNUM *bn, size_t bits) {
  if (bits + BN_BITS2 - 1 < bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  const size_t words = (bits + BN_BITS2 - 1) / BN_BITS2;
  return bn_wexpand(bn, words);
}
// bn_resize_words sets |bn->width| to exactly |words|: zero-extending when
// growing, and truncating only when every dropped word is zero. Returns one
// on success, zero (with an error pushed) otherwise.
int bn_resize_words(BIGNUM *bn, size_t words) {
#if (defined(OPENSSL_PPC64LE) || defined(OPENSSL_PPC64BE)) && defined(__clang__) && __clang_major__ < 10
  // This is a workaround for a miscompilation bug in Clang 7.0.1 on POWER.
  // The unittests catch the miscompilation, if it occurs, and it manifests
  // as a crash in |bn_fits_in_words|.
  //
  // The bug only triggers if building in FIPS mode and with -O3. Clang 8.0.1
  // has the same bug but this workaround is not effective there---I've not
  // been able to find a workaround for 8.0.1.
  //
  // At the time of writing (2019-08-08), Clang git does *not* have this bug
  // and does not need this workaround. The current git version should go on to
  // be Clang 10 thus, once we can depend on that, this can be removed.
  // |value_barrier_w| hides the comparison from the optimizer; do not
  // "simplify" this check away.
  if (value_barrier_w((size_t)bn->width == words)) {
    return 1;
  }
#endif
  if ((size_t)bn->width <= words) {
    // Growing: zero the new high words.
    if (!bn_wexpand(bn, words)) {
      return 0;
    }
    OPENSSL_memset(bn->d + bn->width, 0,
                   (words - bn->width) * sizeof(BN_ULONG));
    bn->width = (int)words;
    return 1;
  }
  // All words beyond the new width must be zero.
  if (!bn_fits_in_words(bn, words)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  bn->width = (int)words;
  return 1;
}
// bn_select_words sets r[i] = a[i] if |mask| is all-ones and r[i] = b[i] if
// |mask| is zero, word by word in constant time.
void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
                     const BN_ULONG *b, size_t num) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  for (size_t i = 0; i < num; i++) {
    r[i] = constant_time_select_w(mask, a[i], b[i]);
  }
}
// bn_minimal_width returns |bn|'s width with high zero words stripped. Note
// the loop's exit condition depends on word values, so this is not
// constant-time.
int bn_minimal_width(const BIGNUM *bn) {
  int width = bn->width;
  while (width > 0 && bn->d[width - 1] == 0) {
    width--;
  }
  return width;
}
// bn_set_minimal_width shrinks |bn->width| to drop high zero words,
// canonicalizing zero as non-negative.
void bn_set_minimal_width(BIGNUM *bn) {
  const int width = bn_minimal_width(bn);
  bn->width = width;
  if (width == 0) {
    bn->neg = 0;
  }
}
// BN_get_flags returns the subset of |bn|'s flag bits selected by |flags|.
int BN_get_flags(const BIGNUM *bn, int flags) {
  const int selected = bn->flags & flags;
  return selected;
}
// BN_set_flags is intentionally a no-op: the requested flag bits are ignored.
// NOTE(review): presumably because flags such as |BN_FLG_CONSTTIME| are
// unnecessary here — confirm against the public bn.h documentation.
void BN_set_flags(BIGNUM *b, int n) { }

View File

@@ -0,0 +1,76 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#include <openssl/bn.h>
#include <openssl/rand.h>
#include "./internal.h"
#include <gtest/gtest.h>
// A 255-byte value whose top byte is non-zero: every length >= 255 must pass
// the assertion; shorter lengths must abort in debug builds.
TEST(BNAssertTest, Assert_fits_in_bytes_large) {
  // TODO: Update Android test harness
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[255] = {0};
  buf[0] = 0xaa;
  buf[1] = 0x01;
  buf[254] = 0x01;  // Highest byte set, so all 255 bytes are significant.
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 255; len < 260; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 247; len < 255; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}
// A value with three significant bytes: any length >= 3 must pass, and any
// shorter length must trip the debug assertion.
TEST(BNAssertTest, Assert_fits_in_bytes_small) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[8] = {0xaa, 0xbb, 0xcc};  // Remaining bytes zero-initialized.
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 3; len < 10; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 0; len < 3; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}
// Zero fits in any number of bytes, including zero bytes.
TEST(BNAssertTest, Assert_fits_in_bytes_zero) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t zeros[8] = {0};
  ASSERT_TRUE(BN_le2bn(zeros, sizeof(zeros), x.get()));
  for (size_t len = 0; len < 10; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
#endif
}
// A full 8-byte value with a non-zero top byte: exactly the buffer size is
// the boundary between passing and aborting.
TEST(BNAssertTest, Assert_fits_in_bytes_boundary) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[8];
  for (size_t i = 0; i < sizeof(buf); i++) {
    buf[i] = i * (i + 1) & 0xff;  // Every byte assigned; top byte non-zero.
  }
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 8; len < 18; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 0; len < 8; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,292 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <limits.h>
#include "internal.h"
// bn_big_endian_to_words decodes the |in_len| big-endian bytes at |in| into
// |out_len| little-endian words at |out|, zero-filling the high words.
void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
                            size_t in_len) {
  // The caller should have sized |out| to fit |in| without truncating. This
  // condition ensures we do not overflow |out|, so use a runtime check.
  BSSL_CHECK(in_len <= out_len * sizeof(BN_ULONG));
  size_t filled = 0;
  // Peel whole words off the little end of the big-endian input.
  while (in_len >= sizeof(BN_ULONG)) {
    in_len -= sizeof(BN_ULONG);
    out[filled++] = CRYPTO_load_word_be(in + in_len);
  }
  // Assemble the final, partial word byte by byte.
  if (in_len != 0) {
    BN_ULONG word = 0;
    for (size_t i = 0; i < in_len; i++) {
      word = (word << 8) | in[i];
    }
    out[filled++] = word;
  }
  // Zero the remaining high words.
  OPENSSL_memset(out + filled, 0, (out_len - filled) * sizeof(BN_ULONG));
}
// BN_bin2bn decodes |len| big-endian bytes from |in| into |ret|, allocating a
// fresh BIGNUM when |ret| is NULL. The result is non-negative. Returns the
// BIGNUM or NULL on allocation failure.
BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *fresh = NULL;
  if (ret == NULL) {
    fresh = BN_new();
    if (fresh == NULL) {
      return NULL;
    }
    ret = fresh;
  }
  if (len == 0) {
    ret->width = 0;
    return ret;
  }
  // Round the byte count up to whole words; this form cannot overflow.
  size_t num_words = ((len - 1) / BN_BYTES) + 1;
  if (!bn_wexpand(ret, num_words)) {
    BN_free(fresh);
    return NULL;
  }
  // |bn_wexpand| bounds |num_words|, so the cast below is safe.
  assert(num_words <= INT_MAX);
  ret->width = (int)num_words;
  ret->neg = 0;
  bn_big_endian_to_words(ret->d, ret->width, in, len);
  return ret;
}
// BN_le2bn decodes |len| little-endian bytes from |in| into |ret|, allocating
// a fresh BIGNUM when |ret| is NULL. The result is always non-negative.
// Returns the BIGNUM or NULL on allocation failure.
BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *bn = NULL;
  if (ret == NULL) {
    bn = BN_new();
    if (bn == NULL) {
      return NULL;
    }
    ret = bn;
  }
  if (len == 0) {
    ret->width = 0;
    ret->neg = 0;
    return ret;
  }
  // Reserve enough space in |ret|.
  size_t num_words = ((len - 1) / BN_BYTES) + 1;
  if (!bn_wexpand(ret, num_words)) {
    BN_free(bn);
    return NULL;
  }
  ret->width = (int)num_words;
  // The decoded value is non-negative. Previously only the |len == 0| path
  // cleared |neg|, so decoding into a reused, negative |BIGNUM| kept the
  // stale sign bit; clear it unconditionally, matching |BN_bin2bn|.
  ret->neg = 0;
  bn_little_endian_to_words(ret->d, ret->width, in, len);
  return ret;
}
// bn_little_endian_to_words decodes the |in_len| little-endian bytes at |in|
// into |out_len| little-endian words at |out|, zero-filling the high words.
// The caller must size |out| so the input is not truncated.
void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, const size_t in_len) {
  assert(out_len > 0);
  // Mirror the truncation check in |bn_big_endian_to_words|: without it, an
  // oversized |in_len| makes the little-endian |OPENSSL_memcpy| below write
  // past the end of |out| (the big-endian branch only caught this via the
  // trailing assert).
  assert(in_len <= out_len * sizeof(BN_ULONG));
#ifdef OPENSSL_BIG_ENDIAN
  size_t in_index = 0;
  for (size_t i = 0; i < out_len; i++) {
    if ((in_len-in_index) < sizeof(BN_ULONG)) {
      // Load the last partial word.
      BN_ULONG word = 0;
      // size_t is unsigned, so j >= 0 is always true.
      for (size_t j = in_len-1; j >= in_index && j < in_len; j--) {
        word = (word << 8) | in[j];
      }
      in_index = in_len;
      out[i] = word;
      // Fill the remainder with zeros.
      OPENSSL_memset(out + i + 1, 0, (out_len - i - 1) * sizeof(BN_ULONG));
      break;
    }
    out[i] = CRYPTO_load_word_le(in + in_index);
    in_index += sizeof(BN_ULONG);
  }
  // The caller should have sized the output to avoid truncation.
  assert(in_index == in_len);
#else
  // Little-endian host: the byte layout already matches.
  OPENSSL_memcpy(out, in, in_len);
  // Fill the remainder with zeros.
  OPENSSL_memset( ((uint8_t*)out) + in_len, 0, sizeof(BN_ULONG)*out_len - in_len);
#endif
}
// fits_in_bytes returns one if the |num_words| words in |words| can be
// represented in |num_bytes| bytes. All excess bytes are OR-accumulated into
// |mask| and tested once, so there are no data-dependent branches.
static int fits_in_bytes(const BN_ULONG *words, size_t num_words,
                         size_t num_bytes) {
  uint8_t mask = 0;
#ifdef OPENSSL_BIG_ENDIAN
  // Start at the word containing byte |num_bytes|; within that word, only the
  // bytes at or above the limit count.
  for (size_t i = num_bytes / BN_BYTES; i < num_words; i++) {
    BN_ULONG word = words[i];
    for (size_t j = 0; j < BN_BYTES; j++) {
      if ((i * BN_BYTES) + j < num_bytes) {
        // For the first word we don't need to check any bytes shorter than len
        continue ;
      } else {
        mask |= (word >> (j * 8)) & 0xff;
      }
    }
  }
#else
  // Little-endian host: the in-memory byte order is the little-endian byte
  // string, so scan raw bytes past the limit directly.
  const uint8_t *bytes = (const uint8_t *)words;
  size_t tot_bytes = num_words * sizeof(BN_ULONG);
  for (size_t i = num_bytes; i < tot_bytes; i++) {
    mask |= bytes[i];
  }
#endif
  return mask == 0;
}
// Asserts that the BIGNUM can be represented within |num| bytes.
// The logic is consistent with `fits_in_bytes` but assertions will fail when
// false. The excess bytes are declassified first so constant-time
// instrumentation permits inspecting them.
void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num) {
  const uint8_t *bytes = (const uint8_t *)bn->d;
  size_t tot_bytes = bn->width * sizeof(BN_ULONG);
  if (tot_bytes > num) {
    CONSTTIME_DECLASSIFY(bytes + num, tot_bytes - num);
    // Avoids compiler error: unused variable 'byte' or 'word'
    // The assert statements below are only effective in DEBUG builds
#ifndef NDEBUG
#ifdef OPENSSL_BIG_ENDIAN
    // Mirror the big-endian byte walk in |fits_in_bytes|, asserting each
    // excess byte individually.
    for (int i = num / BN_BYTES; i < bn->width; i++) {
      BN_ULONG word = bn->d[i];
      for (size_t j = 0; j < BN_BYTES; j++) {
        if ((i * BN_BYTES) + j < num) {
          // For the first word we don't need to check any bytes shorter than len
          continue;
        } else {
          uint8_t byte = (word >> (j * 8)) & 0xff;
          assert(byte == 0);
        }
      }
    }
#else
    // Little-endian host: excess bytes are contiguous at the end.
    for (size_t i = num; i < tot_bytes; i++) {
      assert(bytes[i] == 0);
    }
#endif
#endif
    (void)bytes;
  }
}
// bn_words_to_big_endian encodes the |in_len| little-endian words at |in| as
// an |out_len|-byte big-endian byte string at |out|, zero-padding the front.
// The caller must have chosen |out_len| large enough to avoid truncation.
void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
                            size_t in_len) {
  // The caller should have selected an output length without truncation.
  declassify_assert(fits_in_bytes(in, in_len, out_len));
  size_t num_bytes = in_len * sizeof(BN_ULONG);
  if (out_len < num_bytes) {
    num_bytes = out_len;
  }
#ifdef OPENSSL_BIG_ENDIAN
  // Extract byte |i| (counting from the little end of the value) and place it
  // |i| bytes from the end of the output.
  for (size_t i = 0; i < num_bytes; i++) {
    BN_ULONG l = in[i / BN_BYTES];
    out[out_len - i - 1] = (uint8_t)(l >> (8 * (i % BN_BYTES))) & 0xff;
  }
#else
  // Little-endian host: the value's bytes are already in little-endian order
  // in memory, so reverse them into |out|.
  const uint8_t *bytes = (const uint8_t *)in;
  for (size_t i = 0; i < num_bytes; i++) {
    out[out_len - i - 1] = bytes[i];
  }
#endif
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out, 0, out_len - num_bytes);
}
// BN_bn2bin writes |in| as a minimal-length big-endian byte string to |out|
// and returns the number of bytes written.
size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
  const size_t num_bytes = BN_num_bytes(in);
  bn_words_to_big_endian(out, num_bytes, in->d, in->width);
  return num_bytes;
}
// bn_words_to_little_endian encodes the |in_len| little-endian words at |in|
// as an |out_len|-byte little-endian byte string at |out|, zero-padding the
// end. The caller must have chosen |out_len| large enough to avoid
// truncation.
void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len) {
  // The caller should have selected an output length without truncation.
  assert(fits_in_bytes(in, in_len, out_len));
  size_t num_bytes = in_len * sizeof(BN_ULONG);
  if (out_len < num_bytes) {
    num_bytes = out_len;
  }
#ifdef OPENSSL_BIG_ENDIAN
  // Emit each word least-significant byte first.
  size_t byte_idx = 0;
  for (size_t word_idx = 0; word_idx < in_len; word_idx++) {
    BN_ULONG l = in[word_idx];
    for(size_t j = 0; j < BN_BYTES && byte_idx < num_bytes; j++) {
      out[byte_idx] = (uint8_t)(l & 0xff);
      l >>= 8;
      byte_idx++;
    }
  }
#else
  // Little-endian host: the in-memory layout is already the wire format.
  const uint8_t *bytes = (const uint8_t *)in;
  OPENSSL_memcpy(out, bytes, num_bytes);
#endif
  // Fill the remainder with zeros.
  OPENSSL_memset(out + num_bytes, 0, out_len - num_bytes);
}
// BN_bn2le_padded writes |in| as exactly |len| little-endian bytes,
// zero-padded. Returns one on success, or zero (writing nothing) if the value
// needs more than |len| bytes.
int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
  if (fits_in_bytes(in->d, in->width, len)) {
    bn_words_to_little_endian(out, len, in->d, in->width);
    return 1;
  }
  return 0;
}
// BN_bn2bin_padded writes |in| as exactly |len| big-endian bytes,
// zero-padded. Returns one on success, or zero (writing nothing) if the value
// needs more than |len| bytes.
int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
  if (fits_in_bytes(in->d, in->width, len)) {
    bn_words_to_big_endian(out, len, in->d, in->width);
    return 1;
  }
  return 0;
}
// BN_get_word returns |bn|'s value as a single word: zero for zero, the word
// itself when it fits, and BN_MASK2 (all-ones) when it does not.
BN_ULONG BN_get_word(const BIGNUM *bn) {
  const int width = bn_minimal_width(bn);
  if (width == 0) {
    return 0;
  }
  if (width == 1) {
    return bn->d[0];
  }
  return BN_MASK2;  // Does not fit in one word; saturate.
}
// BN_get_u64 writes |bn|'s magnitude to |*out| and returns one if it fits in
// 64 bits, and returns zero otherwise.
int BN_get_u64(const BIGNUM *bn, uint64_t *out) {
  const int width = bn_minimal_width(bn);
  if (width == 0) {
    *out = 0;
    return 1;
  }
  if (width == 1) {
    *out = bn->d[0];
    return 1;
  }
#if defined(OPENSSL_32_BIT)
  // Two 32-bit words still fit in a uint64_t.
  if (width == 2) {
    *out = (uint64_t)bn->d[0] | ((uint64_t)bn->d[1] << 32);
    return 1;
  }
#endif
  return 0;
}

View File

@@ -0,0 +1,147 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/mem.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// bn_cmp_words_consttime returns -1, 0, or 1 as (a, a_len) is less than,
// equal to, or greater than (b, b_len). The word values are processed in
// constant time; only the lengths are treated as public.
static int bn_cmp_words_consttime(const BN_ULONG *a, size_t a_len,
                                  const BN_ULONG *b, size_t b_len) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  int ret = 0;
  // Process the common words in little-endian order.
  // Later (more significant) words overwrite the verdict, so the final value
  // reflects the most significant differing word.
  size_t min = a_len < b_len ? a_len : b_len;
  for (size_t i = 0; i < min; i++) {
    crypto_word_t eq = constant_time_eq_w(a[i], b[i]);
    crypto_word_t lt = constant_time_lt_w(a[i], b[i]);
    ret =
        constant_time_select_int(eq, ret, constant_time_select_int(lt, -1, 1));
  }
  // If |a| or |b| has non-zero words beyond |min|, they take precedence.
  if (a_len < b_len) {
    crypto_word_t mask = 0;
    for (size_t i = a_len; i < b_len; i++) {
      mask |= b[i];
    }
    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, -1);
  } else if (b_len < a_len) {
    crypto_word_t mask = 0;
    for (size_t i = b_len; i < a_len; i++) {
      mask |= a[i];
    }
    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1);
  }
  return ret;
}
// BN_ucmp compares the magnitudes of |a| and |b|, ignoring signs, returning
// -1, 0, or 1. Word values are compared in constant time.
int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
  return bn_cmp_words_consttime(a->d, a->width, b->d, b->width);
}
// BN_cmp compares |a| and |b| as signed values, returning -1, 0, or 1. A
// non-NULL value sorts before NULL; two NULLs compare equal.
int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
  if (a == NULL || b == NULL) {
    if (a == b) {
      return 0;  // Both NULL.
    }
    return a != NULL ? -1 : 1;
  }
  // We do not attempt to process the sign bit in constant time. Negative
  // |BIGNUM|s should never occur in crypto, only calculators.
  if (a->neg != b->neg) {
    return a->neg ? -1 : 1;
  }
  const int ucmp = BN_ucmp(a, b);
  // For negative values, the larger magnitude is the smaller number.
  return a->neg ? -ucmp : ucmp;
}
// bn_less_than_words returns one if (a, len) < (b, len), comparing word
// values in constant time.
int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) {
  const int cmp = bn_cmp_words_consttime(a, len, b, len);
  return cmp < 0;
}
// BN_abs_is_word returns one if |bn|'s magnitude equals the word |w|. The
// difference for word zero and all higher words is folded into a single
// accumulator and tested once, avoiding data-dependent branches.
int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
  if (bn->width == 0) {
    return w == 0;
  }
  BN_ULONG diff = bn->d[0] ^ w;
  for (int i = bn->width - 1; i >= 1; i--) {
    diff |= bn->d[i];
  }
  return diff == 0;
}
// BN_cmp_word compares |a| against the single word |b| by wrapping |b| in a
// stack-allocated, static-data BIGNUM and delegating to |BN_cmp|.
int BN_cmp_word(const BIGNUM *a, BN_ULONG b) {
  BIGNUM wrapped;
  BN_init(&wrapped);
  wrapped.d = &b;
  wrapped.width = (b != 0) ? 1 : 0;  // Zero is width zero.
  wrapped.dmax = 1;
  wrapped.flags = BN_FLG_STATIC_DATA;
  return BN_cmp(a, &wrapped);
}
// BN_is_zero returns one if |bn| is zero, i.e. it has no non-zero words.
int BN_is_zero(const BIGNUM *bn) {
  return bn_fits_in_words(bn, 0);
}
// BN_is_one returns one if |bn| equals positive one.
int BN_is_one(const BIGNUM *bn) {
  return !bn->neg && BN_abs_is_word(bn, 1);
}
// BN_is_word returns one if |bn| equals the non-negative word |w|. The sign
// only matters for non-zero |w|, since zero is never negative.
int BN_is_word(const BIGNUM *bn, BN_ULONG w) {
  if (!BN_abs_is_word(bn, w)) {
    return 0;
  }
  return w == 0 || bn->neg == 0;
}
// BN_is_odd returns one if |bn| is odd. Zero (width zero) is even.
int BN_is_odd(const BIGNUM *bn) {
  if (bn->width == 0) {
    return 0;
  }
  return (int)(bn->d[0] & 1);
}
int BN_is_pow2(const BIGNUM *bn) {
int width = bn_minimal_width(bn);
if (width == 0 || bn->neg) {
return 0;
}
for (int i = 0; i < width - 1; i++) {
if (bn->d[i] != 0) {
return 0;
}
}
return 0 == (bn->d[width-1] & (bn->d[width-1] - 1));
}
// BN_equal_consttime returns one if |a| and |b| are equal (including sign),
// accumulating all differences into a single mask so the comparison takes the
// same time regardless of where the values differ.
int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) {
  BN_ULONG mask = 0;
  // If |a| or |b| has more words than the other, all those words must be zero.
  for (int i = a->width; i < b->width; i++) {
    mask |= b->d[i];
  }
  for (int i = b->width; i < a->width; i++) {
    mask |= a->d[i];
  }
  // Common words must match.
  int min = a->width < b->width ? a->width : b->width;
  for (int i = 0; i < min; i++) {
    mask |= (a->d[i] ^ b->d[i]);
  }
  // The sign bit must match.
  mask |= (a->neg ^ b->neg);
  return mask == 0;
}

View File

@@ -0,0 +1,182 @@
// Written by Ulf Moeller for the OpenSSL project.
// Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "../../internal.h"
// Initial capacity used when the resizable stack of frames first expands.
#define BN_CTX_START_FRAMES 32

// BN_STACK

// A |BN_STACK| is a stack of |size_t| values.
typedef struct {
  // Array of indexes into |ctx->bignums|.
  size_t *indexes;
  // Number of stack frames, and the size of the allocated array
  size_t depth, size;
} BN_STACK;

static void BN_STACK_init(BN_STACK *);
static void BN_STACK_cleanup(BN_STACK *);
static int BN_STACK_push(BN_STACK *, size_t idx);
static size_t BN_STACK_pop(BN_STACK *);

// BN_CTX

DEFINE_STACK_OF(BIGNUM)

// The opaque BN_CTX type
struct bignum_ctx {
  // bignums is the stack of |BIGNUM|s managed by this |BN_CTX|.
  STACK_OF(BIGNUM) *bignums;
  // stack is the stack of |BN_CTX_start| frames. It is the value of |used| at
  // the time |BN_CTX_start| was called.
  BN_STACK stack;
  // used is the number of |BIGNUM|s from |bignums| that have been used.
  size_t used;
  // error is one if any operation on this |BN_CTX| failed. All subsequent
  // operations will fail.
  char error;
  // defer_error is one if an operation on this |BN_CTX| has failed, but no
  // error has been pushed to the queue yet. This is used to defer errors from
  // |BN_CTX_start| to |BN_CTX_get|.
  char defer_error;
};
// BN_CTX_new returns a fresh, empty |BN_CTX|, or NULL on allocation failure.
BN_CTX *BN_CTX_new(void) {
  // Zero-allocation leaves |bignums|, |used|, and the error flags cleared.
  BN_CTX *ctx = OPENSSL_zalloc(sizeof(BN_CTX));
  if (ctx == NULL) {
    return NULL;
  }
  BN_STACK_init(&ctx->stack);
  return ctx;
}
// There is no separate secure heap here, so a "secure" context is simply an
// ordinary one.
BN_CTX *BN_CTX_secure_new(void) {
  return BN_CTX_new();
}
// BN_CTX_free releases |ctx| and every |BIGNUM| it manages. A NULL |ctx| is a
// no-op.
void BN_CTX_free(BN_CTX *ctx) {
  if (ctx == NULL) {
    return;
  }
  // All |BN_CTX_start| calls must be matched with |BN_CTX_end|, otherwise the
  // function may use more memory than expected, potentially without bound if
  // done in a loop. Unless an operation failed, every borrowed |BIGNUM| must
  // have been returned by now.
  assert(ctx->used == 0 || ctx->error);
  sk_BIGNUM_pop_free(ctx->bignums, BN_free);
  BN_STACK_cleanup(&ctx->stack);
  OPENSSL_free(ctx);
}
// BN_CTX_start opens a new frame in |ctx|. Each call must be matched with
// |BN_CTX_end|. Failures are reported from the next |BN_CTX_get|.
void BN_CTX_start(BN_CTX *ctx) {
  // Once an operation has failed, |ctx->stack| no longer matches the number
  // of |BN_CTX_end| calls to come, so leave it untouched.
  if (ctx->error) {
    return;
  }
  if (BN_STACK_push(&ctx->stack, ctx->used)) {
    return;
  }
  // |BN_CTX_start| has no return value, so record the failure here and defer
  // reporting to |BN_CTX_get|.
  ctx->error = 1;
  ctx->defer_error = 1;
}
// BN_CTX_get returns a zeroed |BIGNUM| scoped to the current |BN_CTX_start|
// frame, or NULL on error. The |BIGNUM| is owned by |ctx| and is reclaimed by
// the matching |BN_CTX_end|; the caller must not free it.
BIGNUM *BN_CTX_get(BN_CTX *ctx) {
  // Once any operation has failed, they all do.
  if (ctx->error) {
    if (ctx->defer_error) {
      // Report the error deferred from an earlier |BN_CTX_start|, exactly
      // once.
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
      ctx->defer_error = 0;
    }
    return NULL;
  }
  // Lazily allocate the backing stack on first use.
  if (ctx->bignums == NULL) {
    ctx->bignums = sk_BIGNUM_new_null();
    if (ctx->bignums == NULL) {
      ctx->error = 1;
      return NULL;
    }
  }
  // If every existing |BIGNUM| is in use, grow the pool by one.
  if (ctx->used == sk_BIGNUM_num(ctx->bignums)) {
    BIGNUM *bn = BN_new();
    if (bn == NULL || !sk_BIGNUM_push(ctx->bignums, bn)) {
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
      BN_free(bn);
      ctx->error = 1;
      return NULL;
    }
  }
  BIGNUM *ret = sk_BIGNUM_value(ctx->bignums, ctx->used);
  BN_zero(ret);
  // This is bounded by |sk_BIGNUM_num|, so it cannot overflow.
  ctx->used++;
  return ret;
}
// BN_CTX_end closes the current frame, returning every |BIGNUM| handed out
// since the matching |BN_CTX_start| to the pool.
void BN_CTX_end(BN_CTX *ctx) {
  // Once an operation has failed, |ctx->stack| no longer matches the number
  // of |BN_CTX_end| calls to come, so do nothing.
  if (!ctx->error) {
    ctx->used = BN_STACK_pop(&ctx->stack);
  }
}
// BN_STACK
// BN_STACK_init makes |st| an empty stack with no allocation; the index array
// is created on demand by |BN_STACK_push|.
static void BN_STACK_init(BN_STACK *st) {
  st->indexes = NULL;
  st->depth = 0;
  st->size = 0;
}
// BN_STACK_cleanup releases |st|'s storage. Only the index array is
// heap-allocated.
static void BN_STACK_cleanup(BN_STACK *st) {
  OPENSSL_free(st->indexes);
}
// BN_STACK_push appends |idx| to |st|, growing the array if needed. It returns
// one on success and zero on allocation failure or overflow.
static int BN_STACK_push(BN_STACK *st, size_t idx) {
  if (st->depth == st->size) {
    // This function intentionally does not push to the error queue on error.
    // Error-reporting is deferred to |BN_CTX_get|.
    size_t new_size;
    if (st->size == 0) {
      new_size = BN_CTX_START_FRAMES;
    } else {
      new_size = st->size * 3 / 2;  // grow by 1.5x
    }
    // Reject overflow of either the growth factor or the byte count.
    if (new_size <= st->size || new_size > SIZE_MAX / sizeof(size_t)) {
      return 0;
    }
    size_t *grown = OPENSSL_realloc(st->indexes, new_size * sizeof(size_t));
    if (grown == NULL) {
      return 0;
    }
    st->indexes = grown;
    st->size = new_size;
  }
  st->indexes[st->depth] = idx;
  st->depth++;
  return 1;
}
// BN_STACK_pop removes and returns the top of |st|. The caller must not pop
// an empty stack.
static size_t BN_STACK_pop(BN_STACK *st) {
  assert(st->depth > 0);
  size_t top = st->indexes[st->depth - 1];
  st->depth -= 1;
  return top;
}

View File

@@ -0,0 +1,856 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <limits.h>
#include <openssl/err.h>
#include "internal.h"
// bn_div_words divides a double-width |h|,|l| by |d| and returns the result,
// which must fit in a |BN_ULONG|.
// bn_div_words divides the double-width value |h|:|l| by |d| and returns the
// quotient, which must fit in a |BN_ULONG|. It works in half-word steps so it
// needs no wider integer type. Used only when |BN_ULLONG| division is
// unavailable.
OPENSSL_UNUSED static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l,
                                            BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;
  if (d == 0) {
    // Division by zero; saturate rather than trap.
    return BN_MASK2;
  }
  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }
  // Normalize so |d|'s top bit is set, shifting |h|:|l| to match.
  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  // Split the divisor into half-words for the estimate-and-correct loop.
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  // Two iterations produce the high and low half-words of the quotient.
  for (;;) {
    // Estimate the next quotient half-word from the top half-words.
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }
    th = q * dh;
    tl = dl * q;
    // Correct the estimate downward until q * d no longer exceeds the
    // remaining numerator.
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    // Subtract q * d from |h|:|l|, propagating borrows.
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;
    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;
    if (--count == 0) {
      break;
    }
    // Shift the remainder up a half-word for the second iteration.
    ret = q << BN_BITS4;
    h = (h << BN_BITS4) | (l >> BN_BITS4);
    l = (l & BN_MASK2l) << BN_BITS4;
  }
  ret |= q;
  return ret;
}
// bn_div_rem_words divides the double-width value |n0|:|n1| by |d0|, writing
// the quotient to |*quotient_out| and the remainder to |*rem_out|. The
// quotient must fit in a |BN_ULONG|, i.e. the caller must ensure |n0| < |d0|.
static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
                                    BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
  // GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when
  // the |BN_ULLONG|-based C code is used.
  //
  // GCC bugs:
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668
  //
  // Clang bugs:
  //   * https://llvm.org/bugs/show_bug.cgi?id=6397
  //   * https://llvm.org/bugs/show_bug.cgi?id=12418
  //
  // These issues aren't specific to x86 and x86_64, so it might be worthwhile
  // to add more assembly language implementations.
#if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86)
  // x86's div takes the high word in edx and the low word in eax.
  __asm__ volatile("divl %4"
                   : "=a"(*quotient_out), "=d"(*rem_out)
                   : "a"(n1), "d"(n0), "rm"(d0)
                   : "cc");
#elif defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86_64)
  __asm__ volatile("divq %4"
                   : "=a"(*quotient_out), "=d"(*rem_out)
                   : "a"(n1), "d"(n0), "rm"(d0)
                   : "cc");
#else
#if defined(BN_CAN_DIVIDE_ULLONG)
  // A double-width integer type is available; let the compiler divide.
  BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
  *quotient_out = (BN_ULONG)(n / d0);
#else
  *quotient_out = bn_div_words(n0, n1, d0);
#endif
  // Derive the remainder from the quotient rather than computing it twice.
  *rem_out = n1 - (*quotient_out * d0);
#endif
}
// BN_div computes "quotient := numerator / divisor", rounding towards zero,
// and sets up |rem| such that "quotient * divisor + rem = numerator" holds.
//
// Thus:
//
// quotient->neg == numerator->neg ^ divisor->neg
// (unless the result is zero)
// rem->neg == numerator->neg
// (unless the remainder is zero)
//
// If |quotient| or |rem| is NULL, the respective value is not returned.
//
// This was specifically designed to contain fewer branches that may leak
// sensitive information; see "New Branch Prediction Vulnerabilities in OpenSSL
// and Necessary Software Countermeasures" by Onur Acıçmez, Shay Gueron, and
// Jean-Pierre Seifert.
int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
           const BIGNUM *divisor, BN_CTX *ctx) {
  int norm_shift, loop;
  BIGNUM wnum;
  BN_ULONG *resp, *wnump;
  BN_ULONG d0, d1;
  int num_n, div_n;
  // This function relies on the historical minimal-width |BIGNUM| invariant.
  // It is already not constant-time (constant-time reductions should use
  // Montgomery logic), so we shrink all inputs and intermediate values to
  // retain the previous behavior.
  // Invalid zero-padding would have particularly bad consequences.
  int numerator_width = bn_minimal_width(numerator);
  int divisor_width = bn_minimal_width(divisor);
  if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) ||
      (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED);
    return 0;
  }
  if (BN_is_zero(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  BN_CTX_start(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  BIGNUM *snum = BN_CTX_get(ctx);
  BIGNUM *sdiv = BN_CTX_get(ctx);
  BIGNUM *res = NULL;
  // If the caller does not want the quotient, compute it into a scratch value.
  if (quotient == NULL) {
    res = BN_CTX_get(ctx);
  } else {
    res = quotient;
  }
  // |sdiv| was the last |BN_CTX_get|, so checking it covers |tmp| and |snum|.
  if (sdiv == NULL || res == NULL) {
    goto err;
  }
  // First we normalise the numbers: shift the divisor so its top bit is set,
  // and shift the numerator by the same amount (plus a whole extra word).
  norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2);
  if (!BN_lshift(sdiv, divisor, norm_shift)) {
    goto err;
  }
  bn_set_minimal_width(sdiv);
  sdiv->neg = 0;
  norm_shift += BN_BITS2;
  if (!BN_lshift(snum, numerator, norm_shift)) {
    goto err;
  }
  bn_set_minimal_width(snum);
  snum->neg = 0;
  // Since we don't want to have special-case logic for the case where snum is
  // larger than sdiv, we pad snum with enough zeroes without changing its
  // value.
  if (snum->width <= sdiv->width + 1) {
    if (!bn_wexpand(snum, sdiv->width + 2)) {
      goto err;
    }
    for (int i = snum->width; i < sdiv->width + 2; i++) {
      snum->d[i] = 0;
    }
    snum->width = sdiv->width + 2;
  } else {
    if (!bn_wexpand(snum, snum->width + 1)) {
      goto err;
    }
    snum->d[snum->width] = 0;
    snum->width++;
  }
  div_n = sdiv->width;
  num_n = snum->width;
  loop = num_n - div_n;
  // Let's set up a 'window' into snum.
  // This is the part that corresponds to the current
  // 'area' being divided.
  wnum.neg = 0;
  wnum.d = &(snum->d[loop]);
  wnum.width = div_n;
  // only needed when BN_ucmp messes up the values between width and max
  wnum.dmax = snum->dmax - loop;  // so we don't step out of bounds
  // Get the top 2 words of sdiv; |d0| drives the quotient estimate and |d1|
  // refines it.
  d0 = sdiv->d[div_n - 1];
  d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
  // pointer to the 'top' of snum
  wnump = &(snum->d[num_n - 1]);
  // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg|
  // for later.
  const int numerator_neg = numerator->neg;
  res->neg = (numerator_neg ^ divisor->neg);
  if (!bn_wexpand(res, loop + 1)) {
    goto err;
  }
  // The quotient has at most |loop - 1| words; the loop below emits one word
  // per iteration, most-significant first.
  res->width = loop - 1;
  resp = &(res->d[loop - 1]);
  // space for temp
  if (!bn_wexpand(tmp, div_n + 1)) {
    goto err;
  }
  // if res->width == 0 then clear the neg value otherwise decrease
  // the resp pointer
  if (res->width == 0) {
    res->neg = 0;
  } else {
    resp--;
  }
  for (int i = 0; i < loop - 1; i++, wnump--, resp--) {
    BN_ULONG q, l0;
    // the first part of the loop uses the top two words of snum and sdiv to
    // calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv
    BN_ULONG n0, n1, rm = 0;
    n0 = wnump[0];
    n1 = wnump[-1];
    if (n0 == d0) {
      q = BN_MASK2;
    } else {
      // n0 < d0
      bn_div_rem_words(&q, &rm, n0, n1, d0);
#ifdef BN_ULLONG
      // Refine the estimate using the divisor's second word; |q| may be at
      // most two too large here.
      BN_ULLONG t2 = (BN_ULLONG)d1 * q;
      for (;;) {
        if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) {
          break;
        }
        q--;
        rm += d0;
        if (rm < d0) {
          break;  // don't let rm overflow
        }
        t2 -= d1;
      }
#else  // !BN_ULLONG
      // Same refinement without a double-width type: track d1*q as a
      // high/low word pair.
      BN_ULONG t2l, t2h;
      BN_UMULT_LOHI(t2l, t2h, d1, q);
      for (;;) {
        if (t2h < rm ||
            (t2h == rm && t2l <= wnump[-2])) {
          break;
        }
        q--;
        rm += d0;
        if (rm < d0) {
          break;  // don't let rm overflow
        }
        if (t2l < d1) {
          t2h--;
        }
        t2l -= d1;
      }
#endif  // !BN_ULLONG
    }
    // Subtract q * sdiv from the current window of snum.
    l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
    tmp->d[div_n] = l0;
    wnum.d--;
    // ignore top values of the bignums just sub the two
    // BN_ULONG arrays with bn_sub_words
    if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
      // Note: As we have considered only the leading
      // two BN_ULONGs in the calculation of q, sdiv * q
      // might be greater than wnum (but then (q-1) * sdiv
      // is less or equal than wnum)
      q--;
      if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) {
        // we can't have an overflow here (assuming
        // that q != 0, but if q == 0 then tmp is
        // zero anyway)
        (*wnump)++;
      }
    }
    // store part of the result
    *resp = q;
  }
  // What remains of |snum| is the (shifted) remainder.
  bn_set_minimal_width(snum);
  if (rem != NULL) {
    // Undo the normalization shift to recover the true remainder.
    if (!BN_rshift(rem, snum, norm_shift)) {
      goto err;
    }
    if (!BN_is_zero(rem)) {
      rem->neg = numerator_neg;
    }
  }
  bn_set_minimal_width(res);
  BN_CTX_end(ctx);
  return 1;
err:
  BN_CTX_end(ctx);
  return 0;
}
// BN_nnmod computes |m| mod |d| like |BN_mod|, but canonicalizes the result
// into the range [0, |d|).
int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) {
  if (!BN_mod(r, m, d, ctx)) {
    return 0;
  }
  if (r->neg) {
    // Here -|d| < r < 0, so bring it into range by adding |d|'s absolute
    // value: |BN_sub| when d is negative, |BN_add| otherwise.
    return (d->neg ? BN_sub : BN_add)(r, r, d);
  }
  return 1;
}
// bn_reduce_once computes |r| = |a| mod |m| in constant time, given
// 0 <= |a| + |carry|*2^(num*BN_BITS2) < 2*|m|. |r| and |a| must not alias.
// The return value is all-ones if the subtraction was not needed (i.e. |a|
// was already reduced) and zero otherwise.
BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry,
                        const BN_ULONG *m, size_t num) {
  assert(r != a);
  // |r| = |a| - |m|. |bn_sub_words| performs the bulk of the subtraction, and
  // then we apply the borrow to |carry|.
  carry -= bn_sub_words(r, a, m, num);
  // We know 0 <= |a| < 2*|m|, so -|m| <= |r| < |m|.
  //
  // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then
  // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to
  // return |r| + |m|, or |a|. |carry| must then be -1 or all ones. In both
  // cases, |carry| is a suitable input to |bn_select_words|.
  //
  // Although |carry| may be one if it was one on input and |bn_sub_words|
  // returns zero, this would give |r| > |m|, violating our input assumptions.
  declassify_assert(carry + 1 <= 1);
  bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num);
  return carry;
}
// bn_reduce_once_in_place behaves like |bn_reduce_once|, but reduces |r| in
// place, using |tmp| as scratch space. |tmp| must not alias |r| or |m|.
BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m,
                                 BN_ULONG *tmp, size_t num) {
  // See |bn_reduce_once| for why this logic works.
  carry -= bn_sub_words(tmp, r, m, num);
  declassify_assert(carry + 1 <= 1);
  bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num);
  return carry;
}
// bn_mod_sub_words computes |r| = |a| - |b| mod |m| in constant time, given
// reduced inputs 0 <= |a|, |b| < |m|. |tmp| is scratch space of |num| words.
void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
  // r = a - b
  BN_ULONG borrow = bn_sub_words(r, a, b, num);
  // tmp = a - b + m
  bn_add_words(tmp, r, m, num);
  // If the subtraction borrowed, the true result is the wrapped-around value
  // plus |m|; select in constant time using the borrow as a mask.
  bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num);
}
// bn_mod_add_words computes |r| = |a| + |b| mod |m| in constant time, given
// reduced inputs 0 <= |a|, |b| < |m|. |tmp| is scratch space of |num| words.
void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
  // The sum is below 2*|m|, so a single conditional subtraction reduces it.
  BN_ULONG carry = bn_add_words(r, a, b, num);
  bn_reduce_once_in_place(r, carry, m, tmp, num);
}
// bn_div_consttime computes |quotient| and |remainder| of |numerator| divided
// by |divisor| in time independent of both values' contents, provided their
// widths are public. |divisor_min_bits| is a public lower bound on the bit
// length of |divisor| used to skip reductions; pass zero if unknown. Either
// output may be NULL. Both inputs must be non-negative.
int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
                     const BIGNUM *numerator, const BIGNUM *divisor,
                     unsigned divisor_min_bits, BN_CTX *ctx) {
  if (BN_is_negative(numerator) || BN_is_negative(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (BN_is_zero(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  // This function implements long division in binary. It is not very efficient,
  // but it is simple, easy to make constant-time, and performant enough for RSA
  // key generation.
  int ret = 0;
  BN_CTX_start(ctx);
  // Work in scratch values when an output is NULL or aliases an input.
  BIGNUM *q = quotient, *r = remainder;
  if (quotient == NULL || quotient == numerator || quotient == divisor) {
    q = BN_CTX_get(ctx);
  }
  if (remainder == NULL || remainder == numerator || remainder == divisor) {
    r = BN_CTX_get(ctx);
  }
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (q == NULL || r == NULL || tmp == NULL ||
      !bn_wexpand(q, numerator->width) ||
      !bn_wexpand(r, divisor->width) ||
      !bn_wexpand(tmp, divisor->width)) {
    goto err;
  }
  OPENSSL_memset(q->d, 0, numerator->width * sizeof(BN_ULONG));
  q->width = numerator->width;
  q->neg = 0;
  OPENSSL_memset(r->d, 0, divisor->width * sizeof(BN_ULONG));
  r->width = divisor->width;
  r->neg = 0;
  // Incorporate |numerator| into |r|, one bit at a time, reducing after each
  // step. We maintain the invariant that |0 <= r < divisor| and
  // |q * divisor + r = n| where |n| is the portion of |numerator| incorporated
  // so far.
  //
  // First, we short-circuit the loop: if we know |divisor| has at least
  // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated
  // without reductions. This significantly speeds up |RSA_check_key|. For
  // simplicity, we round down to a whole number of words.
  declassify_assert(divisor_min_bits <= BN_num_bits(divisor));
  int initial_words = 0;
  if (divisor_min_bits > 0) {
    initial_words = (divisor_min_bits - 1) / BN_BITS2;
    if (initial_words > numerator->width) {
      initial_words = numerator->width;
    }
    OPENSSL_memcpy(r->d, numerator->d + numerator->width - initial_words,
                   initial_words * sizeof(BN_ULONG));
  }
  for (int i = numerator->width - initial_words - 1; i >= 0; i--) {
    for (int bit = BN_BITS2 - 1; bit >= 0; bit--) {
      // Incorporate the next bit of the numerator, by computing
      // r = 2*r or 2*r + 1. Note the result fits in one more word. We store the
      // extra word in |carry|.
      BN_ULONG carry = bn_add_words(r->d, r->d, r->d, divisor->width);
      r->d[0] |= (numerator->d[i] >> bit) & 1;
      // |r| was previously fully-reduced, so we know:
      //      2*0 <= r <= 2*(divisor-1) + 1
      //        0 <= r <= 2*divisor - 1 < 2*divisor.
      // Thus |r| satisfies the preconditions for |bn_reduce_once_in_place|.
      BN_ULONG subtracted = bn_reduce_once_in_place(r->d, carry, divisor->d,
                                                    tmp->d, divisor->width);
      // The corresponding bit of the quotient is set iff we needed to subtract.
      q->d[i] |= (~subtracted & 1) << bit;
    }
  }
  // Copy out of scratch values if the caller wanted the results.
  if ((quotient != NULL && !BN_copy(quotient, q)) ||
      (remainder != NULL && !BN_copy(remainder, r))) {
    goto err;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_scratch_space_from_ctx returns a |BIGNUM| of exactly |width| words,
// scoped to the current |ctx| frame, or NULL on error. The word contents are
// not initialized.
static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) {
  BIGNUM *bn = BN_CTX_get(ctx);
  if (bn == NULL || !bn_wexpand(bn, width)) {
    return NULL;
  }
  bn->neg = 0;
  bn->width = (int)width;
  return bn;
}
// bn_resized_from_ctx returns |bn| zero-extended to at least |width| words, or
// NULL on error, so it may be used with low-level "words" functions. If
// necessary, it returns a copy with a lifetime of the current scope in |ctx|,
// so the caller does not need to explicitly free it. |bn| must fit in |width|
// words.
static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width,
                                         BN_CTX *ctx) {
  if (width <= (size_t)bn->width) {
    // Already wide enough; any excess words must be zero.
    assert(bn_fits_in_words(bn, width));
    return bn;
  }
  BIGNUM *copy = bn_scratch_space_from_ctx(width, ctx);
  if (copy == NULL || !BN_copy(copy, bn) || !bn_resize_words(copy, width)) {
    return NULL;
  }
  return copy;
}
// BN_mod_add computes |r| = |a| + |b| mod |m|, with the result in [0, |m|).
// Not constant-time; see |bn_mod_add_consttime| for secret values.
int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  return BN_add(r, a, b) && BN_nnmod(r, r, m, ctx);
}
// BN_mod_add_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_add_consttime(r, a, b, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// bn_mod_add_consttime computes |r| = |a| + |b| mod |m| in constant time.
// Inputs must already be reduced, i.e. 0 <= |a|, |b| < |m|.
int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         const BIGNUM *m, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Normalize both operands to the modulus width, then add at the word level.
  a = bn_resized_from_ctx(a, m->width, ctx);
  b = bn_resized_from_ctx(b, m->width, ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = a != NULL && b != NULL && scratch != NULL &&
           bn_wexpand(r, m->width);
  if (ok) {
    bn_mod_add_words(r->d, a->d, b->d, m->d, scratch->d, m->width);
    r->width = m->width;
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_sub computes |r| = |a| - |b| mod |m|, with the result in [0, |m|).
// Not constant-time; see |bn_mod_sub_consttime| for secret values.
int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  return BN_sub(r, a, b) && BN_nnmod(r, r, m, ctx);
}
// bn_mod_sub_consttime computes |r| = |a| - |b| mod |m| in constant time.
// Inputs must already be reduced, i.e. 0 <= |a|, |b| < |m|.
int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         const BIGNUM *m, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Normalize both operands to the modulus width, then subtract at the word
  // level.
  a = bn_resized_from_ctx(a, m->width, ctx);
  b = bn_resized_from_ctx(b, m->width, ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = a != NULL && b != NULL && scratch != NULL &&
           bn_wexpand(r, m->width);
  if (ok) {
    bn_mod_sub_words(r->d, a->d, b->d, m->d, scratch->d, m->width);
    r->width = m->width;
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_sub_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_sub_consttime(r, a, b, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_mod_mul computes |r| = |a| * |b| mod |m|, with the result in [0, |m|).
// Not constant-time; Montgomery multiplication should be used for secrets.
int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *product = BN_CTX_get(ctx);
  if (product != NULL) {
    // Prefer squaring when the operands alias; it is faster.
    int mul_ok = (a == b) ? BN_sqr(product, a, ctx)
                          : BN_mul(product, a, b, ctx);
    ret = mul_ok && BN_nnmod(r, product, m, ctx);
  }
  BN_CTX_end(ctx);
  return ret;
}
// BN_mod_sqr computes |r| = |a|^2 mod |m|. A square is never negative, so a
// plain |BN_mod| suffices instead of |BN_nnmod|.
int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  return BN_sqr(r, a, ctx) && BN_mod(r, r, m, ctx);
}
// BN_mod_lshift computes |r| = |a| << n mod |m|, with the result in [0, |m|).
// A negative modulus is handled by working with its absolute value.
int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
                  BN_CTX *ctx) {
  // Reduce |a| into [0, |m|) first.
  if (!BN_nnmod(r, a, m, ctx)) {
    return 0;
  }
  if (!m->neg) {
    return bn_mod_lshift_consttime(r, r, n, m, ctx);
  }
  // The constant-time shift requires a non-negative modulus.
  BIGNUM *abs_m = BN_dup(m);
  if (abs_m == NULL) {
    return 0;
  }
  abs_m->neg = 0;
  int ret = bn_mod_lshift_consttime(r, r, n, abs_m, ctx);
  BN_free(abs_m);
  return ret;
}
// bn_mod_lshift_consttime computes |r| = |a| << n mod |m| in constant time,
// by doubling modulo |m| once per bit of the shift. |a| must be reduced.
int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
                            BN_CTX *ctx) {
  if (!BN_copy(r, a) || !bn_resize_words(r, m->width)) {
    return 0;
  }
  BN_CTX_start(ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = scratch != NULL;
  if (ok) {
    for (int shift = 0; shift < n; shift++) {
      // r = 2*r mod m via a constant-time modular self-addition.
      bn_mod_add_words(r->d, r->d, r->d, m->d, scratch->d, m->width);
    }
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_lshift_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_lshift_consttime(r, a, n, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_mod_lshift1 computes |r| = 2*|a| mod |m|, with the result in [0, |m|).
int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  return BN_lshift1(r, a) && BN_nnmod(r, r, m, ctx);
}
// bn_mod_lshift1_consttime computes |r| = 2*|a| mod |m| in constant time.
// Doubling is simply a modular self-addition.
int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
                             BN_CTX *ctx) {
  return bn_mod_add_consttime(r, a, a, m, ctx);
}
// BN_mod_lshift1_quick is a legacy entry point; it allocates a scratch
// |BN_CTX| and defers to the constant-time implementation.
int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_lshift1_consttime(r, a, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_div_word divides |a| by |w| in place and returns the remainder, or
// (BN_ULONG)-1 on error (including division by zero).
BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
  BN_ULONG ret = 0;
  int i, j;
  if (!w) {
    // actually this an error (division by zero)
    return (BN_ULONG) - 1;
  }
  if (a->width == 0) {
    return 0;
  }
  // normalize input for |bn_div_rem_words|: shift both |a| and |w| left so
  // |w|'s top bit is set; the shift is undone on the remainder at the end.
  j = BN_BITS2 - BN_num_bits_word(w);
  w <<= j;
  if (!BN_lshift(a, a, j)) {
    return (BN_ULONG) - 1;
  }
  // Schoolbook division, one word at a time from the top. |ret| carries the
  // running remainder into the next word's division.
  for (i = a->width - 1; i >= 0; i--) {
    BN_ULONG l = a->d[i];
    BN_ULONG d;
    BN_ULONG unused_rem;
    bn_div_rem_words(&d, &unused_rem, ret, l, w);
    ret = l - (d * w);
    a->d[i] = d;
  }
  bn_set_minimal_width(a);
  // Undo the normalization shift to recover the true remainder.
  ret >>= j;
  return ret;
}
// BN_mod_word returns |a| mod |w|, or (BN_ULONG)-1 on error (division by
// zero). |a| is not modified.
BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
#ifndef BN_CAN_DIVIDE_ULLONG
  BN_ULONG ret = 0;
#else
  // With double-width division available, the accumulator can hold a full
  // word shifted up by BN_BITS2.
  BN_ULLONG ret = 0;
#endif
  int i;
  if (w == 0) {
    return (BN_ULONG) -1;
  }
#ifndef BN_CAN_DIVIDE_ULLONG
  // If |w| is too long and we don't have |BN_ULLONG| division then we need to
  // fall back to using |BN_div_word|.
  if (w > ((BN_ULONG)1 << BN_BITS4)) {
    // |BN_div_word| modifies its argument, so work on a copy.
    BIGNUM *tmp = BN_dup(a);
    if (tmp == NULL) {
      return (BN_ULONG)-1;
    }
    ret = BN_div_word(tmp, w);
    BN_free(tmp);
    return ret;
  }
#endif
  // Fold each word of |a| into the running remainder, most-significant first.
  for (i = a->width - 1; i >= 0; i--) {
#ifndef BN_CAN_DIVIDE_ULLONG
    // Reduce in two half-word steps so intermediate values fit in a word.
    ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
    ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
#else
    ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w);
#endif
  }
  return (BN_ULONG)ret;
}
// BN_mod_pow2 sets |r| to |a| mod 2^|e|, preserving |a|'s sign. It returns one
// on success and zero on error.
int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  if (e == 0 || a->width == 0) {
    BN_zero(r);
    return 1;
  }
  size_t num_words = 1 + ((e - 1) / BN_BITS2);
  // If |a| definitely has less than |e| bits, just BN_copy.
  if ((size_t) a->width < num_words) {
    return BN_copy(r, a) != NULL;
  }
  // Otherwise, first make sure we have enough space in |r|.
  // Note that this will fail if num_words > INT_MAX.
  if (!bn_wexpand(r, num_words)) {
    return 0;
  }
  // Copy the content of |a| into |r|.
  OPENSSL_memcpy(r->d, a->d, num_words * sizeof(BN_ULONG));
  // If |e| isn't word-aligned, we have to mask off some of our bits.
  size_t top_word_exponent = e % (sizeof(BN_ULONG) * 8);
  if (top_word_exponent != 0) {
    r->d[num_words - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
  }
  // Fill in the remaining fields of |r|.
  r->neg = a->neg;
  r->width = (int) num_words;
  // Restore the minimal-width invariant after masking may have cleared the
  // top word.
  bn_set_minimal_width(r);
  return 1;
}
// BN_nnmod_pow2 sets |r| to |a| mod 2^|e|, canonicalized to be non-negative.
// It returns one on success and zero on error.
int BN_nnmod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  if (!BN_mod_pow2(r, a, e)) {
    return 0;
  }
  // If the returned value was non-negative, we're done.
  if (BN_is_zero(r) || !r->neg) {
    return 1;
  }
  size_t num_words = 1 + (e - 1) / BN_BITS2;
  // Expand |r| to the size of our modulus.
  if (!bn_wexpand(r, num_words)) {
    return 0;
  }
  // Clear the upper words of |r|.
  OPENSSL_memset(&r->d[r->width], 0, (num_words - r->width) * BN_BYTES);
  // Set parameters of |r|.
  r->neg = 0;
  r->width = (int) num_words;
  // Now, invert every word. The idea here is that we want to compute 2^e-|x|,
  // which is actually equivalent to the twos-complement representation of |x|
  // in |e| bits, which is -x = ~x + 1.
  for (int i = 0; i < r->width; i++) {
    r->d[i] = ~r->d[i];
  }
  // If our exponent doesn't span the top word, we have to mask the rest.
  size_t top_word_exponent = e % BN_BITS2;
  if (top_word_exponent != 0) {
    r->d[r->width - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
  }
  // Keep the minimal-width invariant for |BIGNUM|.
  bn_set_minimal_width(r);
  // Finally, add one, for the reason described above.
  return BN_add(r, r, BN_value_one());
}

View File

@@ -0,0 +1,76 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
// The following functions use a Barrett reduction variant to avoid leaking the
// numerator. See http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
//
// We use 32-bit numerator and 16-bit divisor for simplicity. This allows
// computing |m| and |q| without architecture-specific code.
// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see
// reference). For proof of correctness in Coq, see
// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v
// Note the Coq version of |mod_u16| additionally includes the computation of
// |p| and |m| from |bn_mod_u16_consttime| below.
// mod_u16 returns |n| mod |d|, where |p| and |m| are the precomputed "magic
// numbers" for |d| (see the referenced Barrett-style algorithm). Constant-time
// in |n|. For proof of correctness in Coq, see the reference above the
// enclosing file's comment; do not modify without re-checking the proof.
static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) {
  // Compute floor(n/d) per steps 3 through 5.
  uint32_t q = ((uint64_t)m * n) >> 32;
  // Note there is a typo in the reference. We right-shift by one, not two.
  uint32_t t = ((n - q) >> 1) + q;
  t = t >> (p - 1);
  // Multiply and subtract to get the remainder.
  n -= d * t;
  declassify_assert(n < d);
  return n;
}
// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
// "magic numbers" for |d| (see reference).
// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
// "magic numbers" for |d| (see reference).
static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d,
                                      uint32_t p, uint32_t m) {
  // Fold in |a| in two 16-bit halves, reducing mod |d| after each shift so
  // every intermediate fits in 32 bits.
  uint32_t acc = ((uint32_t)r << 16) | (a >> 16);
  acc = mod_u16(acc, d, p, m);
  acc = (acc << 16) | (a & 0xffff);
  return mod_u16(acc, d, p, m);
}
// bn_mod_u16_consttime returns |bn| mod |d|, treating |d| as public and the
// value of |bn| (but not its width) as secret. Returns zero when |d| <= 1.
uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) {
  if (d <= 1) {
    return 0;
  }
  // Compute the "magic numbers" for |d|. See steps 1 and 2.
  // This computes p = ceil(log_2(d)).
  uint32_t p = BN_num_bits_word(d - 1);
  // This operation is not constant-time, but |p| and |d| are public values.
  // Note that |p| is at most 16, so the computation fits in |uint64_t|.
  assert(p <= 16);
  uint32_t m = (uint32_t)(((UINT64_C(1) << (32 + p)) + d - 1) / d);
  // Fold in each word of |bn| in 32-bit pieces, most-significant first.
  uint16_t ret = 0;
  for (int i = bn->width - 1; i >= 0; i--) {
#if BN_BITS2 == 32
    ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m);
#elif BN_BITS2 == 64
    ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m);
    ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m);
#else
#error "Unknown BN_ULONG size"
#endif
  }
  return ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,293 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
// BN_mod_inverse_odd computes |out| = |a|^-1 mod |n| for odd |n| using the
// binary extended-Euclid algorithm. |a| must satisfy 0 <= |a| < |n|. On
// success it returns one. If no inverse exists it returns zero and sets
// |*out_no_inverse| to one. This function is not constant-time; callers with
// secret inputs must blind them first (see |BN_mod_inverse_blinded|).
int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                       const BIGNUM *n, BN_CTX *ctx) {
  *out_no_inverse = 0;
  if (!BN_is_odd(n)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return 0;
  }
  if (BN_is_negative(a) || BN_cmp(a, n) >= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  BIGNUM *A, *B, *X, *Y;
  int ret = 0;
  int sign;
  BN_CTX_start(ctx);
  A = BN_CTX_get(ctx);
  B = BN_CTX_get(ctx);
  X = BN_CTX_get(ctx);
  Y = BN_CTX_get(ctx);
  // |Y| was the last |BN_CTX_get|, so checking it covers the others too.
  if (Y == NULL) {
    goto err;
  }
  BIGNUM *R = out;
  BN_zero(Y);
  if (!BN_one(X) || BN_copy(B, a) == NULL || BN_copy(A, n) == NULL) {
    goto err;
  }
  A->neg = 0;
  sign = -1;
  // From  B = a mod |n|,  A = |n|  it follows that
  //
  //      0 <= B < A,
  //     -sign*X*a  ==  B   (mod |n|),
  //      sign*Y*a  ==  A   (mod |n|).
  // Binary inversion algorithm; requires odd modulus. This is faster than the
  // general algorithm if the modulus is sufficiently small (about 400 .. 500
  // bits on 32-bit systems, but much more on 64-bit systems)
  int shift;
  while (!BN_is_zero(B)) {
    //      0 < B < |n|,
    //      0 < A <= |n|,
    // (1) -sign*X*a  ==  B   (mod |n|),
    // (2)  sign*Y*a  ==  A   (mod |n|)
    // Now divide B by the maximum possible power of two in the integers,
    // and divide X by the same value mod |n|.
    // When we're done, (1) still holds.
    shift = 0;
    while (!BN_is_bit_set(B, shift)) {
      // note that 0 < B
      shift++;
      if (BN_is_odd(X)) {
        // Make X even before halving it; adding |n| does not change X mod
        // |n|.
        if (!BN_uadd(X, X, n)) {
          goto err;
        }
      }
      // now X is even, so we can easily divide it by two
      if (!BN_rshift1(X, X)) {
        goto err;
      }
    }
    if (shift > 0) {
      if (!BN_rshift(B, B, shift)) {
        goto err;
      }
    }
    // Same for A and Y. Afterwards, (2) still holds.
    shift = 0;
    while (!BN_is_bit_set(A, shift)) {
      // note that 0 < A
      shift++;
      if (BN_is_odd(Y)) {
        if (!BN_uadd(Y, Y, n)) {
          goto err;
        }
      }
      // now Y is even
      if (!BN_rshift1(Y, Y)) {
        goto err;
      }
    }
    if (shift > 0) {
      if (!BN_rshift(A, A, shift)) {
        goto err;
      }
    }
    // We still have (1) and (2).
    // Both  A  and  B  are odd.
    // The following computations ensure that
    //
    //     0 <= B < |n|,
    //      0 < A < |n|,
    // (1) -sign*X*a  ==  B   (mod |n|),
    // (2)  sign*Y*a  ==  A   (mod |n|),
    //
    // and that either  A  or  B  is even in the next iteration.
    if (BN_ucmp(B, A) >= 0) {
      // -sign*(X + Y)*a == B - A  (mod |n|)
      if (!BN_uadd(X, X, Y)) {
        goto err;
      }
      // NB: we could use BN_mod_add_quick(X, X, Y, n), but that
      // actually makes the algorithm slower
      if (!BN_usub(B, B, A)) {
        goto err;
      }
    } else {
      //  sign*(X + Y)*a == A - B  (mod |n|)
      if (!BN_uadd(Y, Y, X)) {
        goto err;
      }
      // as above, BN_mod_add_quick(Y, Y, X, n) would slow things down
      if (!BN_usub(A, A, B)) {
        goto err;
      }
    }
  }
  // gcd(a, n) != 1, so no inverse exists.
  if (!BN_is_one(A)) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    goto err;
  }
  // The while loop (Euclid's algorithm) ends when
  //      A == gcd(a,n);
  // we have
  //       sign*Y*a  ==  A  (mod |n|),
  // where  Y  is non-negative.
  if (sign < 0) {
    if (!BN_sub(Y, n, Y)) {
      goto err;
    }
  }
  // Now  Y*a  ==  A  (mod |n|).
  // Y*a == 1  (mod |n|)
  if (Y->neg || BN_ucmp(Y, n) >= 0) {
    if (!BN_nnmod(Y, Y, n, ctx)) {
      goto err;
    }
  }
  if (!BN_copy(R, Y)) {
    goto err;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_mod_inverse computes |a|^-1 mod |n| and returns it, allocating a fresh
// |BIGNUM| when |out| is NULL. It returns NULL on error or when no inverse
// exists. This function is not constant-time.
BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
                       BN_CTX *ctx) {
  BIGNUM *new_out = NULL;
  if (out == NULL) {
    new_out = BN_new();
    if (new_out == NULL) {
      return NULL;
    }
    out = new_out;
  }
  int ok = 0;
  BIGNUM *a_reduced = NULL;
  // The core routines require 0 <= |a| < |n|; reduce a copy if needed.
  if (a->neg || BN_ucmp(a, n) >= 0) {
    a_reduced = BN_dup(a);
    if (a_reduced == NULL) {
      goto err;
    }
    if (!BN_nnmod(a_reduced, a_reduced, n, ctx)) {
      goto err;
    }
    a = a_reduced;
  }
  int no_inverse;
  // The fast binary algorithm only handles odd moduli; fall back to the
  // general routine for even |n|.
  if (!BN_is_odd(n)) {
    if (!bn_mod_inverse_consttime(out, &no_inverse, a, n, ctx)) {
      goto err;
    }
  } else if (!BN_mod_inverse_odd(out, &no_inverse, a, n, ctx)) {
    goto err;
  }
  ok = 1;
err:
  if (!ok) {
    // Only free |new_out| if we allocated it; a caller-supplied |out| is the
    // caller's to free.
    BN_free(new_out);
    out = NULL;
  }
  BN_free(a_reduced);
  return out;
}
// BN_mod_inverse_blinded computes |out| = |a|^-1 mod |mont->N| for a secret
// |a|, hiding |a| from the leaky inversion routine behind a random blinding
// factor. |a| must satisfy 0 <= |a| < |mont->N|. On failure with no inverse,
// |*out_no_inverse| is set to one.
int BN_mod_inverse_blinded(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                           const BN_MONT_CTX *mont, BN_CTX *ctx) {
  *out_no_inverse = 0;
  // |a| is secret, but it is required to be in range, so these comparisons may
  // be leaked.
  if (BN_is_negative(a) ||
      constant_time_declassify_int(BN_cmp(a, &mont->N) >= 0)) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  int ret = 0;
  BIGNUM blinding_factor;
  BN_init(&blinding_factor);
  // |BN_mod_inverse_odd| is leaky, so generate a secret blinding factor and
  // blind |a|. This works because (ar)^-1 * r = a^-1, supposing r is
  // invertible. If r is not invertible, this function will fail. However, we
  // only use this in RSA, where stumbling on an uninvertible element means
  // stumbling on the key's factorization. That is, if this function fails, the
  // RSA key was not actually a product of two large primes.
  //
  // TODO(crbug.com/boringssl/677): When the PRNG output is marked secret by
  // default, the explicit |bn_secret| call can be removed.
  if (!BN_rand_range_ex(&blinding_factor, 1, &mont->N)) {
    goto err;
  }
  bn_secret(&blinding_factor);
  if (!BN_mod_mul_montgomery(out, &blinding_factor, a, mont, ctx)) {
    goto err;
  }
  // Once blinded, |out| is no longer secret, so it may be passed to a leaky
  // mod inverse function. Note |blinding_factor| is secret, so |out| will be
  // secret again after multiplying.
  bn_declassify(out);
  if (!BN_mod_inverse_odd(out, out_no_inverse, out, &mont->N, ctx) ||
      !BN_mod_mul_montgomery(out, &blinding_factor, out, mont, ctx)) {
    goto err;
  }
  ret = 1;
err:
  BN_free(&blinding_factor);
  return ret;
}
// bn_mod_inverse_prime computes |out| = |a|^-1 mod |p| for prime |p|, via
// Fermat's little theorem: a^(p-2) = a^-1 (mod p). Not constant-time in |a|.
int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
                         BN_CTX *ctx, const BN_MONT_CTX *mont_p) {
  int ok = 0;
  BN_CTX_start(ctx);
  BIGNUM *exponent = BN_CTX_get(ctx);
  if (exponent != NULL && BN_copy(exponent, p) && BN_sub_word(exponent, 2)) {
    ok = BN_mod_exp_mont(out, a, exponent, p, ctx, mont_p);
  }
  BN_CTX_end(ctx);
  return ok;
}
// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| (computes
// |a|^-1 mod prime |p| as a^(p-2) mod p) but uses the constant-time
// exponentiation, |BN_mod_exp_mont_consttime|. Returns one on success and
// zero on error.
int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
                                BN_CTX *ctx, const BN_MONT_CTX *mont_p) {
  int ret = 0;
  BN_CTX_start(ctx);
  // Form the exponent p - 2, then exponentiate in constant time.
  BIGNUM *exponent = BN_CTX_get(ctx);
  if (exponent != NULL &&
      BN_copy(exponent, p) != NULL &&
      BN_sub_word(exponent, 2) &&
      BN_mod_exp_mont_consttime(out, a, exponent, p, ctx, mont_p)) {
    ret = 1;
  }
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,320 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include <openssl/err.h>
#include "internal.h"
// word_is_odd_mask returns all-ones if |a| is odd and all-zeros otherwise,
// computed without branching.
static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); }
// maybe_rshift1_words shifts the |num|-word value |a| right by one bit if
// |mask| is all-ones, and leaves it unchanged if |mask| is all-zeros. |tmp| is
// |num| words of scratch. Both candidates are always computed and one is
// selected, so the operation is constant-time in the value of |mask|.
static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp,
                                size_t num) {
  bn_rshift1_words(tmp, a, num);
  bn_select_words(a, mask, tmp, a, num);
}
// maybe_rshift1_words_carry behaves like |maybe_rshift1_words| but, when the
// shift is applied, additionally shifts in |carry| (zero or one) as the new
// most significant bit.
static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry,
                                      BN_ULONG mask, BN_ULONG *tmp,
                                      size_t num) {
  maybe_rshift1_words(a, mask, tmp, num);
  if (num != 0) {
    // Zero the carry when the shift was skipped, so the top word is untouched.
    carry &= mask;
    a[num - 1] |= carry << (BN_BITS2-1);
  }
}
// maybe_add_words adds the |num|-word value |b| into |a| if |mask| is
// all-ones, or leaves |a| unchanged if |mask| is all-zeros, and returns the
// (masked) carry out of the addition. |tmp| is |num| words of scratch. The sum
// is always computed, then conditionally selected, so this is constant-time in
// |mask|.
static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b,
                                BN_ULONG *tmp, size_t num) {
  BN_ULONG carry = bn_add_words(tmp, a, b, num);
  bn_select_words(a, mask, tmp, a, num);
  return carry & mask;
}
// bn_gcd_consttime computes the GCD of |x| and |y| in the factored form
// gcd(x, y) = |r| * 2^|*out_shift|, in time independent of the values (though
// not the public widths) of |x| and |y|. It returns one on success and zero on
// error.
static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x,
                            const BIGNUM *y, BN_CTX *ctx) {
  size_t width = x->width > y->width ? x->width : y->width;
  if (width == 0) {
    *out_shift = 0;
    BN_zero(r);
    return 1;
  }
  // This is a constant-time implementation of Stein's algorithm (binary GCD).
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *u = BN_CTX_get(ctx);
  BIGNUM *v = BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (u == NULL || v == NULL || tmp == NULL ||
      !BN_copy(u, x) ||
      !BN_copy(v, y) ||
      !bn_resize_words(u, width) ||
      !bn_resize_words(v, width) ||
      !bn_resize_words(tmp, width)) {
    goto err;
  }
  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
  // most the combined bit width of inputs for at least one value to be zero.
  unsigned x_bits = x->width * BN_BITS2, y_bits = y->width * BN_BITS2;
  unsigned num_iters = x_bits + y_bits;
  if (num_iters < x_bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    goto err;
  }
  unsigned shift = 0;
  for (unsigned i = 0; i < num_iters; i++) {
    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
    // If both |u| and |v| are odd, subtract the smaller from the larger.
    // The borrow from |u| - |v|, stretched to a full mask, tells us whether
    // u < v; the selections below apply whichever difference is non-negative.
    BN_ULONG u_less_than_v =
        (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width);
    bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width);
    bn_sub_words(tmp->d, v->d, u->d, width);
    bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width);
    // At least one of |u| and |v| is now even.
    BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]);
    BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]);
    declassify_assert(!(u_is_odd & v_is_odd));
    // If both are even, the final GCD gains a factor of two.
    shift += 1 & (~u_is_odd & ~v_is_odd);
    // Halve any which are even.
    maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width);
    maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width);
  }
  // One of |u| or |v| is zero at this point. The algorithm usually makes |u|
  // zero, unless |y| was already zero on input. Fix this by combining the
  // values.
  declassify_assert(BN_is_zero(u) | BN_is_zero(v));
  for (size_t i = 0; i < width; i++) {
    v->d[i] |= u->d[i];
  }
  *out_shift = shift;
  ret = bn_set_words(r, v->d, width);
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_gcd sets |r| to gcd(x, y). The constant-time core returns the odd part of
// the gcd and the exponent of two separately; recombine them here.
int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) {
  unsigned shift;
  if (!bn_gcd_consttime(r, &shift, x, y, ctx)) {
    return 0;
  }
  return BN_lshift(r, r, shift);
}
// bn_is_relatively_prime sets |*out_relatively_prime| to one if
// gcd(x, y) == 1 and to zero otherwise. It returns one on success and zero on
// error. The comparison accumulates into a mask rather than branching on the
// (secret) gcd words.
int bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x,
                           const BIGNUM *y, BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  unsigned shift;
  BIGNUM *gcd = BN_CTX_get(ctx);
  if (gcd == NULL ||
      !bn_gcd_consttime(gcd, &shift, x, y, ctx)) {
    goto err;
  }
  // Check that 2^|shift| * |gcd| is one.
  if (gcd->width == 0) {
    *out_relatively_prime = 0;
  } else {
    // The product is one iff |shift| is zero, the low word of |gcd| is one,
    // and every higher word is zero. Fold any deviation into |mask|.
    BN_ULONG mask = shift | (gcd->d[0] ^ 1);
    for (int i = 1; i < gcd->width; i++) {
      mask |= gcd->d[i];
    }
    *out_relatively_prime = mask == 0;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_lcm_consttime sets |r| to lcm(a, b) = a*b / gcd(a, b), in constant time.
// It returns one on success and zero on error.
int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  unsigned shift;
  BIGNUM *gcd = BN_CTX_get(ctx);
  int ret = gcd != NULL && //
            bn_mul_consttime(r, a, b, ctx) &&
            bn_gcd_consttime(gcd, &shift, a, b, ctx) &&
            // |gcd| has a secret bit width.
            bn_div_consttime(r, NULL, r, gcd, /*divisor_min_bits=*/0, ctx) &&
            // The full gcd is |gcd| * 2^|shift|, so divide out the power of
            // two separately, with the shift amount treated as secret.
            bn_rshift_secret_shift(r, r, shift, ctx);
  BN_CTX_end(ctx);
  return ret;
}
// bn_mod_inverse_consttime sets |r| to |a|^-1 (mod |n|) in constant time. |a|
// must already be reduced mod |n|, and at least one of |a| and |n| must be
// odd. It returns one on success. On failure it returns zero, additionally
// setting |*out_no_inverse| when the failure was because no inverse exists.
int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a,
                             const BIGNUM *n, BN_CTX *ctx) {
  *out_no_inverse = 0;
  if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  if (BN_is_zero(a)) {
    if (BN_is_one(n)) {
      BN_zero(r);
      return 1;
    }
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    return 0;
  }
  // This is a constant-time implementation of the extended binary GCD
  // algorithm. It is adapted from the Handbook of Applied Cryptography, section
  // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid
  // negative numbers.
  //
  // For more details and proof of correctness, see
  // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step|
  // and |mod_inverse_consttime| for the algorithm in Gallina and see
  // |mod_inverse_consttime_spec| for the correctness result.
  if (!BN_is_odd(a) && !BN_is_odd(n)) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    return 0;
  }
  // This function exists to compute the RSA private exponent, where |a| is one
  // word. We'll thus use |a_width| when available.
  size_t n_width = n->width, a_width = a->width;
  if (a_width > n_width) {
    a_width = n_width;
  }
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *u = BN_CTX_get(ctx);
  BIGNUM *v = BN_CTX_get(ctx);
  BIGNUM *A = BN_CTX_get(ctx);
  BIGNUM *B = BN_CTX_get(ctx);
  BIGNUM *C = BN_CTX_get(ctx);
  BIGNUM *D = BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  BIGNUM *tmp2 = BN_CTX_get(ctx);
  if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL ||
      D == NULL || tmp == NULL || tmp2 == NULL ||
      !BN_copy(u, a) ||
      !BN_copy(v, n) ||
      !BN_one(A) ||
      !BN_one(D) ||
      // For convenience, size |u| and |v| equivalently.
      !bn_resize_words(u, n_width) ||
      !bn_resize_words(v, n_width) ||
      // |A| and |C| are bounded by |m|.
      !bn_resize_words(A, n_width) ||
      !bn_resize_words(C, n_width) ||
      // |B| and |D| are bounded by |a|.
      !bn_resize_words(B, a_width) ||
      !bn_resize_words(D, a_width) ||
      // |tmp| and |tmp2| may be used at either size.
      !bn_resize_words(tmp, n_width) ||
      !bn_resize_words(tmp2, n_width)) {
    goto err;
  }
  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
  // most the combined bit width of inputs for at least one value to be zero.
  // |a_bits| and |n_bits| cannot overflow because |bn_wexpand| ensures bit
  // counts fit in even |int|.
  size_t a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2;
  size_t num_iters = a_bits + n_bits;
  if (num_iters < a_bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    goto err;
  }
  // Before and after each loop iteration, the following hold:
  //
  //   u = A*a - B*n
  //   v = D*n - C*a
  //   0 < u <= a
  //   0 <= v <= n
  //   0 <= A < n
  //   0 <= B <= a
  //   0 <= C < n
  //   0 <= D <= a
  //
  // After each loop iteration, u and v only get smaller, and at least one of
  // them shrinks by at least a factor of two.
  for (size_t i = 0; i < num_iters; i++) {
    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
    // If both |u| and |v| are odd, subtract the smaller from the larger.
    BN_ULONG v_less_than_u =
        (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width);
    bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width);
    bn_sub_words(tmp->d, u->d, v->d, n_width);
    bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width);
    // If we updated one of the values, update the corresponding coefficient.
    // tmp = A + C (mod n): add, then conditionally subtract |n| based on the
    // combined carry/borrow, without branching on secret data.
    BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width);
    carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width);
    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width);
    bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width);
    bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width);
    // Similarly tmp = B + D (mod a). The same |carry| selects the reduced
    // value: by the loop invariants, A + C exceeds n exactly when B + D
    // exceeds a, so the two sums are reduced together.
    bn_add_words(tmp->d, B->d, D->d, a_width);
    bn_sub_words(tmp2->d, tmp->d, a->d, a_width);
    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width);
    bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width);
    bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width);
    // Our loop invariants hold at this point. Additionally, exactly one of |u|
    // and |v| is now even.
    BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]);
    BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]);
    declassify_assert(u_is_even != v_is_even);
    // Halve the even one and adjust the corresponding coefficient.
    maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width);
    BN_ULONG A_or_B_is_odd =
        word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]);
    BN_ULONG A_carry =
        maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width);
    BN_ULONG B_carry =
        maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width);
    maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width);
    maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width);
    maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width);
    BN_ULONG C_or_D_is_odd =
        word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]);
    BN_ULONG C_carry =
        maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width);
    BN_ULONG D_carry =
        maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width);
    maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width);
    maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width);
  }
  declassify_assert(BN_is_zero(v));
  // While the inputs and output are secret, this function considers whether the
  // input was invertible to be public. It is used as part of RSA key
  // generation, where inputs are chosen to already be invertible.
  if (constant_time_declassify_int(!BN_is_one(u))) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    goto err;
  }
  // u == 1 and, by the invariant, u = A*a - B*n, so A*a = 1 (mod n); |A| is
  // the inverse.
  ret = BN_copy(r, A) != NULL;
err:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,571 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
// See asm/bn-586.pl.
#define BN_ADD_ASM
#define BN_MUL_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
(defined(__GNUC__) || defined(__clang__))
// See asm/x86_64-gcc.c
#define BN_ADD_ASM
#define BN_MUL_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
// See asm/bn-armv8.pl.
#define BN_ADD_ASM
#endif
#if !defined(BN_MUL_ASM)
#ifdef BN_ULLONG
#define mul_add(r, a, w, c) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(w) * (a) + (r) + (c); \
(r) = Lw(t); \
(c) = Hw(t); \
} while (0)
#define mul(r, a, w, c) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(w) * (a) + (c); \
(r) = Lw(t); \
(c) = Hw(t); \
} while (0)
#define sqr(r0, r1, a) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(a) * (a); \
(r0) = Lw(t); \
(r1) = Hw(t); \
} while (0)
#else
#define mul_add(r, a, w, c) \
do { \
BN_ULONG high, low, ret, tmp = (a); \
ret = (r); \
BN_UMULT_LOHI(low, high, w, tmp); \
ret += (c); \
(c) = (ret < (c)) ? 1 : 0; \
(c) += high; \
ret += low; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
} while (0)
#define mul(r, a, w, c) \
do { \
BN_ULONG high, low, ret, ta = (a); \
BN_UMULT_LOHI(low, high, w, ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
} while (0)
#define sqr(r0, r1, a) \
do { \
BN_ULONG tmp = (a); \
BN_UMULT_LOHI(r0, r1, tmp, tmp); \
} while (0)
#endif // !BN_ULLONG
// bn_mul_add_words computes rp[i] += ap[i] * w for |num| words and returns the
// final carry word. The main loop is unrolled four-fold; |mul_add| threads the
// running carry through |carry|.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;
  if (num == 0) {
    return carry;
  }
  // Process four words per iteration while at least four remain.
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul_add(rp[0], ap[0], w, carry);
    mul_add(rp[1], ap[1], w, carry);
    mul_add(rp[2], ap[2], w, carry);
    mul_add(rp[3], ap[3], w, carry);
  }
  // Mop up the remaining zero to three words.
  for (; num > 0; num--, ap++, rp++) {
    mul_add(rp[0], ap[0], w, carry);
  }
  return carry;
}
// bn_mul_words computes rp[i] = ap[i] * w for |num| words and returns the
// final carry word. The main loop is unrolled four-fold; |mul| threads the
// running carry through |carry|.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  BN_ULONG carry = 0;
  if (num == 0) {
    return carry;
  }
  // Process four words per iteration while at least four remain.
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul(rp[0], ap[0], w, carry);
    mul(rp[1], ap[1], w, carry);
    mul(rp[2], ap[2], w, carry);
    mul(rp[3], ap[3], w, carry);
  }
  // Mop up the remaining zero to three words.
  for (; num > 0; num--, ap++, rp++) {
    mul(rp[0], ap[0], w, carry);
  }
  return carry;
}
// bn_sqr_words writes the double-width square of each word of |a| into |r|:
// (r[2*i+1], r[2*i]) = a[i]^2 for i in [0, n). |r| must hold 2*n words. The
// main loop is unrolled four-fold.
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  if (n == 0) {
    return;
  }
  // Square four input words (eight output words) per iteration.
  for (; n >= 4; n -= 4, a += 4, r += 8) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
  }
  // Mop up the remaining zero to three words.
  for (; n > 0; n--, a++, r += 2) {
    sqr(r[0], r[1], a[0]);
  }
}
// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0)
// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
#ifdef BN_ULLONG
// Keep in mind that additions to multiplication result can not overflow,
// because its high half cannot be all-ones.
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a) * (b); \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += (hi); \
(c2) += (c1) < hi; \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a) * (b); \
BN_ULLONG tt = t + (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(tt); \
hi = (BN_ULONG)Hw(tt); \
(c1) += hi; \
(c2) += (c1) < hi; \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += hi; \
(c2) += (c1) < hi; \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += hi; \
(c2) += (c1) < hi; \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#else
// Keep in mind that additions to hi can not overflow, because the high word of
// a multiplication result cannot be all-ones.
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo, hi, ta, tb); \
(c0) += lo; \
hi += ((c0) < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo, hi, ta, tb); \
(c0) += lo; \
tt = hi + (((c0) < lo) ? 1 : 0); \
(c1) += tt; \
(c2) += ((c1) < tt) ? 1 : 0; \
(c0) += lo; \
hi += (c0 < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo, hi, ta, ta); \
(c0) += lo; \
hi += (c0 < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif // !BN_ULLONG
// bn_mul_comba8 computes the full 16-word product |r| = |a| * |b| for 8-word
// inputs, column by column (the comba method). |c1|, |c2| and |c3| act as a
// rotating three-word accumulator: each result column sums all a[i]*b[j] with
// i + j equal to the column index, then the accumulator rotates as the column
// is emitted.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_mul_comba4 computes the full 8-word product |r| = |a| * |b| for 4-word
// inputs using the same column-wise (comba) scheme as |bn_mul_comba8|, with
// |c1|, |c2|, |c3| as the rotating three-word column accumulator.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
// bn_sqr_comba8 computes the 16-word square |r| = |a|^2 for an 8-word input,
// column by column. |sqr_add_c| accumulates the diagonal a[i]*a[i] terms and
// |sqr_add_c2| the doubled a[i]*a[j] (i > j) cross terms into the rotating
// three-word accumulator |c1|, |c2|, |c3|.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_sqr_comba4 computes the 8-word square |r| = |a|^2 for a 4-word input,
// using the same column-wise scheme as |bn_sqr_comba8|: |sqr_add_c| for the
// diagonal terms and |sqr_add_c2| for the doubled cross terms.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef mul_add_c2
#undef sqr_add_c
#undef sqr_add_c2
#endif // !BN_MUL_ASM
#if !defined(BN_ADD_ASM)
// bn_add_words adds the |n|-word arrays |a| and |b| into |r| and returns the
// final carry bit. The main loop handles four words per iteration;
// |CRYPTO_addc_w| propagates the carry through |carry| without branching on
// the data.
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      size_t n) {
  if (n == 0) {
    return 0;
  }
  BN_ULONG carry = 0;
  for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
    r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry);
    r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry);
    r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry);
  }
  // Handle the remaining zero to three words.
  for (; n > 0; n--, a++, b++, r++) {
    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
  }
  return carry;
}
// bn_sub_words subtracts the |n|-word array |b| from |a| into |r| and returns
// the final borrow bit. The main loop handles four words per iteration;
// |CRYPTO_subc_w| propagates the borrow through |borrow| without branching on
// the data.
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      size_t n) {
  if (n == 0) {
    return (BN_ULONG)0;
  }
  BN_ULONG borrow = 0;
  for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
    r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow);
    r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow);
    r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow);
  }
  // Handle the remaining zero to three words.
  for (; n > 0; n--, a++, b++, r++) {
    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
  }
  return borrow;
}
#endif // !BN_ADD_ASM

View File

@@ -0,0 +1,736 @@
// Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
// Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
//
// The binary polynomial arithmetic software is originally written by
// Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
// Laboratories.
//
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H
#include <openssl/bn.h>
#include <openssl/rand.h>
#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
OPENSSL_MSVC_PRAGMA(warning(push, 3))
#include <intrin.h>
OPENSSL_MSVC_PRAGMA(warning(pop))
#pragma intrinsic(__umulh, _umul128)
#endif
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(OPENSSL_64_BIT)
#if defined(BORINGSSL_HAS_UINT128)
// MSVC doesn't support two-word integers on 64-bit.
#define BN_ULLONG uint128_t
#if defined(BORINGSSL_CAN_DIVIDE_UINT128)
#define BN_CAN_DIVIDE_ULLONG
#endif
#endif
#define BN_BITS2 64
#define BN_BITS2_LG 6
#define BN_BYTES 8
#define BN_BITS4 32
#define BN_MASK2 (0xffffffffffffffffUL)
#define BN_MASK2l (0xffffffffUL)
#define BN_MASK2h (0xffffffff00000000UL)
#define BN_MASK2h1 (0xffffffff80000000UL)
#define BN_MONT_CTX_N0_LIMBS 1
#define BN_DEC_CONV (10000000000000000000UL)
#define BN_DEC_NUM 19
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
#elif defined(OPENSSL_32_BIT)
#define BN_ULLONG uint64_t
#define BN_CAN_DIVIDE_ULLONG
#define BN_BITS2 32
#define BN_BITS2_LG 5
#define BN_BYTES 4
#define BN_BITS4 16
#define BN_MASK2 (0xffffffffUL)
#define BN_MASK2l (0xffffUL)
#define BN_MASK2h1 (0xffff8000UL)
#define BN_MASK2h (0xffff0000UL)
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1] and shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
#define BN_DEC_CONV (1000000000UL)
#define BN_DEC_NUM 9
#define TOBN(hi, lo) (lo), (hi)
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
#define BN_CAN_USE_INLINE_ASM
#endif
// MOD_EXP_CTIME_ALIGN is the alignment needed for |BN_mod_exp_mont_consttime|'s
// tables.
//
// TODO(davidben): Historically, this alignment came from cache line
// assumptions, which we've since removed. Is 64-byte alignment still necessary
// or ideal? The true alignment requirement seems to now be 32 bytes, coming
// from RSAZ's use of VMOVDQA to a YMM register. Non-x86_64 has even fewer
// requirements.
#define MOD_EXP_CTIME_ALIGN 64
// MOD_EXP_CTIME_STORAGE_LEN is the number of |BN_ULONG|s needed for the
// |BN_mod_exp_mont_consttime| stack-allocated storage buffer. The buffer is
// just the right size for the RSAZ and is about ~1KB larger than what's
// necessary (4480 bytes) for 1024-bit inputs.
#define MOD_EXP_CTIME_STORAGE_LEN \
(((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG))
#define STATIC_BIGNUM(x) \
{ \
(BN_ULONG *)(x), sizeof(x) / sizeof(BN_ULONG), \
sizeof(x) / sizeof(BN_ULONG), 0, BN_FLG_STATIC_DATA \
}
#if defined(BN_ULLONG)
#define Lw(t) ((BN_ULONG)(t))
#define Hw(t) ((BN_ULONG)((t) >> BN_BITS2))
#endif
#define BN_GENCB_UNSET 0
#define BN_GENCB_NEW_STYLE 1
#define BN_GENCB_OLD_STYLE 2
// bn_minimal_width returns the minimal number of words needed to represent
// |bn|.
int bn_minimal_width(const BIGNUM *bn);
// bn_set_minimal_width sets |bn->width| to |bn_minimal_width(bn)|. If |bn| is
// zero, |bn->neg| is set to zero.
void bn_set_minimal_width(BIGNUM *bn);
// bn_wexpand ensures that |bn| has at least |words| words of space without
// altering its value. It returns one on success or zero on allocation
// failure.
int bn_wexpand(BIGNUM *bn, size_t words);
// bn_expand acts the same as |bn_wexpand|, but takes a number of bits rather
// than a number of words.
int bn_expand(BIGNUM *bn, size_t bits);
// bn_resize_words adjusts |bn->width| to be |words|. It returns one on success
// and zero on allocation error or if |bn|'s value is too large.
OPENSSL_EXPORT int bn_resize_words(BIGNUM *bn, size_t words);
// bn_select_words sets |r| to |a| if |mask| is all ones or |b| if |mask| is
// all zeros.
void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
const BN_ULONG *b, size_t num);
// bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
// least significant word first.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
// bn_set_static_words acts like |bn_set_words|, but doesn't copy the data. A
// flag is set on |bn| so that |BN_free| won't attempt to free the data.
//
// The |STATIC_BIGNUM| macro is probably a better solution for this outside of
// the FIPS module. Inside of the FIPS module that macro generates rel.ro data,
// which doesn't work with FIPS requirements.
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
// a sign bit, and zero otherwise.
int bn_fits_in_words(const BIGNUM *bn, size_t num);
// bn_copy_words copies the value of |bn| to |out| and returns one if the value
// is representable in |num| words. Otherwise, it returns zero.
int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn);
// bn_assert_fits_in_bytes asserts that |bn| fits in |num| bytes. This is a
// no-op in release builds, but triggers an assert in debug builds, and
// declassifies all bytes which are therefore known to be zero in constant-time
// validation.
OPENSSL_EXPORT void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num);
// bn_secret marks |bn|'s contents, but not its width or sign, as secret. See
// |CONSTTIME_SECRET| for details.
OPENSSL_INLINE void bn_secret(BIGNUM *bn) {
  // Tag exactly the |width| words in use; |width| and |neg| stay public.
  CONSTTIME_SECRET(bn->d, bn->width * sizeof(BN_ULONG));
}
// bn_declassify marks |bn|'s value as public. See |CONSTTIME_DECLASSIFY| for
// details.
OPENSSL_INLINE void bn_declassify(BIGNUM *bn) {
  // Covers the |width| words currently representing the value.
  CONSTTIME_DECLASSIFY(bn->d, bn->width * sizeof(BN_ULONG));
}
// bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places
// the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
// the carry word of the operation. |ap| and |rp| may be equal but otherwise may
// not alias.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
BN_ULONG w);
// bn_mul_words multiples |ap| by |w| and places the result in |rp|. |ap| and
// |rp| must both be |num| words long. It returns the carry word of the
// operation. |ap| and |rp| may be equal but otherwise may not alias.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, BN_ULONG w);
// bn_sqr_words sets |rp[2*i]| and |rp[2*i+1]| to |ap[i]|'s square, for all |i|
// up to |num|. |ap| is an array of |num| words and |rp| an array of |2*num|
// words. |ap| and |rp| may not alias.
//
// This gives the contribution of the |ap[i]*ap[i]| terms when squaring |ap|.
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num);
// bn_add_words adds |ap| to |bp| and places the result in |rp|, each of which
// are |num| words long. It returns the carry bit, which is one if the operation
// overflowed and zero otherwise. Any pair of |ap|, |bp|, and |rp| may be equal
// to each other but otherwise may not alias.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_sub_words subtracts |bp| from |ap| and places the result in |rp|. It
// returns the borrow bit, which is one if the computation underflowed and zero
// otherwise. Any pair of |ap|, |bp|, and |rp| may be equal to each other but
// otherwise may not alias.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_mul_comba4 sets |r| to the product of |a| and |b|.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]);
// bn_mul_comba8 sets |r| to the product of |a| and |b|.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]);
// bn_sqr_comba8 sets |r| to |a|^2.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]);
// bn_sqr_comba4 sets |r| to |a|^2.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);
// bn_less_than_words returns one if |a| < |b| and zero otherwise, where |a|
// and |b| both are |len| words long. It runs in constant time.
int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len);
// bn_in_range_words returns one if |min_inclusive| <= |a| < |max_exclusive|,
// where |a| and |max_exclusive| both are |len| words long. |a| and
// |max_exclusive| are treated as secret.
int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len);
// bn_rand_range_words sets |out| to a uniformly distributed random number from
// |min_inclusive| to |max_exclusive|. Both |out| and |max_exclusive| are |len|
// words long.
//
// This function runs in time independent of the result, but |min_inclusive| and
// |max_exclusive| are public data. (Information about the range is unavoidably
// leaked by how many iterations it took to select a number.)
int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len,
const uint8_t additional_data[RAND_PRED_RESISTANCE_LEN]);
// bn_rand_secret_range behaves like |BN_rand_range_ex|, but treats
// |max_exclusive| as secret. Because of this constraint, the distribution of
// values returned is more complex.
//
// Rather than repeatedly generating values until one is in range, which would
// leak information, it generates one value. If the value is in range, it sets
// |*out_is_uniform| to one. Otherwise, it sets |*out_is_uniform| to zero,
// fixing up the value to force it in range.
//
// The subset of calls to |bn_rand_secret_range| which set |*out_is_uniform| to
// one are uniformly distributed in the target range. Calls overall are not.
// This function is intended for use in situations where the extra values are
// still usable and where the number of iterations needed to reach the target
// number of uniform outputs may be blinded for negligible probabilities of
// timing leaks.
//
// Although this function treats |max_exclusive| as secret, it treats the number
// of bits in |max_exclusive| as public.
int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive,
const BIGNUM *max_exclusive);
// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|
// used with Montgomery reduction. Ideally this limit would be applied to all
// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
// values for other operations.
#define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG))
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define OPENSSL_BN_ASM_MONT
// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the
// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles
// inputs of this size and zero otherwise.
//
// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#if defined(OPENSSL_X86_64)
OPENSSL_INLINE int bn_mulx_adx_capable(void) {
// MULX is in BMI2.
return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
}
int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
OPENSSL_INLINE int bn_mul4x_mont_capable(size_t num) {
return (num >= 8) && ((num & 3) == 0);
}
int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
OPENSSL_INLINE int bn_mulx4x_mont_capable(size_t num) {
return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
}
int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
OPENSSL_INLINE int bn_sqr8x_mont_capable(size_t num) {
return (num >= 8) && ((num & 7) == 0);
}
int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#endif // !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#elif defined(OPENSSL_ARM)
OPENSSL_INLINE int bn_mul8x_mont_neon_capable(size_t num) {
return (num & 7) == 0 && CRYPTO_is_NEON_capable();
}
int bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#endif // defined(OPENSSL_X86_64)
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define OPENSSL_BN_ASM_MONT5
// bn_mul_mont_gather5 loads index |power| of |table|, multiplies it
// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
// words long and represented in Montgomery form. |n0| is a pointer to the
// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
// 16 bytes. |power| must be less than 32 and is treated as secret.
//
// WARNING: This function implements Almost Montgomery Multiplication from
// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
// However, even if they are fully reduced, the output may not be.
void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
const BN_ULONG *table, const BN_ULONG *np,
const BN_ULONG *n0, int num, int power);
// bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
// |table| are |num| words long. |power| must be less than 32 and is treated as
// public. |table| must be 32*|num| words long. |table| must be aligned to at
// least 16 bytes.
void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table,
size_t power);
// bn_gather5 loads index |power| of |table| and stores it in |out|. |out| and
// each entry of |table| are |num| words long. |power| must be less than 32 and
// is treated as secret. |table| must be aligned to at least 16 bytes.
void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power);
// bn_power5 squares |ap| five times and multiplies it by the value stored at
// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
// values are |num| words long and represented in Montgomery form. |n0| is a
// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
// by 8. |power| must be less than 32 and is treated as secret.
//
// WARNING: This function implements Almost Montgomery Multiplication from
// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
// However, even if they are fully reduced, the output may not be.
void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
uint64_t bn_mont_n0(const BIGNUM *n);
// bn_mont_ctx_set_RR_consttime initializes |mont->RR|. It returns one on
// success and zero on error. |mont->N| and |mont->n0| must have been
// initialized already. The bit width of |mont->N| is assumed public, but
// |mont->N| is otherwise treated as secret.
int bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx);
#if defined(_MSC_VER)
#if defined(OPENSSL_X86_64)
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
#elif defined(OPENSSL_AARCH64)
#define BN_UMULT_LOHI(low, high, a, b) \
do { \
const BN_ULONG _a = (a); \
const BN_ULONG _b = (b); \
(low) = _a * _b; \
(high) = __umulh(_a, _b); \
} while (0)
#endif
#endif // _MSC_VER
#if !defined(BN_ULLONG) && !defined(BN_UMULT_LOHI)
#error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
#endif
// bn_jacobi returns the Jacobi symbol of |a| and |b| (which is -1, 0 or 1), or
// -2 on error.
int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_is_bit_set_words returns one if bit |bit| is set in |a| and zero
// otherwise.
int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit);
// bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on
// success and zero on error. This function treats the bit width of the modulus
// as public.
int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_less_than_montgomery_R returns one if |bn| is less than the Montgomery R
// value for |mont| and zero otherwise.
int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont);
// bn_mod_u16_consttime returns |bn| mod |d|, ignoring |bn|'s sign bit. It runs
// in time independent of the value of |bn|, but it treats |d| as public.
OPENSSL_EXPORT uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d);
// bn_odd_number_is_obviously_composite returns one if |bn| is divisible by one
// of the first several odd primes and zero otherwise.
int bn_odd_number_is_obviously_composite(const BIGNUM *bn);
// A BN_MILLER_RABIN stores state common to each Miller-Rabin iteration. It is
// initialized within an existing |BN_CTX| scope and may not be used after
// that scope is released with |BN_CTX_end|. Field names match those in FIPS
// 186-4, section C.3.1.
// BN_MILLER_RABIN holds the per-candidate precomputed values shared by every
// Miller-Rabin iteration for a fixed |w| = |mont->N|. It is initialized by
// |bn_miller_rabin_init| within an existing |BN_CTX| scope; the |BIGNUM|
// fields are owned by that scope and become invalid after |BN_CTX_end|.
typedef struct {
  // w1 is w-1.
  BIGNUM *w1;
  // m is (w-1)/2^a, the odd part of w-1.
  BIGNUM *m;
  // one_mont is 1 (mod w) in Montgomery form.
  BIGNUM *one_mont;
  // w1_mont is w-1 (mod w) in Montgomery form.
  BIGNUM *w1_mont;
  // w_bits is BN_num_bits(w).
  int w_bits;
  // a is the largest integer such that 2^a divides w-1.
  int a;
} BN_MILLER_RABIN;
// bn_miller_rabin_init initializes |miller_rabin| for testing if |mont->N| is
// prime. It returns one on success and zero on error.
OPENSSL_EXPORT int bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin,
const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_miller_rabin_iteration performs one Miller-Rabin iteration, checking if
// |b| is a composite witness for |mont->N|. |miller_rabin| must have been
// initialized with |bn_miller_rabin_setup|. On success, it returns one and sets
// |*out_is_possibly_prime| to one if |mont->N| may still be prime or zero if
// |b| shows it is composite. On allocation or internal failure, it returns
// zero.
OPENSSL_EXPORT int bn_miller_rabin_iteration(
const BN_MILLER_RABIN *miller_rabin, int *out_is_possibly_prime,
const BIGNUM *b, const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_rshift1_words sets |r| to |a| >> 1, where both arrays are |num| words wide.
void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num);
// bn_rshift_words sets |r| to |a| >> |shift|, where both arrays are |num| words
// wide.
void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift,
size_t num);
// bn_rshift_secret_shift behaves like |BN_rshift| but runs in time independent
// of both |a| and |n|.
OPENSSL_EXPORT int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a,
unsigned n, BN_CTX *ctx);
// bn_reduce_once sets |r| to |a| mod |m| where 0 <= |a| < 2*|m|. It returns
// zero if |a| < |m| and a mask of all ones if |a| >= |m|. Each array is |num|
// words long, but |a| has an additional word specified by |carry|. |carry| must
// be zero or one, as implied by the bounds on |a|.
//
// |r|, |a|, and |m| may not alias. Use |bn_reduce_once_in_place| if |r| and |a|
// must alias.
BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry,
const BN_ULONG *m, size_t num);
// bn_reduce_once_in_place behaves like |bn_reduce_once| but acts in-place on
// |r|, using |tmp| as scratch space. |r|, |tmp|, and |m| may not alias.
BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m,
BN_ULONG *tmp, size_t num);
// Constant-time non-modular arithmetic.
//
// The following functions implement non-modular arithmetic in constant-time
// and pessimally set |r->width| to the largest possible word size.
//
// Note this means that, e.g., repeatedly multiplying by one will cause widths
// to increase without bound. The corresponding public API functions minimize
// their outputs to avoid regressing calculator consumers.
// bn_uadd_consttime behaves like |BN_uadd|, but it pessimally sets
// |r->width| = |a->width| + |b->width| + 1.
int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_usub_consttime behaves like |BN_usub|, but it pessimally sets
// |r->width| = |a->width|.
int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_abs_sub_consttime sets |r| to the absolute value of |a| - |b|, treating
// both inputs as secret. It returns one on success and zero on error.
OPENSSL_EXPORT int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a,
const BIGNUM *b, BN_CTX *ctx);
// bn_mul_consttime behaves like |BN_mul|, but it rejects negative inputs and
// pessimally sets |r->width| to |a->width| + |b->width|, to avoid leaking
// information about |a| and |b|.
int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_sqr_consttime behaves like |BN_sqr|, but it pessimally sets |r->width|
// to 2*|a->width|, to avoid leaking information about |a|.
int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
// bn_div_consttime behaves like |BN_div|, but it rejects negative inputs and
// treats both inputs, including their magnitudes, as secret. It is, as a
// result, much slower than |BN_div| and should only be used for rare operations
// where Montgomery reduction is not available. |divisor_min_bits| is a
// public lower bound for |BN_num_bits(divisor)|. When |divisor|'s bit width is
// public, this can speed up the operation.
//
// Note that |quotient->width| will be set pessimally to |numerator->width|.
OPENSSL_EXPORT int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
const BIGNUM *numerator,
const BIGNUM *divisor,
unsigned divisor_min_bits, BN_CTX *ctx);
// bn_is_relatively_prime checks whether GCD(|x|, |y|) is one. On success, it
// returns one and sets |*out_relatively_prime| to one if the GCD was one and
// zero otherwise. On error, it returns zero.
OPENSSL_EXPORT int bn_is_relatively_prime(int *out_relatively_prime,
const BIGNUM *x, const BIGNUM *y,
BN_CTX *ctx);
// bn_lcm_consttime sets |r| to LCM(|a|, |b|). It returns one on success and
// zero on error. |a| and |b| are both treated as secret.
OPENSSL_EXPORT int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_CTX *ctx);
// bn_mont_ctx_init zero-initializes |mont|.
void bn_mont_ctx_init(BN_MONT_CTX *mont);
// bn_mont_ctx_cleanup releases memory associated with |mont|, without freeing
// |mont| itself.
void bn_mont_ctx_cleanup(BN_MONT_CTX *mont);
// Constant-time modular arithmetic.
//
// The following functions implement basic constant-time modular arithmetic.
// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_add_consttime acts like |BN_mod_add_quick| but takes a |BN_CTX|.
int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m, BN_CTX *ctx);
// bn_mod_sub_words sets |r| to |a| - |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_sub_consttime acts like |BN_mod_sub_quick| but takes a |BN_CTX|.
int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m, BN_CTX *ctx);
// bn_mod_lshift1_consttime acts like |BN_mod_lshift1_quick| but takes a
// |BN_CTX|.
int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
BN_CTX *ctx);
// bn_mod_lshift_consttime acts like |BN_mod_lshift_quick| but takes a |BN_CTX|.
int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
BN_CTX *ctx);
// bn_mod_inverse_consttime sets |r| to |a|^-1, mod |n|. |a| must be non-
// negative and less than |n|. It returns one on success and zero on error. On
// failure, if the failure was caused by |a| having no inverse mod |n| then
// |*out_no_inverse| will be set to one; otherwise it will be set to zero.
//
// This function treats both |a| and |n| as secret, provided they are both non-
// zero and the inverse exists. It should only be used for even moduli where
// none of the less general implementations are applicable.
OPENSSL_EXPORT int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx);
// bn_mod_inverse_prime sets |out| to the modular inverse of |a| modulo |p|,
// computed with Fermat's Little Theorem. It returns one on success and zero on
// error. If |mont_p| is NULL, one will be computed temporarily.
int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| but uses
// |BN_mod_exp_mont_consttime| instead of |BN_mod_exp_mont| in hopes of
// protecting the exponent.
int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
// BN_MONT_CTX_set_locked takes |lock| and checks whether |*pmont| is NULL. If
// so, it creates a new |BN_MONT_CTX| and sets the modulus for it to |mod|. It
// then stores it as |*pmont|. It returns one on success and zero on error. Note
// this function assumes |mod| is public.
//
// If |*pmont| is already non-NULL then it does nothing and returns one.
int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
const BIGNUM *mod, BN_CTX *bn_ctx);
// Low-level operations for small numbers.
//
// The following functions implement algorithms suitable for use with scalars
// and field elements in elliptic curves. They rely on the number being small
// both to stack-allocate various temporaries and because they do not implement
// optimizations useful for the larger values used in RSA.
// BN_SMALL_MAX_WORDS is the largest size input these functions handle. This
// limit allows temporaries to be more easily stack-allocated. This limit is set
// to accommodate P-521.
#if defined(OPENSSL_32_BIT)
#define BN_SMALL_MAX_WORDS 17
#else
#define BN_SMALL_MAX_WORDS 9
#endif
// bn_mul_small sets |r| to |a|*|b|. |num_r| must be |num_a| + |num_b|. |r| may
// not alias with |a| or |b|.
void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
const BN_ULONG *b, size_t num_b);
// bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|.
// |num_r| must be |num_a|*2. |r| and |a| may not alias.
void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
// In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS|
// words long.
// bn_to_montgomery_small sets |r| to |a| translated to the Montgomery domain.
// |r| and |a| are |num| words long, which must be |mont->N.width|. |a| must be
// fully reduced and may alias |r|.
void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_MONT_CTX *mont);
// bn_from_montgomery_small sets |r| to |a| translated out of the Montgomery
// domain. |r| and |a| are |num_r| and |num_a| words long, respectively. |num_r|
// must be |mont->N.width|. |a| must be at most |mont->N|^2 and may alias |r|.
//
// Unlike most of these functions, only |num_r| is bounded by
// |BN_SMALL_MAX_WORDS|. |num_a| may exceed it, but must be at most 2 * |num_r|.
void bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
size_t num_a, const BN_MONT_CTX *mont);
// bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
// and outputs are in the Montgomery domain. Each array is |num| words long,
// which must be |mont->N.width|. Any two of |r|, |a|, and |b| may alias. |a|
// and |b| must be reduced on input.
void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, size_t num,
const BN_MONT_CTX *mont);
// bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. It returns one on
// success and zero on programmer or internal error. Both inputs and outputs are
// in the Montgomery domain. |r| and |a| are |num| words long, which must be
// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |num_p|, measured in bits,
// must fit in |size_t|. |a| must be fully-reduced. This function runs in time
// independent of |a|, but |p| and |mont->N| are public values. |a| must be
// fully-reduced and may alias with |r|.
//
// Note this function differs from |BN_mod_exp_mont| which uses Montgomery
// reduction but takes input and output outside the Montgomery domain. Combine
// this function with |bn_from_montgomery_small| and |bn_to_montgomery_small|
// if necessary.
void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_ULONG *p, size_t num_p,
const BN_MONT_CTX *mont);
// bn_mod_inverse0_prime_mont_small sets |r| to |a|^-1 mod |mont->N|. If |a| is
// zero, |r| is set to zero. |mont->N| must be a prime. |r| and |a| are |num|
// words long, which must be |mont->N.width| and at most |BN_SMALL_MAX_WORDS|.
// |a| must be fully-reduced and may alias |r|. This function runs in time
// independent of |a|, but |mont->N| is a public value.
void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a,
size_t num, const BN_MONT_CTX *mont);
// Word-based byte conversion functions.
// bn_big_endian_to_words interprets |in_len| bytes from |in| as a big-endian,
// unsigned integer and writes the result to |out_len| words in |out|. The output
// is in little-endian word order with |out[0]| being the least-significant word.
// |out_len| must be large enough to represent any |in_len|-byte value. That is,
// |in_len| must be at most |BN_BYTES * out_len|.
void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
size_t in_len);
// bn_words_to_big_endian represents |in_len| words from |in| (in little-endian
// word order) as a big-endian, unsigned integer in |out_len| bytes. It writes
// the result to |out|. |out_len| must be large enough to represent |in| without
// truncation.
//
// Note |out_len| may be less than |BN_BYTES * in_len| if |in| is known to have
// leading zeros.
void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
size_t in_len);
// bn_little_endian_to_words interprets |in_len| bytes from |in| as a little-endian,
// unsigned integer and writes the result to |out_len| words in |out|. The output
// is in little-endian word order with |out[0]| being the least-significant word.
// |out_len| must be large enough to represent any |in_len|-byte value. That is,
// |in_len| must be at most |BN_BYTES * out_len|.
void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, const size_t in_len);
// bn_words_to_little_endian represents |in_len| words from |in| (in little-endian
// word order) as a little-endian, unsigned integer in |out_len| bytes. It
// writes the result to |out|. |out_len| must be large enough to represent |in|
// without truncation.
//
// Note |out_len| may be less than |BN_BYTES * in_len| if |in| is known to have
// leading zeros.
void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_BN_INTERNAL_H

View File

@@ -0,0 +1,97 @@
// Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
// BN_lsw returns the least significant word of |n|, or zero for a zero-width
// |n|.
#define BN_lsw(n) (((n)->width == 0) ? (BN_ULONG) 0 : (n)->d[0])
// bn_jacobi computes the Jacobi symbol (a/b). |b| must be odd and positive;
// otherwise an error is pushed and -2 is returned. On success the result is
// -1, 0, or 1. -2 is also returned on allocation or internal failure.
int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  // In 'tab', only odd-indexed entries are relevant:
  // For any odd BIGNUM n,
  //     tab[BN_lsw(n) & 7]
  // is $(-1)^{(n^2-1)/8}$ (using TeX notation).
  // Note that the sign of n does not matter.
  static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
  // The Jacobi symbol is only defined for odd modulus.
  if (!BN_is_odd(b)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return -2;
  }
  // Require b be positive.
  if (BN_is_negative(b)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return -2;
  }
  int ret = -2;
  BN_CTX_start(ctx);
  BIGNUM *A = BN_CTX_get(ctx);
  BIGNUM *B = BN_CTX_get(ctx);
  // |BN_CTX_get| fails for all subsequent calls once one call fails, so
  // checking the last result covers |A| as well.
  if (B == NULL) {
    goto end;
  }
  // Work on copies so |a| and |b| are left untouched.
  if (!BN_copy(A, a) ||
      !BN_copy(B, b)) {
    goto end;
  }
  // Adapted from logic to compute the Kronecker symbol, originally implemented
  // according to Henri Cohen, "A Course in Computational Algebraic Number
  // Theory" (algorithm 1.4.10).
  ret = 1;
  while (1) {
    // Cohen's step 3:
    // B is positive and odd
    if (BN_is_zero(A)) {
      // (0/B) is 1 when B == 1 and 0 otherwise.
      ret = BN_is_one(B) ? ret : 0;
      goto end;
    }
    // now A is non-zero
    // Strip the factors of two from A; |i| counts them.
    int i = 0;
    while (!BN_is_bit_set(A, i)) {
      i++;
    }
    if (!BN_rshift(A, A, i)) {
      ret = -2;
      goto end;
    }
    if (i & 1) {
      // i is odd
      // multiply 'ret' by $(-1)^{(B^2-1)/8}$
      ret = ret * tab[BN_lsw(B) & 7];
    }
    // Cohen's step 4 (quadratic reciprocity):
    // multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$
    // For negative A, ~BN_lsw(A) adjusts the low bits so bit 1 reflects
    // A mod 4 correctly.
    if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) {
      ret = -ret;
    }
    // (A, B) := (B mod |A|, |A|)
    if (!BN_nnmod(B, B, A, ctx)) {
      ret = -2;
      goto end;
    }
    // Swap the roles of A and B and force the new B positive.
    BIGNUM *tmp = A;
    A = B;
    B = tmp;
    tmp->neg = 0;
  }
end:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,550 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/thread.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../cpucap/internal.h"
#include "../../internal.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE) || \
defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD) || \
defined(OPENSSL_NETBSD) ) && \
defined(OPENSSL_AARCH64) && defined(OPENSSL_BN_ASM_MONT)
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#define BN_MONTGOMERY_S2N_BIGNUM_CAPABLE 1
// montgomery_use_s2n_bignum returns one if the s2n-bignum Montgomery routines
// should be used for a modulus of |num| words, and zero otherwise.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  // Use s2n-bignum's functions only if
  // (1) The ARM architecture has slow multipliers, and
  // (2) num (which is the number of words) is a multiple of 8, because
  //     s2n-bignum's bignum_emontredc_8n requires it, and
  // (3) The word size is 64 bits.
  // (4) CPU has NEON.
  // The temporary-buffer sizing in montgomery_s2n_bignum_mul_mont relies on
  // KMUL_32_64 needing the largest scratch area; assert that here.
  assert(S2NBIGNUM_KSQR_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KSQR_32_64_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KMUL_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS);
  assert(BN_BITS2 == 64);
  return !CRYPTO_is_ARMv8_wide_multiplier_capable() &&
         (num % 8 == 0) &&
         CRYPTO_is_NEON_capable();
}
#else
// Fallback when s2n-bignum is unavailable on this platform: never use it.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  return 0;
}
#endif
// bn_mont_ctx_init zero-initializes |mont| and puts its embedded BIGNUMs into
// a valid empty state.
void bn_mont_ctx_init(BN_MONT_CTX *mont) {
  OPENSSL_memset(mont, 0, sizeof(*mont));
  BN_init(&mont->RR);
  BN_init(&mont->N);
}
// bn_mont_ctx_cleanup releases the BIGNUMs embedded in |mont|. |mont| itself
// is owned by the caller and is not freed.
void bn_mont_ctx_cleanup(BN_MONT_CTX *mont) {
  BN_free(&mont->N);
  BN_free(&mont->RR);
}
// BN_MONT_CTX_new allocates and initializes an empty Montgomery context. It
// returns NULL on allocation failure; the caller frees the result with
// |BN_MONT_CTX_free|.
BN_MONT_CTX *BN_MONT_CTX_new(void) {
  BN_MONT_CTX *mont = OPENSSL_malloc(sizeof(BN_MONT_CTX));
  if (mont != NULL) {
    bn_mont_ctx_init(mont);
  }
  return mont;
}
// BN_MONT_CTX_free releases |mont|'s contents and |mont| itself. A NULL
// argument is a no-op.
void BN_MONT_CTX_free(BN_MONT_CTX *mont) {
  if (mont != NULL) {
    bn_mont_ctx_cleanup(mont);
    OPENSSL_free(mont);
  }
}
// BN_MONT_CTX_copy copies |from| into |to| and returns |to|, or NULL on
// allocation failure. Copying a context onto itself is a no-op.
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, const BN_MONT_CTX *from) {
  if (to == from) {
    return to;
  }
  if (BN_copy(&to->RR, &from->RR) == NULL ||
      BN_copy(&to->N, &from->N) == NULL) {
    return NULL;
  }
  // n0 is a plain two-word array; copy it directly.
  to->n0[0] = from->n0[0];
  to->n0[1] = from->n0[1];
  return to;
}
// bn_mont_ctx_set_N_and_n0 validates |mod| (odd, positive, non-zero, within
// the Montgomery size limit), stores it in |mont->N|, and computes |mont->n0|
// such that n0 * N == -1 (mod 2^64). It does not compute |mont->RR|. It
// returns one on success and zero on error.
static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) {
  if (BN_is_zero(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  // Montgomery reduction requires an odd modulus (R = 2^k must be invertible
  // mod N).
  if (!BN_is_odd(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return 0;
  }
  if (BN_is_negative(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (!bn_fits_in_words(mod, BN_MONTGOMERY_MAX_WORDS)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  // Save the modulus.
  if (!BN_copy(&mont->N, mod)) {
    OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
    return 0;
  }
  // |mont->N| is always stored minimally. Computing RR efficiently leaks the
  // size of the modulus. While the modulus may be private in RSA (one of the
  // primes), their sizes are public, so this is fine.
  bn_set_minimal_width(&mont->N);
  // Find n0 such that n0 * N == -1 (mod r).
  //
  // Only certain BN_BITS2<=32 platforms actually make use of n0[1]. For the
  // others, we could use a shorter R value and use faster |BN_ULONG|-based
  // math instead of |uint64_t|-based math, which would be double-precision.
  // However, currently only the assembler files know which is which.
  OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
                        BN_MONT_CTX_N0_LIMBS_value_is_invalid)
  OPENSSL_STATIC_ASSERT(
      sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
      uint64_t_is_insufficient_precision_for_n0);
  uint64_t n0 = bn_mont_n0(&mont->N);
  // Split the 64-bit n0 across the limb array; on 64-bit platforms the second
  // limb is unused and cleared.
  mont->n0[0] = (BN_ULONG)n0;
#if BN_MONT_CTX_N0_LIMBS == 2
  mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2);
#else
  mont->n0[1] = 0;
#endif
  return 1;
}
// BN_MONT_CTX_set configures |mont| for the modulus |mod|: it stores |mod| and
// n0, then computes RR = R^2 (mod N). The bit width of |mod| is treated as
// public. It returns one on success and zero on error.
int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
  if (!bn_mont_ctx_set_N_and_n0(mont, mod)) {
    return 0;
  }
  // Allocate a temporary |BN_CTX| if the caller did not supply one.
  BN_CTX *new_ctx = NULL;
  if (ctx == NULL) {
    new_ctx = BN_CTX_new();
    if (new_ctx == NULL) {
      return 0;
    }
    ctx = new_ctx;
  }
  // Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS2 such that R
  // > mod. Even though the assembly on some 32-bit platforms works with 64-bit
  // values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS *
  // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
  // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
  unsigned lgBigR = mont->N.width * BN_BITS2;
  BN_zero(&mont->RR);
  // RR = 2^(2 * lgBigR) mod N, then pad to exactly |N.width| words.
  int ok = BN_set_bit(&mont->RR, lgBigR * 2) &&
           BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) &&
           bn_resize_words(&mont->RR, mont->N.width);
  BN_CTX_free(new_ctx);
  return ok;
}
// BN_MONT_CTX_new_for_modulus allocates a Montgomery context configured for
// |mod|, or returns NULL on error. |mod|'s bit width is treated as public.
BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) {
  BN_MONT_CTX *mont = BN_MONT_CTX_new();
  if (mont != NULL && BN_MONT_CTX_set(mont, mod, ctx)) {
    return mont;
  }
  // |BN_MONT_CTX_free| tolerates NULL, so this covers both failure paths.
  BN_MONT_CTX_free(mont);
  return NULL;
}
// BN_MONT_CTX_new_consttime behaves like |BN_MONT_CTX_new_for_modulus| but
// computes RR with the constant-time path, treating |mod| (beyond its bit
// width) as secret. Returns NULL on error.
BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) {
  BN_MONT_CTX *mont = BN_MONT_CTX_new();
  if (mont != NULL &&
      bn_mont_ctx_set_N_and_n0(mont, mod) &&
      bn_mont_ctx_set_RR_consttime(mont, ctx)) {
    return mont;
  }
  BN_MONT_CTX_free(mont);
  return NULL;
}
// BN_MONT_CTX_set_locked lazily initializes |*pmont| for |mod| under |lock|
// using double-checked locking: the fast path takes only the read lock; the
// slow path re-checks under the write lock so exactly one thread constructs
// the context. |mod| is assumed public. Returns one on success, zero on error.
int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                           const BIGNUM *mod, BN_CTX *bn_ctx) {
  // Fast path: the context already exists.
  CRYPTO_MUTEX_lock_read(lock);
  BN_MONT_CTX *ctx = *pmont;
  CRYPTO_MUTEX_unlock_read(lock);
  if (ctx) {
    return 1;
  }
  // Slow path: re-check under the write lock in case another thread won the
  // race between the unlock above and here.
  CRYPTO_MUTEX_lock_write(lock);
  if (*pmont == NULL) {
    *pmont = BN_MONT_CTX_new_for_modulus(mod, bn_ctx);
  }
  const int ok = *pmont != NULL;
  CRYPTO_MUTEX_unlock_write(lock);
  return ok;
}
// BN_to_montgomery converts |a| into the Montgomery domain: multiplying by
// RR = R^2 with Montgomery multiplication leaves a * R (mod N) in |ret|.
// Returns one on success and zero on error.
int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
                     BN_CTX *ctx) {
  return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx);
}
// bn_from_montgomery_in_place sets |r| to |a| * R^-1 (mod |mont->N|), where
// R = 2^(N.width * BN_BITS2). |a| is used as scratch and clobbered. |num_r|
// must equal |mont->N.width| and |num_a| must be twice that. Returns one on
// success and zero if the sizes are wrong.
static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a,
                                       size_t num_a, const BN_MONT_CTX *mont) {
  const BN_ULONG *n = mont->N.d;
  size_t num_n = mont->N.width;
  if (num_r != num_n || num_a != 2 * num_n) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
  // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
  // includes |carry| which is stored separately.
  BN_ULONG n0 = mont->n0[0];
  BN_ULONG carry = 0;
  for (size_t i = 0; i < num_n; i++) {
    // a[i] * n0 is the multiplier that makes word |i| of the running sum
    // zero; |v| is the product's carry word out of the low half.
    BN_ULONG v = bn_mul_add_words(a + i, n, num_n, a[i] * n0);
    v += carry + a[i + num_n];
    // Constant-time carry-out of the addition above: carry becomes one if the
    // sum wrapped (v < a[i + num_n]), zero if it clearly did not
    // (v > a[i + num_n]), and keeps its previous value when they are equal.
    carry |= (v != a[i + num_n]);
    carry &= (v <= a[i + num_n]);
    a[i + num_n] = v;
  }
  // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a|
  // includes |carry| which is stored separately.
  a += num_n;
  // |a| thus requires at most one additional subtraction |n| to be reduced.
  bn_reduce_once(r, a, carry, n, num_n);
  return 1;
}
// BN_from_montgomery_word sets |ret| to |r| * R^-1 (mod |mont->N|), taking
// |r| out of the Montgomery domain. |r| must be non-negative and is clobbered
// (it is resized and consumed as scratch). Returns one on success and zero on
// error.
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
                                   const BN_MONT_CTX *mont) {
  if (r->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  const BIGNUM *n = &mont->N;
  // A zero-width modulus admits only the zero value.
  if (n->width == 0) {
    ret->width = 0;
    return 1;
  }
  // The in-place reduction needs |r| padded to twice the modulus width.
  int max = 2 * n->width;  // carry is stored separately
  if (!bn_resize_words(r, max) ||
      !bn_wexpand(ret, n->width)) {
    return 0;
  }
  ret->width = n->width;
  ret->neg = 0;
  return bn_from_montgomery_in_place(ret->d, ret->width, r->d, r->width, mont);
}
// BN_from_montgomery sets |r| to |a| * R^-1 (mod |mont->N|), converting |a|
// out of the Montgomery domain. |a| itself is left untouched. Returns one on
// success and zero on error.
int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont,
                       BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Reduce a scratch copy, because the word-level reduction clobbers its
  // input.
  BIGNUM *copy = BN_CTX_get(ctx);
  int ok = copy != NULL &&
           BN_copy(copy, a) != NULL &&
           BN_from_montgomery_word(r, copy, mont);
  BN_CTX_end(ctx);
  return ok;
}
// bn_one_to_montgomery sets |r| to one in Montgomery form, i.e. R (mod N).
// Returns one on success and zero on error. The bit width of the modulus is
// treated as public.
int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx) {
  // If the high bit of |n| is set, R = 2^(width*BN_BITS2) < 2 * |n|, so we
  // compute R - |n| rather than perform Montgomery reduction.
  const BIGNUM *n = &mont->N;
  if (n->width > 0 && (n->d[n->width - 1] >> (BN_BITS2 - 1)) != 0) {
    if (!bn_wexpand(r, n->width)) {
      return 0;
    }
    // R - |n| is the two's complement of |n| over width*BN_BITS2 bits:
    // negate the low word and complement the rest. (The borrow out of word 0
    // is unconditional because a Montgomery modulus is odd, so d[0] != 0.)
    r->d[0] = 0 - n->d[0];
    for (int i = 1; i < n->width; i++) {
      r->d[i] = ~n->d[i];
    }
    r->width = n->width;
    r->neg = 0;
    return 1;
  }
  // Otherwise, RR * R^-1 = R (mod N) via one Montgomery reduction.
  return BN_from_montgomery(r, &mont->RR, mont, ctx);
}
// bn_mod_mul_montgomery_fallback computes |r| = |a| * |b| * R^-1 mod |N| with
// a full-width multiply (or square) followed by Montgomery reduction. It is
// the generic path when no assembly Montgomery multiplication applies.
static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
                                          const BIGNUM *b,
                                          const BN_MONT_CTX *mont,
                                          BN_CTX *ctx) {
  int ok = 0;
  BN_CTX_start(ctx);
  BIGNUM *product = BN_CTX_get(ctx);
  if (product == NULL) {
    goto done;
  }

  // Compute the double-width product, using the dedicated squaring path when
  // both inputs alias.
  int mul_ok = (a == b) ? bn_sqr_consttime(product, a, ctx)
                        : bn_mul_consttime(product, a, b, ctx);
  if (!mul_ok) {
    goto done;
  }

  // Reduce from aRR down to aR.
  ok = BN_from_montgomery_word(r, product, mont);

done:
  BN_CTX_end(ctx);
  return ok;
}
#if defined(OPENSSL_BN_ASM_MONT)
// Perform montgomery multiplication using s2n-bignum functions. The arguments
// are equivalent to the arguments of bn_mul_mont.
// montgomery_s2n_bignum_mul_mont works only if num is a multiple of 8.
// montgomery_use_s2n_bignum(num) must be called in advance to check this
// condition, as well as other s2n-bignum requirements.
// For num = 32 or num = 16, this uses faster primitives in s2n-bignum.
// montgomery_s2n_bignum_mul_mont allocates S2NBIGNUM_KMUL_32_64_TEMP_NWORDS +
// 2 * BN_MONTGOMERY_MAX_WORDS uint64_t words at the stack.
static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
                                           const BN_ULONG *bp,
                                           const BN_ULONG *np,
                                           const BN_ULONG *n0, size_t num) {
#if defined(BN_MONTGOMERY_S2N_BIGNUM_CAPABLE)
  // t is a temporary buffer used by Karatsuba multiplication.
  // bignum_kmul_32_64 requires S2NBIGNUM_KMUL_32_64_TEMP_NWORDS words.
  uint64_t t[S2NBIGNUM_KMUL_32_64_TEMP_NWORDS];
  // mulres is the output buffer of big-int multiplication which uses
  // 2 * num elements of mulres. Note that num <= BN_MONTGOMERY_MAX_WORDS
  // is guaranteed by the caller (BN_mod_mul_montgomery).
  uint64_t mulres[2 * BN_MONTGOMERY_MAX_WORDS];
  // Given m the prime number stored at np, m * w = -1 mod 2^64.
  uint64_t w = n0[0];

  // Compute the full 2*num-word product of |ap| and |bp| into |mulres|,
  // selecting the specialized Karatsuba kernels for num = 32 and num = 16 and
  // the generic routines otherwise. Squaring (ap == bp) has dedicated entry
  // points.
  if (num == 32) {
    if (ap == bp) {
      bignum_ksqr_32_64(mulres, ap, t);
    } else {
      bignum_kmul_32_64(mulres, ap, bp, t);
    }
  } else if (num == 16) {
    if (ap == bp) {
      bignum_ksqr_16_32(mulres, ap, t);
    } else {
      bignum_kmul_16_32(mulres, ap, bp, t);
    }
  } else {
    if (ap == bp) {
      bignum_sqr(num * 2, mulres, num, ap);
    } else {
      bignum_mul(num * 2, mulres, num, ap, num, bp);
    }
  }

  // Do montgomery reduction. We follow the definition of montgomery reduction
  // which is:
  // 1. Calculate (mulres + ((mulres mod R) * (-m^-1 mod R) mod R) * m) / R
  //    using bignum_emontredc_8n, where R is 2^(64*num).
  //    The calculated result is stored in [mulres+num ... mulres+2*num-1]. If
  //    the result >= 2^(64*num), bignum_emontredc_8n returns 1.
  // 2. Optionally subtract the result if the (result of step 1) >= m.
  //    The comparison is true if either A or B holds:
  //    A. The result of step 1 >= 2^(64*num), meaning that bignum_emontredc_8n
  //       returned 1. Since m is less than 2^(64*num), (result of step 1) >= m
  //       holds.
  //    B. The result of step 1 fits in 2^(64*num), and the result >= m.
  uint64_t c = bignum_emontredc_8n(num, mulres, np, w);  // c: case A
  c |= bignum_ge(num, mulres + num, num, np);            // c: case B
  // Optionally subtract and store the result at rp
  bignum_optsub(num, rp, mulres + num, c, np);
#else
  // Should not call this function unless s2n-bignum is supported.
  abort();
#endif
}
#endif
// BN_mod_mul_montgomery computes |r| = |a| * |b| * R^-1 mod |mont->N| for
// non-negative inputs (typically already in Montgomery form). It dispatches
// to an assembly implementation when available and the operand widths match
// the modulus, and otherwise falls back to multiply-then-reduce. Returns one
// on success and zero on error.
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
  if (a->neg || b->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }

#if defined(OPENSSL_BN_ASM_MONT)
  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
  int num = mont->N.width;
  if (num >= (128 / BN_BITS2) &&
      a->width == num &&
      b->width == num) {
    if (!bn_wexpand(r, num)) {
      return 0;
    }
    // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
    // allocates |num| words on the stack, so |num| cannot be too large.
    assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
    if (montgomery_use_s2n_bignum(num)) {
      // Do montgomery multiplication using s2n-bignum.
      montgomery_s2n_bignum_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0,
                                     num);
    } else {
      if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
        // The check above ensures this won't happen.
        assert(0);
        OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
        return 0;
      }
    }
    r->neg = 0;
    r->width = num;
    return 1;
  }
#endif

  // No assembly path fits (small modulus, or inputs not at the modulus
  // width); use the generic fallback.
  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
}
// bn_less_than_montgomery_R returns one if |bn| is non-negative and fits in
// |mont->N.width| words, i.e. is strictly below R = 2^(width * BN_BITS2), and
// zero otherwise.
int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) {
  if (BN_is_negative(bn)) {
    return 0;
  }
  return bn_fits_in_words(bn, mont->N.width);
}
// bn_to_montgomery_small converts |a| into Montgomery form by multiplying by
// RR = R^2 mod N, yielding |r| = |a| * R mod N. |r| and |a| are |num| words;
// |num| must equal |mont->N.width| (enforced, via abort, by
// |bn_mod_mul_montgomery_small|).
void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
                            const BN_MONT_CTX *mont) {
  bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont);
}
// bn_from_montgomery_small sets |r| = |a| * R^-1 mod N, Montgomery-reducing
// the |num_a|-word input (at most twice the modulus width) into the
// |num_r|-word output, where |num_r| must equal |mont->N.width| and be at
// most |BN_SMALL_MAX_WORDS|. Size mismatches indicate caller bugs and abort.
void bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                              size_t num_a, const BN_MONT_CTX *mont) {
  const size_t width = (size_t)mont->N.width;
  if (num_r != width || num_r > BN_SMALL_MAX_WORDS || num_a > 2 * num_r) {
    abort();
  }

  // Zero-extend |a| into a double-width scratch buffer; the in-place
  // reduction consumes exactly 2 * |num_r| words and clobbers them.
  BN_ULONG scratch[BN_SMALL_MAX_WORDS * 2] = {0};
  OPENSSL_memcpy(scratch, a, num_a * sizeof(BN_ULONG));
  if (!bn_from_montgomery_in_place(r, num_r, scratch, 2 * num_r, mont)) {
    abort();
  }

  // Scrub the scratch buffer; it may hold secret intermediates.
  OPENSSL_cleanse(scratch, 2 * num_r * sizeof(BN_ULONG));
}
// bn_mod_mul_montgomery_small sets |r| = |a| * |b| * R^-1 mod N for
// fixed-width, stack-allocated operands. |r|, |a|, and |b| are all |num|
// words, where |num| must equal |mont->N.width| and be at most
// |BN_SMALL_MAX_WORDS|; any mismatch aborts.
void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
                                 const BN_ULONG *b, size_t num,
                                 const BN_MONT_CTX *mont) {
  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
    abort();
  }

#if defined(OPENSSL_BN_ASM_MONT)
  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
  if (num >= (128 / BN_BITS2)) {
    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
      abort();  // The check above ensures this won't happen.
    }
    return;
  }
#endif

  // Compute the product.
  BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS];
  if (a == b) {
    bn_sqr_small(tmp, 2 * num, a, num);
  } else {
    bn_mul_small(tmp, 2 * num, a, num, b, num);
  }

  // Reduce.
  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
    abort();
  }
  // |tmp| may hold secret intermediates; scrub it before returning.
  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
}
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
// bn_mul_mont dispatches x86-64 Montgomery multiplication to the fastest
// available implementation whose requirements |num| (and the CPU) satisfy,
// falling back to the generic scalar routine. The newer kernels are gated on
// assembler support for the required instructions.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, size_t num)
{
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Use the dedicated squaring kernel when both operands alias.
  if (ap == bp && bn_sqr8x_mont_capable(num)) {
    return bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
  }
  if (bn_mulx4x_mont_capable(num)) {
    return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
  }
#endif  // !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  if (bn_mul4x_mont_capable(num)) {
    return bn_mul4x_mont(rp, ap, bp, np, n0, num);
  }
  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM)
// bn_mul_mont dispatches Montgomery multiplication on 32-bit ARM: the NEON
// kernel when it supports |num|, and the generic scalar path otherwise.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
  if (!bn_mul8x_mont_neon_capable(num)) {
    return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
  }
  return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
}
#endif

View File

@@ -0,0 +1,212 @@
// Copyright 2016 Brian Smith.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
#include "../../internal.h"
// Forward declaration; the implementation is below.
static uint64_t bn_neg_inv_mod_r_u64(uint64_t n);

// |n0| is stored in |BN_MONT_CTX_N0_LIMBS| limbs but computed here as a
// |uint64_t|, so the two representations must cover exactly the same number
// of bits.
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
                      BN_MONT_CTX_N0_LIMBS_value_is_invalid)
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS ==
                          sizeof(uint64_t),
                      uint64_t_is_insufficient_precision_for_n0)

// LG_LITTLE_R is log_2(r).
#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
// bn_mont_n0 returns the Montgomery constant n0 = -1/n mod r for the odd,
// positive modulus |n|, where r = 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2).
uint64_t bn_mont_n0(const BIGNUM *n) {
  // These conditions are checked by the caller, |BN_MONT_CTX_set| or
  // |BN_MONT_CTX_new_consttime|.
  assert(!BN_is_zero(n));
  assert(!BN_is_negative(n));
  assert(BN_is_odd(n));

  // r == 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2) and LG_LITTLE_R == lg(r). This
  // ensures that we can do integer division by |r| by simply ignoring
  // |BN_MONT_CTX_N0_LIMBS| limbs. Similarly, we can calculate values modulo
  // |r| by just looking at the lowest |BN_MONT_CTX_N0_LIMBS| limbs. This is
  // what makes Montgomery multiplication efficient.
  //
  // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography
  // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a
  // multi-limb Montgomery multiplication of |a * b (mod n)|, given the
  // unreduced product |t == a * b|, we repeatedly calculate:
  //
  //    t1 := t % r       |t1| is |t|'s lowest limb (see previous paragraph).
  //    t2 := t1*n0*n
  //    t3 := t + t2
  //    t  := t3 / r      copy all limbs of |t3| except the lowest to |t|.
  //
  // In the last step, it would only make sense to ignore the lowest limb of
  // |t3| if it were zero. The middle steps ensure that this is the case:
  //
  //                      t3 ==  0 (mod r)
  //                  t + t2 ==  0 (mod r)
  //             t + t1*n0*n ==  0 (mod r)
  //                 t1*n0*n == -t (mod r)
  //                  t*n0*n == -t (mod r)
  //                    n0*n == -1 (mod r)
  //                      n0 == -1/n (mod r)
  //
  // Thus, in each iteration of the loop, we multiply by the constant factor
  // |n0|, the negative inverse of n (mod r).

  // n_mod_r = n % r. As explained above, this is done by taking the lowest
  // |BN_MONT_CTX_N0_LIMBS| limbs of |n|.
  uint64_t n_mod_r = n->d[0];
#if BN_MONT_CTX_N0_LIMBS == 2
  if (n->width > 1) {
    n_mod_r |= (uint64_t)n->d[1] << BN_BITS2;
  }
#endif

  return bn_neg_inv_mod_r_u64(n_mod_r);
}
// bn_neg_inv_mod_r_u64 calculates -1/n mod r; i.e. it calculates |v| such
// that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| must
// be odd.
//
// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery
// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf).
// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and
// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000"
// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21).
//
// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion"
// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is
// constant-time with respect to |n|. We assume uint64_t additions,
// subtractions, shifts, and bitwise operations are all constant time, which
// may be a large leap of faith on 32-bit targets. We avoid division and
// multiplication, which tend to be the most problematic in terms of timing
// leaks.
//
// Most GCD implementations return values such that |u*r + v*n == 1|, so the
// caller would have to negate the resultant |v| for the purpose of Montgomery
// multiplication. This implementation does the negation implicitly by doing
// the computations as a difference instead of a sum.
static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) {
  assert(n % 2 == 1);

  // alpha == 2**(lg r - 1) == r / 2.
  static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1);

  const uint64_t beta = n;

  uint64_t u = 1;
  uint64_t v = 0;

  // The invariant maintained from here on is:
  //   2**(lg r - i) == u*2*alpha - v*beta.
  for (size_t i = 0; i < LG_LITTLE_R; ++i) {
#if BN_BITS2 == 64 && defined(BN_ULLONG)
    assert((BN_ULLONG)(1) << (LG_LITTLE_R - i) ==
           ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

    // Delete a common factor of 2 in u and v if |u| is even. Otherwise, set
    // |u = (u + beta) / 2| and |v = (v / 2) + alpha|.

    uint64_t u_is_odd = UINT64_C(0) - (u & 1);  // Either 0xff..ff or 0.

    // The addition can overflow, so use Dietz's method for it.
    //
    // Dietz calculates (x+y)/2 by (x⊕y)>>1 + x&y. This is valid for all
    // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values
    // (embedded in 64 bits so that overflow can be ignored):
    //
    //    (declare-fun x () (_ BitVec 64))
    //    (declare-fun y () (_ BitVec 64))
    //    (assert (let (
    //       (one (_ bv1 64))
    //       (thirtyTwo (_ bv32 64)))
    //       (and
    //         (bvult x (bvshl one thirtyTwo))
    //         (bvult y (bvshl one thirtyTwo))
    //         (not (=
    //           (bvadd (bvlshr (bvxor x y) one) (bvand x y))
    //           (bvlshr (bvadd x y) one)))
    //     )))
    //    (check-sat)
    uint64_t beta_if_u_is_odd = beta & u_is_odd;  // Either |beta| or 0.
    u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd);

    uint64_t alpha_if_u_is_odd = alpha & u_is_odd;  // Either |alpha| or 0.
    v = (v >> 1) + alpha_if_u_is_odd;
  }

  // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha.
#if BN_BITS2 == 64 && defined(BN_ULLONG)
  declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

  return v;
}
// bn_mont_ctx_set_RR_consttime computes |mont->RR| = R^2 mod |mont->N|, where
// R = 2^(mont->N.width * BN_BITS2), taking time that depends only on the
// public width of the modulus. Returns one on success and zero on error.
int bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx) {
  assert(!BN_is_zero(&mont->N));
  assert(!BN_is_negative(&mont->N));
  assert(BN_is_odd(&mont->N));
  assert(bn_minimal_width(&mont->N) == mont->N.width);

  unsigned n_bits = BN_num_bits(&mont->N);
  assert(n_bits != 0);
  if (n_bits == 1) {
    // |N| is one (it is positive and odd), so everything is congruent to zero.
    BN_zero(&mont->RR);
    return bn_resize_words(&mont->RR, mont->N.width);
  }

  unsigned lgBigR = mont->N.width * BN_BITS2;
  assert(lgBigR >= n_bits);

  // RR is R, or 2^lgBigR, in the Montgomery domain. We can compute 2 in the
  // Montgomery domain, 2R or 2^(lgBigR+1), and then use Montgomery
  // square-and-multiply to exponentiate.
  //
  // The square steps take 2^n R to (2^n)*(2^n) R = 2^2n R. This is the same as
  // doubling 2^n R, n times (doubling any x, n times, computes 2^n * x). When n
  // is below some threshold, doubling is faster; when above, squaring is
  // faster. From benchmarking various 32-bit and 64-bit architectures, the word
  // count seems to work well as a threshold. (Doubling scales linearly and
  // Montgomery reduction scales quadratically, so the threshold should scale
  // roughly linearly.)
  //
  // The multiply steps take 2^n R to 2*2^n R = 2^(n+1) R. It is faster to
  // double the value instead, so the square-and-multiply exponentiation would
  // become square-and-double. However, when using the word count as the
  // threshold, it turns out that no multiply/double steps will be needed at
  // all, because squaring any x, i times, computes x^(2^i):
  //
  //   (2^threshold)^(2^BN_BITS2_LG) R
  //   (2^mont->N.width)^BN_BITS2 R
  // = 2^(mont->N.width*BN_BITS2) R
  // = 2^lgBigR R
  // = RR
  int threshold = mont->N.width;

  // Calculate 2^threshold R = 2^(threshold + lgBigR) by doubling. The
  // first n_bits - 1 doubles can be skipped because we don't need to reduce.
  if (!BN_set_bit(&mont->RR, n_bits - 1) ||
      !bn_mod_lshift_consttime(&mont->RR, &mont->RR,
                               threshold + (lgBigR - (n_bits - 1)),
                               &mont->N, ctx)) {
    return 0;
  }

  // The above steps are the same regardless of the threshold. The steps below
  // need to be modified if the threshold changes.
  assert(threshold == mont->N.width);
  // Square BN_BITS2_LG times, which raises 2^threshold R to the BN_BITS2
  // power, yielding RR as shown in the derivation above.
  for (unsigned i = 0; i < BN_BITS2_LG; i++) {
    if (!BN_mod_mul_montgomery(&mont->RR, &mont->RR, &mont->RR, mont, ctx)) {
      return 0;
    }
  }

  return bn_resize_words(&mont->RR, mont->N.width);
}

View File

@@ -0,0 +1,692 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
#define BN_MUL_RECURSIVE_SIZE_NORMAL 16
#define BN_SQR_RECURSIVE_SIZE_NORMAL BN_MUL_RECURSIVE_SIZE_NORMAL
// bn_abs_sub_words sets |r| to |a - b|, in absolute value, over |num| words.
// |tmp| is |num| words of scratch. Both differences are computed and the
// correct one is chosen by mask, so the memory-access pattern does not depend
// on which operand is larger.
static void bn_abs_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             size_t num, BN_ULONG *tmp) {
  // |tmp| = a - b; |borrowed| is one exactly when a < b.
  BN_ULONG borrowed = bn_sub_words(tmp, a, b, num);
  // |r| = b - a, the value we want when a < b.
  bn_sub_words(r, b, a, num);
  // All-ones mask when a < b: keep |r| in that case, otherwise take |tmp|.
  BN_ULONG mask = 0 - borrowed;
  bn_select_words(r, mask, r, tmp, num);
}
// bn_mul_normal computes the full |na| + |nb| word product of |a| (|na|
// words) and |b| (|nb| words) into |r| by schoolbook multiplication. |r| must
// not alias the inputs (callers such as |bn_mul_impl| arrange this).
static void bn_mul_normal(BN_ULONG *r, const BN_ULONG *a, size_t na,
                          const BN_ULONG *b, size_t nb) {
  // Ensure |a| is the longer operand so the inner word loops (over |a|) are
  // the long ones.
  if (na < nb) {
    size_t itmp = na;
    na = nb;
    nb = itmp;
    const BN_ULONG *ltmp = a;
    a = b;
    b = ltmp;
  }
  // |rr| points one word past the current |na|-word window of |r|; each row's
  // carry-out word lands there.
  BN_ULONG *rr = &(r[na]);
  if (nb == 0) {
    // One operand is zero-width; the |na|-word product is zero.
    OPENSSL_memset(r, 0, na * sizeof(BN_ULONG));
    return;
  }

  // Row 0 initializes |r|; subsequent rows accumulate into it. The row loop
  // is unrolled four at a time, advancing the window pointers between groups.
  rr[0] = bn_mul_words(r, a, na, b[0]);

  for (;;) {
    if (--nb == 0) {
      return;
    }
    rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
    if (--nb == 0) {
      return;
    }
    rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
    if (--nb == 0) {
      return;
    }
    rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
    if (--nb == 0) {
      return;
    }
    rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
    rr += 4;
    r += 4;
    b += 4;
  }
}
// bn_sub_part_words sets |r| to |a| - |b| and returns the borrow bit: one if
// the subtraction underflowed and zero otherwise. |cl| is the common length
// (the shorter of len(a) and len(b), non-negative) and |dl| is the delta
// length, len(a) - len(b). |r| has cl + abs(dl) words; words beyond the
// shorter operand are treated as zero.
//
// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
// is confusing.
static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                  const BN_ULONG *b, int cl, int dl) {
  assert(cl >= 0);

  // Subtract the common-length prefix.
  BN_ULONG borrow = bn_sub_words(r, a, b, cl);
  if (dl == 0) {
    return borrow;
  }

  // Continue past the prefix, padding the shorter operand with zeros.
  r += cl;
  a += cl;
  b += cl;
  if (dl > 0) {
    // |a| has |dl| extra words.
    for (int i = 0; i < dl; i++) {
      r[i] = CRYPTO_subc_w(a[i], 0, borrow, &borrow);
    }
  } else {
    // |b| has -|dl| extra words.
    for (int i = 0; i < -dl; i++) {
      r[i] = CRYPTO_subc_w(0, b[i], borrow, &borrow);
    }
  }
  return borrow;
}
// bn_abs_sub_part_words computes |r| = |a - b| in absolute value, returning a
// mask of all ones when the true difference was negative and all zeros when
// it was non-negative. |cl| and |dl| follow the |bn_sub_part_words| calling
// convention, and |tmp| is cl + abs(dl) words of scratch. Both orderings of
// the subtraction are computed and one is selected by mask, keeping the
// access pattern independent of the operand values.
//
// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
// is confusing.
static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                      const BN_ULONG *b, int cl, int dl,
                                      BN_ULONG *tmp) {
  // |tmp| = a - b; underflow means a < b.
  BN_ULONG underflow = bn_sub_part_words(tmp, a, b, cl, dl);
  // |r| = b - a, the candidate for the a < b case.
  bn_sub_part_words(r, b, a, cl, -dl);

  int out_len = cl + (dl < 0 ? -dl : dl);
  BN_ULONG sign_mask = 0 - underflow;
  // Keep |r| (b - a) when a < b; otherwise take |tmp| (a - b).
  bn_select_words(r, sign_mask, r, tmp, out_len);
  return sign_mask;
}
// bn_abs_sub_consttime sets |r| to |a - b|, in absolute value, without
// leaking which operand was larger. Returns one on success and zero on
// allocation failure.
int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         BN_CTX *ctx) {
  int common = a->width < b->width ? a->width : b->width;
  int delta = a->width - b->width;
  int out_width = a->width > b->width ? a->width : b->width;

  BN_CTX_start(ctx);
  BIGNUM *scratch = BN_CTX_get(ctx);
  int ok = scratch != NULL &&
           bn_wexpand(r, out_width) &&
           bn_wexpand(scratch, out_width);
  if (ok) {
    bn_abs_sub_part_words(r->d, a->d, b->d, common, delta, scratch->d);
    r->width = out_width;
  }
  BN_CTX_end(ctx);
  return ok;
}
// Karatsuba recursive multiplication algorithm
// (cf. Knuth, The Art of Computer Programming, Vol. 2)

// bn_mul_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| has
// length 2*|n2|, |a| has length |n2| + |dna|, |b| has length |n2| + |dnb|, and
// |t| has length 4*|n2|. |n2| must be a power of two. Finally, we must have
// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dna| <= 0 and
// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dnb| <= 0.
//
// TODO(davidben): Simplify and |size_t| the calling convention around lengths
// here.
static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             int n2, int dna, int dnb, BN_ULONG *t) {
  // |n2| is a power of two.
  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);
  // Check |dna| and |dnb| are in range.
  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dna && dna <= 0);
  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dnb && dnb <= 0);

  // Only call bn_mul_comba 8 if n2 == 8 and the
  // two arrays are complete [steve]
  if (n2 == 8 && dna == 0 && dnb == 0) {
    bn_mul_comba8(r, a, b);
    return;
  }

  // Else do normal multiply
  if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
    bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
    if (dna + dnb < 0) {
      // Shortened inputs leave the top -(dna + dnb) words of the 2*|n2|-word
      // output untouched; zero them.
      OPENSSL_memset(&r[2 * n2 + dna + dnb], 0,
                     sizeof(BN_ULONG) * -(dna + dnb));
    }
    return;
  }

  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1
  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
  //
  //   a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0
  //
  // Note that we know |n| >= |BN_MUL_RECURSIVE_SIZE_NORMAL|/2 above, so
  // |tna| and |tnb| are non-negative.
  int n = n2 / 2, tna = n + dna, tnb = n + dnb;

  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
  // themselves store the absolute value.
  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);

  // Compute:
  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
  // r0,r1 = a0 * b0
  // r2,r3 = a1 * b1
  if (n == 4 && dna == 0 && dnb == 0) {
    bn_mul_comba4(&t[n2], t, &t[n]);
    bn_mul_comba4(r, a, b);
    bn_mul_comba4(&r[n2], &a[n], &b[n]);
  } else if (n == 8 && dna == 0 && dnb == 0) {
    bn_mul_comba8(&t[n2], t, &t[n]);
    bn_mul_comba8(r, a, b);
    bn_mul_comba8(&r[n2], &a[n], &b[n]);
  } else {
    BN_ULONG *p = &t[n2 * 2];
    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
    bn_mul_recursive(r, a, b, n, 0, 0, p);
    bn_mul_recursive(&r[n2], &a[n], &b[n], n, dna, dnb, p);
  }

  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
  // The second term is stored as the absolute value, so we do this with a
  // constant-time select.
  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  c = constant_time_select_w(neg, c_neg, c_pos);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (int i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The product should fit without carries.
  declassify_assert(c == 0);
}
// bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r|
// has length 4*|n|, |a| has length |n| + |tna|, |b| has length |n| + |tnb|, and
// |t| has length 8*|n|. |n| must be a power of two. Additionally, we must have
// 0 <= tna < n and 0 <= tnb < n, and |tna| and |tnb| must differ by at most
// one.
//
// TODO(davidben): Make this take |size_t| and perhaps the actual lengths of |a|
// and |b|.
static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a,
                                  const BN_ULONG *b, int n, int tna, int tnb,
                                  BN_ULONG *t) {
  // |n| is a power of two.
  assert(n != 0 && (n & (n - 1)) == 0);
  // Check |tna| and |tnb| are in range.
  assert(0 <= tna && tna < n);
  assert(0 <= tnb && tnb < n);
  assert(-1 <= tna - tnb && tna - tnb <= 1);

  int n2 = n * 2;
  if (n < 8) {
    // Schoolbook multiplication writes (n + tna) + (n + tnb) =
    // |n2| + |tna| + |tnb| words of the 2*|n2|-word output; the remaining
    // high words must be zeroed. Note |OPENSSL_memset| takes a byte count, so
    // the word count is scaled by |sizeof(BN_ULONG)|. (The unscaled count was
    // a bug: it zeroed only a fraction of the tail, leaving uninitialized
    // high words in the product.)
    bn_mul_normal(r, a, n + tna, b, n + tnb);
    OPENSSL_memset(r + n2 + tna + tnb, 0,
                   sizeof(BN_ULONG) * (n2 - tna - tnb));
    return;
  }

  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|. |a1|
  // and |b1| have size |tna| and |tnb|, respectively.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1
  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
  //
  //   a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0

  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
  // themselves store the absolute value.
  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);

  // Compute:
  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
  // r0,r1 = a0 * b0
  // r2,r3 = a1 * b1
  if (n == 8) {
    bn_mul_comba8(&t[n2], t, &t[n]);
    bn_mul_comba8(r, a, b);

    bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
    // |bn_mul_normal| only writes |tna| + |tnb| words. Zero the rest.
    OPENSSL_memset(&r[n2 + tna + tnb], 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
  } else {
    BN_ULONG *p = &t[n2 * 2];
    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
    bn_mul_recursive(r, a, b, n, 0, 0, p);

    OPENSSL_memset(&r[n2], 0, sizeof(BN_ULONG) * n2);
    if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
        tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
      bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
    } else {
      int i = n;
      for (;;) {
        i /= 2;
        if (i < tna || i < tnb) {
          // E.g., n == 16, i == 8 and tna == 11. |tna| and |tnb| are within one
          // of each other, so if |tna| is larger and tna > i, then we know
          // tnb >= i, and this call is valid.
          bn_mul_part_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
          break;
        }
        if (i == tna || i == tnb) {
          // If there is only a bottom half to the number, just do it. We know
          // the larger of |tna - i| and |tnb - i| is zero. The other is zero or
          // -1 by because of |tna| and |tnb| differ by at most one.
          bn_mul_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
          break;
        }

        // This loop will eventually terminate when |i| falls below
        // |BN_MUL_RECURSIVE_SIZE_NORMAL| because we know one of |tna| and |tnb|
        // exceeds that.
      }
    }
  }

  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
  // The second term is stored as the absolute value, so we do this with a
  // constant-time select.
  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  c = constant_time_select_w(neg, c_neg, c_pos);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (int i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The product should fit without carries.
  declassify_assert(c == 0);
}
// bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. Note this function
// breaks |BIGNUM| invariants and may return a negative zero. This is handled
// by the callers. It selects between comba, Karatsuba, and schoolbook
// multiplication based on the operand widths.
static int bn_mul_impl(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                       BN_CTX *ctx) {
  int al = a->width;
  int bl = b->width;
  if (al == 0 || bl == 0) {
    BN_zero(r);
    return 1;
  }

  int ret = 0;
  BIGNUM *rr;
  BN_CTX_start(ctx);
  // Multiply into a temporary when the output aliases an input; the
  // multiplication routines read the inputs while writing the product.
  if (r == a || r == b) {
    rr = BN_CTX_get(ctx);
    if (rr == NULL) {
      goto err;
    }
  } else {
    rr = r;
  }
  rr->neg = a->neg ^ b->neg;

  int i = al - bl;
  if (i == 0) {
    // Equal 8-word inputs use the dedicated comba kernel.
    if (al == 8) {
      if (!bn_wexpand(rr, 16)) {
        goto err;
      }
      rr->width = 16;
      bn_mul_comba8(rr->d, a->d, b->d);
      goto end;
    }
  }

  int top = al + bl;
  static const int kMulNormalSize = 16;
  if (al >= kMulNormalSize && bl >= kMulNormalSize) {
    // Karatsuba applies only when the widths are within one of each other.
    if (-1 <= i && i <= 1) {
      // Find the largest power of two less than or equal to the larger length.
      int j;
      if (i >= 0) {
        j = BN_num_bits_word((BN_ULONG)al);
      } else {
        j = BN_num_bits_word((BN_ULONG)bl);
      }
      j = 1 << (j - 1);
      assert(j <= al || j <= bl);
      BIGNUM *t = BN_CTX_get(ctx);
      if (t == NULL) {
        goto err;
      }
      if (al > j || bl > j) {
        // We know |al| and |bl| are at most one from each other, so if al > j,
        // bl >= j, and vice versa. Thus we can use |bn_mul_part_recursive|.
        //
        // TODO(davidben): This codepath is almost unused in standard
        // algorithms. Is this optimization necessary? See notes in
        // https://boringssl-review.googlesource.com/q/I0bd604e2cd6a75c266f64476c23a730ca1721ea6
        assert(al >= j && bl >= j);
        if (!bn_wexpand(t, j * 8) ||
            !bn_wexpand(rr, j * 4)) {
          goto err;
        }
        bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      } else {
        // al <= j && bl <= j. Additionally, we know j <= al or j <= bl, so one
        // of al - j or bl - j is zero. The other, by the bound on |i| above, is
        // zero or -1. Thus, we can use |bn_mul_recursive|.
        if (!bn_wexpand(t, j * 4) ||
            !bn_wexpand(rr, j * 2)) {
          goto err;
        }
        bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      }
      rr->width = top;
      goto end;
    }
  }

  // Fall back to schoolbook multiplication for everything else.
  if (!bn_wexpand(rr, top)) {
    goto err;
  }
  rr->width = top;
  bn_mul_normal(rr->d, a->d, al, b->d, bl);

end:
  if (r != rr && !BN_copy(r, rr)) {
    goto err;
  }
  ret = 1;

err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_mul sets |r| = |a| * |b|, with the sign following the usual sign rules.
// Returns one on success and zero on error.
int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  int ok = bn_mul_impl(r, a, b, ctx);
  if (ok) {
    // Trim leading zero words; this also repairs any negative zero produced
    // by |bn_mul_impl|.
    bn_set_minimal_width(r);
  }
  return ok;
}
// bn_mul_consttime computes |r| = |a| * |b|, treating the operand widths (but
// not the values) as public. Negative inputs are rejected because allowing
// them would let |bn_mul_impl| produce a negative zero, which breaks |BIGNUM|
// invariants. Returns one on success and zero on error.
int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  // Prevent negative zeros.
  if (a->neg || b->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }

  return bn_mul_impl(r, a, b, ctx);
}
// bn_mul_small writes the full product of the |num_a|-word |a| and the
// |num_b|-word |b| to |r|, where |num_r| must equal |num_a| + |num_b|.
// Aborts on a size mismatch, which indicates a caller bug.
void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
                  const BN_ULONG *b, size_t num_b) {
  if (num_a + num_b != num_r) {
    abort();
  }

  // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not
  // hit that code.
  if (num_a != 8 || num_b != 8) {
    bn_mul_normal(r, a, num_a, b, num_b);
  } else {
    bn_mul_comba8(r, a, b);
  }
}
// bn_sqr_normal computes the 2*|n|-word square of the |n|-word |a| into |r|
// by schoolbook multiplication, exploiting symmetry: each cross product
// a[i]*a[j] with i < j is computed once and doubled, then the diagonal
// a[i]^2 terms are added. |tmp| must have 2*|n| words, and |r| must not alias
// |a| or |tmp|.
static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n,
                          BN_ULONG *tmp) {
  if (n == 0) {
    // Zero-width input produces zero output words; nothing to do.
    return;
  }

  size_t max = n * 2;
  const BN_ULONG *ap = a;
  BN_ULONG *rp = r;
  // The lowest and highest words receive no cross-product contribution;
  // clear them up front.
  rp[0] = rp[max - 1] = 0;
  rp++;

  // Compute the contribution of a[i] * a[j] for all i < j.
  if (n > 1) {
    // Row 0: a[0] times a[1..n-1], initializing the cross-product area.
    ap++;
    rp[n - 1] = bn_mul_words(rp, ap, n - 1, ap[-1]);
    rp += 2;
  }

  if (n > 2) {
    // Rows 1..n-2: a[i] times a[i+1..n-1], accumulated two words further on
    // each time.
    for (size_t i = n - 2; i > 0; i--) {
      ap++;
      rp[i] = bn_mul_add_words(rp, ap, i, ap[-1]);
      rp += 2;
    }
  }

  // The final result fits in |max| words, so none of the following operations
  // will overflow.

  // Double |r|, giving the contribution of a[i] * a[j] for all i != j.
  bn_add_words(r, r, r, max);

  // Add in the contribution of a[i] * a[i] for all i.
  bn_sqr_words(tmp, a, n);
  bn_add_words(r, r, tmp, max);
}
// bn_sqr_recursive sets |r| to |a|^2, using |t| as scratch space. |r| has
// length 2*|n2|, |a| has length |n2|, and |t| has length 4*|n2|. |n2| must be
// a power of two. This is the Karatsuba recursion specialized for squaring.
static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, size_t n2,
                             BN_ULONG *t) {
  // |n2| is a power of two.
  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);

  // Base cases: dedicated comba kernels for 4 and 8 words, schoolbook below
  // the recursion threshold.
  if (n2 == 4) {
    bn_sqr_comba4(r, a);
    return;
  }
  if (n2 == 8) {
    bn_sqr_comba8(r, a);
    return;
  }
  if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
    bn_sqr_normal(r, a, n2, t);
    return;
  }

  // Split |a| into a0,a1, each of size |n|.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0^2 to r0,r1, 2*a0*a1 to
  // r1,r2, and a1^2 to r2,r3.
  size_t n = n2 / 2;
  BN_ULONG *t_recursive = &t[n2 * 2];

  // t0 = |a0 - a1|.
  bn_abs_sub_words(t, a, &a[n], n, &t[n]);
  // t2,t3 = t0^2 = |a0 - a1|^2 = a0^2 - 2*a0*a1 + a1^2
  bn_sqr_recursive(&t[n2], t, n, t_recursive);

  // r0,r1 = a0^2
  bn_sqr_recursive(r, a, n, t_recursive);

  // r2,r3 = a1^2
  bn_sqr_recursive(&r[n2], &a[n], n, t_recursive);

  // t0,t1,c = r0,r1 + r2,r3 = a0^2 + a1^2
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);
  // t2,t3,c = t0,t1,c - t2,t3 = 2*a0*a1
  c -= bn_sub_words(&t[n2], t, &t[n2], n2);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (size_t i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The square should fit without carries.
  assert(c == 0);
}
// BN_mul_word multiplies |bn| by the single word |w| in place. Returns one on
// success and zero on allocation failure.
int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
  // Zero times anything is zero, and |bn| is already zero when empty.
  if (bn->width == 0) {
    return 1;
  }
  if (w == 0) {
    BN_zero(bn);
    return 1;
  }

  // Multiply in place; |carry| is the word that overflows the current width.
  BN_ULONG carry = bn_mul_words(bn->d, bn->d, bn->width, w);
  if (carry == 0) {
    return 1;
  }

  // Grow by one word to hold the carry.
  if (!bn_wexpand(bn, bn->width + 1)) {
    return 0;
  }
  bn->d[bn->width] = carry;
  bn->width++;
  return 1;
}
// bn_sqr_consttime sets |r| = |a|^2, running in time that depends only on the
// public width of |a|, not its value. The result width is exactly
// 2 * a->width and may include leading zeros. Returns one on success and zero
// on error.
int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
  int al = a->width;
  if (al <= 0) {
    r->width = 0;
    r->neg = 0;
    return 1;
  }

  int ret = 0;
  BN_CTX_start(ctx);
  // Square into a temporary when the output aliases the input.
  BIGNUM *rr = (a != r) ? r : BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (!rr || !tmp) {
    goto err;
  }

  int max = 2 * al;  // Non-zero (from above)
  if (!bn_wexpand(rr, max)) {
    goto err;
  }

  if (al == 4) {
    bn_sqr_comba4(rr->d, a->d);
  } else if (al == 8) {
    bn_sqr_comba8(rr->d, a->d);
  } else {
    if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
      // Small inputs only need a fixed-size stack scratch buffer.
      BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
      bn_sqr_normal(rr->d, a->d, al, t);
    } else {
      // If |al| is a power of two, we can use |bn_sqr_recursive|.
      if (al != 0 && (al & (al - 1)) == 0) {
        if (!bn_wexpand(tmp, al * 4)) {
          goto err;
        }
        bn_sqr_recursive(rr->d, a->d, al, tmp->d);
      } else {
        // Otherwise fall back to schoolbook squaring with heap scratch.
        if (!bn_wexpand(tmp, max)) {
          goto err;
        }
        bn_sqr_normal(rr->d, a->d, al, tmp->d);
      }
    }
  }

  rr->neg = 0;
  rr->width = max;
  if (rr != r && !BN_copy(r, rr)) {
    goto err;
  }
  ret = 1;

err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_sqr computes r = a^2 and then strips leading zero words so the result
// has minimal width. Returns one on success and zero on error.
int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
  int ok = bn_sqr_consttime(r, a, ctx);
  if (ok) {
    bn_set_minimal_width(r);
  }
  return ok;
}
// bn_sqr_small squares the |num_a|-word number at |a| into the |num_r|-word
// buffer at |r|. |num_r| must be exactly 2*|num_a| and |num_a| must not exceed
// |BN_SMALL_MAX_WORDS|; otherwise the process aborts.
void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
  if (num_a > BN_SMALL_MAX_WORDS || num_r != 2 * num_a) {
    abort();
  }
  switch (num_a) {
    case 4:
      bn_sqr_comba4(r, a);
      break;
    case 8:
      bn_sqr_comba8(r, a);
      break;
    default: {
      BN_ULONG scratch[2 * BN_SMALL_MAX_WORDS];
      bn_sqr_normal(r, a, num_a, scratch);
      // Scrub the scratch words that were used; |a| may be secret.
      OPENSSL_cleanse(scratch, 2 * num_a * sizeof(BN_ULONG));
      break;
    }
  }
}

View File

@@ -0,0 +1,988 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// kPrimes contains the first 1024 primes.
static const uint16_t kPrimes[] = {
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37,
41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89,
97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223,
227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359,
367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433,
439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593,
599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743,
751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827,
829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911,
919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997,
1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249,
1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439,
1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601,
1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777, 1783,
1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129, 2131, 2137, 2141, 2143,
2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347,
2351, 2357, 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543,
2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713,
2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3109, 3119,
3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323,
3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527,
3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697,
3701, 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073, 4079, 4091, 4093,
4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283,
4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493, 4507, 4513,
4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721,
4723, 4729, 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101, 5107, 5113,
5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351,
5381, 5387, 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521, 5527, 5531,
5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743,
5749, 5779, 5783, 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939,
5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163, 6173,
6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359,
6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571, 6577, 6581,
6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779, 6781, 6791, 6793, 6803,
6823, 6827, 6829, 6833, 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997,
7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229,
7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487,
7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643, 7649, 7669,
7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853, 7867, 7873, 7877, 7879,
7883, 7901, 7907, 7919, 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111,
8117, 8123, 8147, 8161,
};
// BN_prime_checks_for_size returns the number of Miller-Rabin iterations
// necessary for generating a 'bits'-bit candidate prime.
//
//
// This table is generated using the algorithm of FIPS PUB 186-4
// Digital Signature Standard (DSS), section F.1, page 117.
// (https://doi.org/10.6028/NIST.FIPS.186-4)
// The following magma script was used to generate the output:
// securitybits:=125;
// k:=1024;
// for t:=1 to 65 do
// for M:=3 to Floor(2*Sqrt(k-1)-1) do
// S:=0;
// // Sum over m
// for m:=3 to M do
// s:=0;
// // Sum over j
// for j:=2 to m do
// s+:=(RealField(32)!2)^-(j+(k-1)/j);
// end for;
// S+:=2^(m-(m-1)*t)*s;
// end for;
// A:=2^(k-2-M*t);
// B:=8*(Pi(RealField(32))^2-6)/3*2^(k-2)*S;
// pkt:=2.00743*Log(2)*k*2^-k*(A+B);
// seclevel:=Floor(-Log(2,pkt));
// if seclevel ge securitybits then
// printf "k: %5o, security: %o bits (t: %o, M: %o)\n",k,seclevel,t,M;
// break;
// end if;
// end for;
// if seclevel ge securitybits then break; end if;
// end for;
//
// It can be run online at: http://magma.maths.usyd.edu.au/calc
// And will output:
// k: 1024, security: 129 bits (t: 6, M: 23)
// k is the number of bits of the prime, securitybits is the level we want to
// reach.
// prime length | RSA key size | # MR tests | security level
// -------------+--------------|------------+---------------
// (b) >= 6394 | >= 12788 | 3 | 256 bit
// (b) >= 3747 | >= 7494 | 3 | 192 bit
// (b) >= 1345 | >= 2690 | 4 | 128 bit
// (b) >= 1080 | >= 2160 | 5 | 128 bit
// (b) >= 852 | >= 1704 | 5 | 112 bit
// (b) >= 476 | >= 952 | 5 | 80 bit
// (b) >= 400 | >= 800 | 6 | 80 bit
// (b) >= 347 | >= 694 | 7 | 80 bit
// (b) >= 308 | >= 616 | 8 | 80 bit
// (b) >= 55 | >= 110 | 27 | 64 bit
// (b) >= 6 | >= 12 | 34 | 64 bit
// BN_prime_checks_for_size returns the number of Miller-Rabin iterations for a
// |bits|-bit candidate prime, per the table derived above: the first threshold
// that |bits| meets selects the iteration count, with 34 iterations for very
// small candidates.
static int BN_prime_checks_for_size(int bits) {
  static const struct {
    int min_bits;
    int checks;
  } kChecksTable[] = {
      {3747, 3}, {1345, 4}, {476, 5}, {400, 6},
      {347, 7},  {308, 8},  {55, 27},
  };
  for (size_t i = 0; i < sizeof(kChecksTable) / sizeof(kChecksTable[0]); i++) {
    if (bits >= kChecksTable[i].min_bits) {
      return kChecksTable[i].checks;
    }
  }
  return 34;
}
// num_trial_division_primes returns the number of primes to try with trial
// division before using more expensive checks. For larger numbers, the value
// of excluding a candidate with trial division is larger, so candidates wider
// than 1024 bits use the full |kPrimes| table and smaller ones only the first
// half.
static size_t num_trial_division_primes(const BIGNUM *n) {
  const int is_large = n->width * BN_BITS2 > 1024;
  return is_large ? OPENSSL_ARRAY_SIZE(kPrimes)
                  : OPENSSL_ARRAY_SIZE(kPrimes) / 2;
}
// BN_PRIME_CHECKS_BLINDED is the iteration count for blinding the constant-time
// primality test. See |BN_primality_test| for details. This number is selected
// so that, for a candidate N-bit RSA prime, picking |BN_PRIME_CHECKS_BLINDED|
// random N-bit numbers will have at least |BN_prime_checks_for_size(N)| values
// in range with high probability.
//
// The following Python script computes the blinding factor needed for the
// corresponding iteration count.
/*
import math
# We choose candidate RSA primes between sqrt(2)/2 * 2^N and 2^N and select
# witnesses by generating random N-bit numbers. Thus the probability of
# selecting one in range is at least sqrt(2)/2.
p = math.sqrt(2) / 2
# Target around 2^-8 probability of the blinding being insufficient given that
# key generation is a one-time, noisy operation.
epsilon = 2**-8
def choose(a, b):
r = 1
for i in xrange(b):
r *= a - i
r /= (i + 1)
return r
def failure_rate(min_uniform, iterations):
""" Returns the probability that, for |iterations| candidate witnesses, fewer
than |min_uniform| of them will be uniform. """
prob = 0.0
for i in xrange(min_uniform):
prob += (choose(iterations, i) *
p**i * (1-p)**(iterations - i))
return prob
for min_uniform in (3, 4, 5, 6, 8, 13, 19, 28):
# Find the smallest number of iterations under the target failure rate.
iterations = min_uniform
while True:
prob = failure_rate(min_uniform, iterations)
if prob < epsilon:
print min_uniform, iterations, prob
break
iterations += 1
Output:
3 9 0.00368894873911
4 11 0.00363319494662
5 13 0.00336215573898
6 15 0.00300145783158
8 19 0.00225214119331
13 27 0.00385610026955
19 38 0.0021410539126
28 52 0.00325405801769
16 iterations suffices for 400-bit primes and larger (6 uniform samples needed),
which is already well below the minimum acceptable key size for RSA.
*/
#define BN_PRIME_CHECKS_BLINDED 16
static int probable_prime(BIGNUM *rnd, int bits);
static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
const BIGNUM *rem, BN_CTX *ctx);
static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
const BIGNUM *rem, BN_CTX *ctx);
// BN_GENCB_new allocates a zero-initialized BN_GENCB. Returns NULL on
// allocation failure.
BN_GENCB *BN_GENCB_new(void) {
  BN_GENCB *callback = OPENSSL_zalloc(sizeof(BN_GENCB));
  return callback;
}
// BN_GENCB_free releases |callback|. A NULL argument is a no-op.
void BN_GENCB_free(BN_GENCB *callback) {
  OPENSSL_free(callback);
}
// BN_GENCB_set configures |callback| to invoke the new-style function |f| with
// the opaque argument |arg|.
void BN_GENCB_set(BN_GENCB *callback,
                  int (*f)(int event, int n, struct bn_gencb_st *),
                  void *arg) {
  callback->arg = arg;
  callback->callback.new_style = f;
  callback->type = BN_GENCB_NEW_STYLE;
}
// BN_GENCB_set_old configures |callback| to invoke the old-style (void-return)
// function |f| with the opaque argument |arg|.
void BN_GENCB_set_old(BN_GENCB *callback,
                      void (*f)(int, int, void *), void *arg) {
  callback->arg = arg;
  callback->callback.old_style = f;
  callback->type = BN_GENCB_OLD_STYLE;
}
// BN_GENCB_call dispatches |event| and |n| to |callback|. A NULL callback is
// treated as success. New-style callbacks return their own verdict; old-style
// callbacks cannot fail, so their result is always one. An unknown type
// returns zero.
int BN_GENCB_call(BN_GENCB *callback, int event, int n) {
  if (callback == NULL) {
    return 1;
  }
  switch (callback->type) {
    case BN_GENCB_NEW_STYLE:
      return callback->callback.new_style(event, n, callback);
    case BN_GENCB_OLD_STYLE:
      callback->callback.old_style(event, n, callback);
      return 1;
    default:
      return 0;
  }
}
// BN_GENCB_get_arg returns the opaque argument stored by |BN_GENCB_set| or
// |BN_GENCB_set_old|.
void *BN_GENCB_get_arg(const BN_GENCB *callback) {
  return callback->arg;
}
// BN_generate_prime_ex writes a probable prime of |bits| bits to |ret|,
// returning one on success and zero on error or callback abort. If |add| is
// non-NULL, the result additionally satisfies ret mod add == rem (or
// ret mod add == 1 when |rem| is NULL). If |safe| is non-zero, the result p is
// a "safe prime": (p-1)/2 is also tested for primality. |cb|, if non-NULL,
// receives BN_GENCB_GENERATED / BN_GENCB_PRIME_TEST progress events and may
// abort the search by returning zero.
int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
                         const BIGNUM *rem, BN_GENCB *cb) {
  BIGNUM *t;
  int found = 0;
  int i, j, c1 = 0;
  BN_CTX *ctx;
  int checks = BN_prime_checks_for_size(bits);
  if (bits < 2) {
    // There are no prime numbers this small.
    OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL);
    return 0;
  } else if (bits == 2 && safe) {
    // The smallest safe prime (7) is three bits.
    OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL);
    return 0;
  }
  ctx = BN_CTX_new();
  if (ctx == NULL) {
    goto err;
  }
  BN_CTX_start(ctx);
  // |t| holds (p-1)/2 in the safe-prime path below.
  t = BN_CTX_get(ctx);
  if (!t) {
    goto err;
  }
loop:
  // make a random number and set the top and bottom bits
  if (add == NULL) {
    if (!probable_prime(ret, bits)) {
      goto err;
    }
  } else {
    if (safe) {
      if (!probable_prime_dh_safe(ret, bits, add, rem, ctx)) {
        goto err;
      }
    } else {
      if (!probable_prime_dh(ret, bits, add, rem, ctx)) {
        goto err;
      }
    }
  }
  if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) {
    // aborted
    goto err;
  }
  if (!safe) {
    i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
    if (i == -1) {
      goto err;
    } else if (i == 0) {
      // Composite; draw a fresh candidate.
      goto loop;
    }
  } else {
    // for "safe prime" generation, check that (p-1)/2 is prime. Since a prime
    // is odd, we just need to divide by 2
    if (!BN_rshift1(t, ret)) {
      goto err;
    }
    // Interleave |ret| and |t|'s primality tests to avoid paying the full
    // iteration count on |ret| only to quickly discover |t| is composite.
    //
    // TODO(davidben): This doesn't quite work because an iteration count of 1
    // still runs the blinding mechanism.
    for (i = 0; i < checks; i++) {
      j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, NULL);
      if (j == -1) {
        goto err;
      } else if (j == 0) {
        goto loop;
      }
      j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, NULL);
      if (j == -1) {
        goto err;
      } else if (j == 0) {
        goto loop;
      }
      if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i)) {
        goto err;
      }
      // We have a safe prime test pass
    }
  }
  // we have a prime :-)
  found = 1;
err:
  // |ctx| may be NULL if |BN_CTX_new| failed above.
  if (ctx != NULL) {
    BN_CTX_end(ctx);
    BN_CTX_free(ctx);
  }
  return found;
}
// bn_trial_division searches the small-prime table for a divisor of |bn|. On a
// hit it writes the divisor to |*out| and returns one; it returns zero when no
// table prime divides |bn|. Index 0 (the prime two) is skipped; callers pass
// odd candidates.
static int bn_trial_division(uint16_t *out, const BIGNUM *bn) {
  const size_t num_primes = num_trial_division_primes(bn);
  for (size_t i = 1; i < num_primes; i++) {
    uint16_t p = kPrimes[i];
    // During RSA key generation, |bn| may be secret, but only if |bn| was
    // prime, so it is safe to leak failed trial divisions.
    if (constant_time_declassify_int(bn_mod_u16_consttime(bn, p) == 0)) {
      *out = p;
      return 1;
    }
  }
  return 0;
}
// bn_odd_number_is_obviously_composite returns one if the odd number |bn| has
// a small-prime divisor other than itself and zero otherwise.
int bn_odd_number_is_obviously_composite(const BIGNUM *bn) {
  uint16_t divisor;
  if (!bn_trial_division(&divisor, bn)) {
    return 0;
  }
  // A table hit only proves compositeness when |bn| is not that prime itself.
  return !BN_is_word(bn, divisor);
}
// bn_miller_rabin_init precomputes, for the modulus w captured in |mont|, the
// values shared by every Miller-Rabin iteration: w1 = w-1, the decomposition
// w-1 = m * 2^a, the bit length of w, and the Montgomery forms of 1 and w-1.
// Returns one on success and zero on error. Allocations are made in the
// caller's |BN_CTX| scope so they outlive this function.
int bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin, const BN_MONT_CTX *mont,
                         BN_CTX *ctx) {
  // This function corresponds to steps 1 through 3 of FIPS 186-4, C.3.1.
  const BIGNUM *w = &mont->N;
  // Note we do not call |BN_CTX_start| in this function. We intentionally
  // allocate values in the containing scope so they outlive this function.
  miller_rabin->w1 = BN_CTX_get(ctx);
  miller_rabin->m = BN_CTX_get(ctx);
  miller_rabin->one_mont = BN_CTX_get(ctx);
  miller_rabin->w1_mont = BN_CTX_get(ctx);
  if (miller_rabin->w1 == NULL ||
      miller_rabin->m == NULL ||
      miller_rabin->one_mont == NULL ||
      miller_rabin->w1_mont == NULL) {
    return 0;
  }
  // See FIPS 186-4, C.3.1, steps 1 through 3.
  // w1 = w - 1; a = number of trailing zero bits of w1; m = w1 >> a.
  if (!bn_usub_consttime(miller_rabin->w1, w, BN_value_one())) {
    return 0;
  }
  miller_rabin->a = BN_count_low_zero_bits(miller_rabin->w1);
  if (!bn_rshift_secret_shift(miller_rabin->m, miller_rabin->w1,
                              miller_rabin->a, ctx)) {
    return 0;
  }
  miller_rabin->w_bits = BN_num_bits(w);
  // Precompute some values in Montgomery form.
  if (!bn_one_to_montgomery(miller_rabin->one_mont, mont, ctx) ||
      // w - 1 is -1 mod w, so we can compute it in the Montgomery domain, -R,
      // with a subtraction. (|one_mont| cannot be zero.)
      !bn_usub_consttime(miller_rabin->w1_mont, w, miller_rabin->one_mont)) {
    return 0;
  }
  return 1;
}
// bn_miller_rabin_iteration runs one Miller-Rabin round for witness |b|
// against the modulus captured in |mont|, using the precomputed values in
// |miller_rabin|. It sets |*out_is_possibly_prime| to one if |b| did not prove
// the modulus composite and zero otherwise. Returns one on success and zero on
// internal error. For prime moduli the loop runs to the |w_bits| bound so
// timing does not leak |a|; composite results may exit early.
int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin,
                              int *out_is_possibly_prime, const BIGNUM *b,
                              const BN_MONT_CTX *mont, BN_CTX *ctx) {
  // This function corresponds to steps 4.3 through 4.5 of FIPS 186-4, C.3.1.
  int ret = 0;
  BN_CTX_start(ctx);
  // Step 4.3. We use Montgomery-encoding for better performance and to avoid
  // timing leaks.
  const BIGNUM *w = &mont->N;
  BIGNUM *z = BN_CTX_get(ctx);
  if (z == NULL ||
      !BN_mod_exp_mont_consttime(z, b, miller_rabin->m, w, ctx, mont) ||
      !BN_to_montgomery(z, z, mont, ctx)) {
    goto err;
  }
  // is_possibly_prime is all ones if we have determined |b| is not a composite
  // witness for |w|. This is equivalent to going to step 4.7 in the original
  // algorithm. To avoid timing leaks, we run the algorithm to the end for prime
  // inputs.
  crypto_word_t is_possibly_prime = 0;
  // Step 4.4. If z = 1 or z = w-1, b is not a composite witness and w is still
  // possibly prime.
  is_possibly_prime = BN_equal_consttime(z, miller_rabin->one_mont) |
                      BN_equal_consttime(z, miller_rabin->w1_mont);
  is_possibly_prime = 0 - is_possibly_prime;  // Make it all zeros or all ones.
  // Step 4.5.
  //
  // To avoid leaking |a|, we run the loop to |w_bits| and mask off all
  // iterations once |j| = |a|.
  for (int j = 1; j < miller_rabin->w_bits; j++) {
    if (constant_time_declassify_w(constant_time_eq_int(j, miller_rabin->a) &
                                   ~is_possibly_prime)) {
      // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the
      // value is composite and we can break in variable time.
      break;
    }
    // Step 4.5.1.
    if (!BN_mod_mul_montgomery(z, z, z, mont, ctx)) {
      goto err;
    }
    // Step 4.5.2. If z = w-1 and the loop is not done, this is not a composite
    // witness.
    crypto_word_t z_is_w1_mont = BN_equal_consttime(z, miller_rabin->w1_mont);
    z_is_w1_mont = 0 - z_is_w1_mont;  // Make it all zeros or all ones.
    is_possibly_prime |= z_is_w1_mont;  // Go to step 4.7 if |z_is_w1_mont|.
    // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z
    // was not -1. There are no non-trivial square roots of 1 modulo a prime, so
    // w is composite and we may exit in variable time.
    if (constant_time_declassify_w(
            BN_equal_consttime(z, miller_rabin->one_mont) &
            ~is_possibly_prime)) {
      break;
    }
  }
  // Reduce the all-zeros/all-ones mask to a 0/1 public verdict.
  *out_is_possibly_prime = constant_time_declassify_w(is_possibly_prime) & 1;
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_primality_test sets |*out_is_probably_prime| to one if |w| is probably
// prime and zero otherwise, running |checks| Miller-Rabin iterations (or a
// size-derived count when |checks| is |BN_prime_checks_for_generation|).
// |do_trial_division| enables a small-prime pre-filter. |ctx| may be NULL, in
// which case a temporary one is created. Returns one on success (a verdict was
// produced) and zero on internal error or callback abort.
int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks,
                      BN_CTX *ctx, int do_trial_division, BN_GENCB *cb) {
  // This function's secrecy and performance requirements come from RSA key
  // generation. We generate RSA keys by selecting two large, secret primes with
  // rejection sampling.
  //
  // We thus treat |w| as secret if it turns out to be a large prime. However,
  // if |w| is composite, we treat this and |w| itself as public. (Conversely,
  // if |w| is prime, that it is prime is public. Only the value is secret.)
  // This is fine for RSA key generation, but note it is important that we use
  // rejection sampling, with each candidate prime chosen independently. This
  // would not work for, e.g., an algorithm which looked for primes in
  // consecutive integers. These assumptions allow us to discard composites
  // quickly. We additionally treat |w| as public when it is a small prime to
  // simplify trial decryption and some edge cases.
  //
  // One RSA key generation will call this function on exactly two primes and
  // many more composites. The overall cost is a combination of several factors:
  //
  // 1. Checking if |w| is divisible by a small prime is much faster than
  //    learning it is composite by Miller-Rabin (see below for details on that
  //    cost). Trial division by p saves 1/p of Miller-Rabin calls, so this is
  //    worthwhile until p exceeds the ratio of the two costs.
  //
  // 2. For a random (i.e. non-adversarial) candidate large prime and candidate
  //    witness, the probability of false witness is very low. (This is why FIPS
  //    186-4 only requires a few iterations.) Thus composites not discarded by
  //    trial decryption, in practice, cost one Miller-Rabin iteration. Only the
  //    two actual primes cost the full iteration count.
  //
  // 3. A Miller-Rabin iteration is a modular exponentiation plus |a| additional
  //    modular squares, where |a| is the number of factors of two in |w-1|. |a|
  //    is likely small (the distribution falls exponentially), but it is also
  //    potentially secret, so we loop up to its log(w) upper bound when |w| is
  //    prime. When |w| is composite, we break early, so only two calls pay this
  //    cost. (Note that all calls pay the modular exponentiation which is,
  //    itself, log(w) modular multiplications and squares.)
  //
  // 4. While there are only two prime calls, they multiplicatively pay the full
  //    costs of (2) and (3).
  //
  // 5. After the primes are chosen, RSA keys derive some values from the
  //    primes, but this cost is negligible in comparison.
  *out_is_probably_prime = 0;
  if (BN_cmp(w, BN_value_one()) <= 0) {
    // Values <= 1 are never prime.
    return 1;
  }
  if (!BN_is_odd(w)) {
    // The only even prime is two.
    *out_is_probably_prime = BN_is_word(w, 2);
    return 1;
  }
  // Miller-Rabin does not work for three.
  if (BN_is_word(w, 3)) {
    *out_is_probably_prime = 1;
    return 1;
  }
  if (do_trial_division) {
    // Perform additional trial division checks to discard small primes.
    uint16_t prime;
    if (bn_trial_division(&prime, w)) {
      *out_is_probably_prime = BN_is_word(w, prime);
      return 1;
    }
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, -1)) {
      return 0;
    }
  }
  if (checks == BN_prime_checks_for_generation) {
    checks = BN_prime_checks_for_size(BN_num_bits(w));
  }
  BN_CTX *new_ctx = NULL;
  if (ctx == NULL) {
    new_ctx = BN_CTX_new();
    if (new_ctx == NULL) {
      return 0;
    }
    ctx = new_ctx;
  }
  // See C.3.1 from FIPS 186-4.
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *b = BN_CTX_get(ctx);
  BN_MONT_CTX *mont = BN_MONT_CTX_new_consttime(w, ctx);
  BN_MILLER_RABIN miller_rabin;
  if (b == NULL || mont == NULL ||
      // Steps 1-3.
      !bn_miller_rabin_init(&miller_rabin, mont, ctx)) {
    goto err;
  }
  // The following loop performs the inner iteration of the Miller-Rabin
  // primality test (Step 4).
  //
  // The algorithm as specified in FIPS 186-4 leaks information on |w|, the RSA
  // private key. Instead, we run through each iteration unconditionally,
  // performing modular multiplications, masking off any effects to behave
  // equivalently to the specified algorithm.
  //
  // We also blind the number of values of |b| we try. Steps 4.1-4.2 say to
  // discard out-of-range values. To avoid leaking information on |w|, we use
  // |bn_rand_secret_range| which, rather than discarding bad values, adjusts
  // them to be in range. Though not uniformly selected, these adjusted values
  // are still usable as Miller-Rabin checks.
  //
  // Miller-Rabin is already probabilistic, so we could reach the desired
  // confidence levels by just suitably increasing the iteration count. However,
  // to align with FIPS 186-4, we use a more pessimal analysis: we do not count
  // the non-uniform values towards the iteration count. As a result, this
  // function is more complex and has more timing risk than necessary.
  //
  // We count both total iterations and uniform ones and iterate until we've
  // reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively.
  // If the latter is large enough, it will be the limiting factor with high
  // probability and we won't leak information.
  //
  // Note this blinding does not impact most calls when picking primes because
  // composites are rejected early. Only the two secret primes see extra work.
  crypto_word_t uniform_iterations = 0;
  // Using |constant_time_lt_w| seems to prevent the compiler from optimizing
  // this into two jumps.
  for (int i = 1; constant_time_declassify_w(
           (i <= BN_PRIME_CHECKS_BLINDED) |
           constant_time_lt_w(uniform_iterations, checks));
       i++) {
    // Step 4.1-4.2
    int is_uniform;
    if (!bn_rand_secret_range(b, &is_uniform, 2, miller_rabin.w1)) {
      goto err;
    }
    uniform_iterations += is_uniform;
    // Steps 4.3-4.5
    int is_possibly_prime = 0;
    if (!bn_miller_rabin_iteration(&miller_rabin, &is_possibly_prime, b, mont,
                                   ctx)) {
      goto err;
    }
    if (!is_possibly_prime) {
      // Step 4.6. We did not see z = w-1 before z = 1, so w must be composite.
      *out_is_probably_prime = 0;
      ret = 1;
      goto err;
    }
    // Step 4.7
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) {
      goto err;
    }
  }
  declassify_assert(uniform_iterations >= (crypto_word_t)checks);
  *out_is_probably_prime = 1;
  ret = 1;
err:
  BN_MONT_CTX_free(mont);
  BN_CTX_end(ctx);
  BN_CTX_free(new_ctx);
  return ret;
}
// BN_is_prime_ex behaves like |BN_is_prime_fasttest_ex| with trial division
// disabled: it returns one if |candidate| is probably prime, zero if it is
// composite, and -1 on error.
int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx,
                   BN_GENCB *cb) {
  const int do_trial_division = 0;
  return BN_is_prime_fasttest_ex(candidate, checks, ctx, do_trial_division, cb);
}
// BN_is_prime_fasttest_ex maps |BN_primality_test|'s out-parameter interface
// onto the legacy tri-state return: one for probably prime, zero for
// composite, and -1 on error.
int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx,
                            int do_trial_division, BN_GENCB *cb) {
  int is_probably_prime = 0;
  int ok = BN_primality_test(&is_probably_prime, a, checks, ctx,
                             do_trial_division, cb);
  return ok ? is_probably_prime : -1;
}
// BN_enhanced_miller_rabin_primality_test runs the Enhanced Miller-Rabin test
// (FIPS 186-4, C.3.2) on |w| with |checks| iterations, writing one of
// |bn_probably_prime|, |bn_composite|, or |bn_non_prime_power_composite| to
// |*out_result|. Unlike |BN_primality_test| it is not constant-time; |w| is
// treated as public. Returns one on success (a verdict was produced) and zero
// on error or invalid input.
int BN_enhanced_miller_rabin_primality_test(
    enum bn_primality_result_t *out_result, const BIGNUM *w, int checks,
    BN_CTX *ctx, BN_GENCB *cb) {
  // Enhanced Miller-Rabin is only valid on odd integers greater than 3.
  if (!BN_is_odd(w) || BN_cmp_word(w, 3) <= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_INPUT);
    return 0;
  }
  if (checks == BN_prime_checks_for_generation) {
    checks = BN_prime_checks_for_size(BN_num_bits(w));
  }
  int ret = 0;
  BN_MONT_CTX *mont = NULL;
  BN_CTX_start(ctx);
  BIGNUM *w1 = BN_CTX_get(ctx);
  if (w1 == NULL ||
      !BN_copy(w1, w) ||
      !BN_sub_word(w1, 1)) {
    goto err;
  }
  // Write w1 as m*2^a (Steps 1 and 2).
  int a = 0;
  while (!BN_is_bit_set(w1, a)) {
    a++;
  }
  BIGNUM *m = BN_CTX_get(ctx);
  if (m == NULL ||
      !BN_rshift(m, w1, a)) {
    goto err;
  }
  BIGNUM *b = BN_CTX_get(ctx);
  BIGNUM *g = BN_CTX_get(ctx);
  BIGNUM *z = BN_CTX_get(ctx);
  BIGNUM *x = BN_CTX_get(ctx);
  BIGNUM *x1 = BN_CTX_get(ctx);
  if (b == NULL ||
      g == NULL ||
      z == NULL ||
      x == NULL ||
      x1 == NULL) {
    goto err;
  }
  // Montgomery setup for computations mod w
  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
  if (mont == NULL) {
    goto err;
  }
  // The following loop performs the inner iteration of the Enhanced
  // Miller-Rabin primality test (Step 4).
  for (int i = 1; i <= checks; i++) {
    // Step 4.1-4.2: pick a random base b in [2, w-1).
    if (!BN_rand_range_ex(b, 2, w1)) {
      goto err;
    }
    // Step 4.3-4.4: a non-trivial gcd with the base proves compositeness.
    if (!BN_gcd(g, b, w, ctx)) {
      goto err;
    }
    if (BN_cmp_word(g, 1) > 0) {
      *out_result = bn_composite;
      ret = 1;
      goto err;
    }
    // Step 4.5
    if (!BN_mod_exp_mont(z, b, m, w, ctx, mont)) {
      goto err;
    }
    // Step 4.6
    if (BN_is_one(z) || BN_cmp(z, w1) == 0) {
      goto loop;
    }
    // Step 4.7: repeatedly square z, watching for w-1 (inconclusive) or a
    // non-trivial square root of 1 (composite).
    for (int j = 1; j < a; j++) {
      if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) {
        goto err;
      }
      if (BN_cmp(z, w1) == 0) {
        goto loop;
      }
      if (BN_is_one(z)) {
        goto composite;
      }
    }
    // Step 4.8-4.9
    if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) {
      goto err;
    }
    // Step 4.10-4.11
    if (!BN_is_one(z) && !BN_copy(x, z)) {
      goto err;
    }
  composite:
    // Step 4.12-4.14: gcd(x-1, w) distinguishes a plain composite from a
    // composite that is not a prime power.
    if (!BN_copy(x1, x) ||
        !BN_sub_word(x1, 1) ||
        !BN_gcd(g, x1, w, ctx)) {
      goto err;
    }
    if (BN_cmp_word(g, 1) > 0) {
      *out_result = bn_composite;
    } else {
      *out_result = bn_non_prime_power_composite;
    }
    ret = 1;
    goto err;
  loop:
    // Step 4.15
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) {
      goto err;
    }
  }
  // All iterations were inconclusive; w is probably prime.
  *out_result = bn_probably_prime;
  ret = 1;
err:
  BN_MONT_CTX_free(mont);
  BN_CTX_end(ctx);
  return ret;
}
// probable_prime fills |rnd| with a random odd |bits|-bit number (top two bits
// set) that survives trial division by the small-prime table. Returns one on
// success and zero on RNG failure.
static int probable_prime(BIGNUM *rnd, int bits) {
  for (;;) {
    if (!BN_rand(rnd, bits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD)) {
      return 0;
    }
    if (!bn_odd_number_is_obviously_composite(rnd)) {
      return 1;
    }
    // Obviously composite; draw again.
  }
}
// probable_prime_dh writes to |rnd| a random |bits|-bit candidate satisfying
// rnd mod add == rem (or rnd mod add == 1 when |rem| is NULL), then steps it
// by |add| until it passes trial division by the small-prime table. Returns
// one on success and zero on error.
static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
                             const BIGNUM *rem, BN_CTX *ctx) {
  int ret = 0;
  BIGNUM *t1;
  BN_CTX_start(ctx);
  if ((t1 = BN_CTX_get(ctx)) == NULL) {
    goto err;
  }
  if (!BN_rand(rnd, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) {
    goto err;
  }
  // we need ((rnd-rem) % add) == 0
  if (!BN_mod(t1, rnd, add, ctx)) {
    goto err;
  }
  if (!BN_sub(rnd, rnd, t1)) {
    goto err;
  }
  if (rem == NULL) {
    // No residue requested; force rnd mod add == 1.
    if (!BN_add_word(rnd, 1)) {
      goto err;
    }
  } else {
    if (!BN_add(rnd, rnd, rem)) {
      goto err;
    }
  }
  // we now have a random number 'rand' to test.
  const size_t num_primes = num_trial_division_primes(rnd);
loop:
  for (size_t i = 1; i < num_primes; i++) {
    // check that rnd is a prime. Reject when kPrimes[i] divides rnd
    // (mod == 0) or divides rnd-1 (mod == 1); the latter mirrors the
    // gcd(p-1, primes) == 1 requirement noted in |probable_prime_dh_safe|.
    if (bn_mod_u16_consttime(rnd, kPrimes[i]) <= 1) {
      // Stepping by |add| preserves the residue constraint above.
      if (!BN_add(rnd, rnd, add)) {
        goto err;
      }
      goto loop;
    }
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// probable_prime_dh_safe writes to |p| a |bits|-bit safe-prime candidate of
// the form p = 2q+1, where q is a random (bits-1)-bit number satisfying
// q mod (padd/2) == rem/2 (or == 1 when |rem| is NULL). Both p and q are
// stepped together by |padd| and |padd|/2 until neither has a divisor in the
// small-prime table. Returns one on success and zero on error.
static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
                                  const BIGNUM *rem, BN_CTX *ctx) {
  int ret = 0;
  BIGNUM *t1, *qadd, *q;
  // q has one bit fewer than p since p = 2q+1.
  bits--;
  BN_CTX_start(ctx);
  t1 = BN_CTX_get(ctx);
  q = BN_CTX_get(ctx);
  qadd = BN_CTX_get(ctx);
  // NOTE(review): only |qadd| is NULL-checked; presumably this relies on
  // |BN_CTX_get| failing for all subsequent calls once one call fails — verify
  // against the BN_CTX contract.
  if (qadd == NULL) {
    goto err;
  }
  if (!BN_rshift1(qadd, padd)) {
    goto err;
  }
  if (!BN_rand(q, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) {
    goto err;
  }
  // we need ((rnd-rem) % add) == 0
  if (!BN_mod(t1, q, qadd, ctx)) {
    goto err;
  }
  if (!BN_sub(q, q, t1)) {
    goto err;
  }
  if (rem == NULL) {
    if (!BN_add_word(q, 1)) {
      goto err;
    }
  } else {
    // The constraint on q is the halved constraint on p.
    if (!BN_rshift1(t1, rem)) {
      goto err;
    }
    if (!BN_add(q, q, t1)) {
      goto err;
    }
  }
  // we now have a random number 'rand' to test.
  // p = 2q + 1.
  if (!BN_lshift1(p, q)) {
    goto err;
  }
  if (!BN_add_word(p, 1)) {
    goto err;
  }
  const size_t num_primes = num_trial_division_primes(p);
loop:
  for (size_t i = 1; i < num_primes; i++) {
    // check that p and q are prime
    // check that for p and q
    // gcd(p-1,primes) == 1 (except for 2)
    if (bn_mod_u16_consttime(p, kPrimes[i]) == 0 ||
        bn_mod_u16_consttime(q, kPrimes[i]) == 0) {
      // Step both values together so p = 2q+1 is preserved.
      if (!BN_add(p, p, padd)) {
        goto err;
      }
      if (!BN_add(q, q, qadd)) {
        goto err;
      }
      goto loop;
    }
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,267 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <limits.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/rand.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// BN_rand sets |rnd| to a random |bits|-bit number. |top| selects the treatment
// of the most significant bits: BN_RAND_TOP_ANY leaves them random,
// BN_RAND_TOP_ONE forces the top bit set, and BN_RAND_TOP_TWO forces the top
// two bits set. |bottom| may force the result odd (BN_RAND_BOTTOM_ODD).
// Returns one on success and zero on error or invalid arguments.
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) {
  if (rnd == NULL) {
    return 0;
  }
  // Validate the |top| and |bottom| selectors before touching |rnd|.
  if (top != BN_RAND_TOP_ANY && top != BN_RAND_TOP_ONE &&
      top != BN_RAND_TOP_TWO) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  if (bottom != BN_RAND_BOTTOM_ANY && bottom != BN_RAND_BOTTOM_ODD) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  if (bits == 0) {
    // Zero bits requests the value zero.
    BN_zero(rnd);
    return 1;
  }
  // Reject |bits| so large that rounding up to a word count would overflow.
  if (bits > INT_MAX - (BN_BITS2 - 1)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  int words = (bits + BN_BITS2 - 1) / BN_BITS2;  // Words needed for |bits|.
  int bit = (bits - 1) % BN_BITS2;  // Index of the top bit in the top word.
  const BN_ULONG kOne = 1;
  const BN_ULONG kThree = 3;
  // Mask covering only the bits of the top word that belong to the result.
  BN_ULONG mask = bit < BN_BITS2 - 1 ? (kOne << (bit + 1)) - 1 : BN_MASK2;
  if (!bn_wexpand(rnd, words)) {
    return 0;
  }
  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  AWSLC_ABORT_IF_NOT_ONE(RAND_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)));
  FIPS_service_indicator_unlock_state();
  // Discard the excess random bits above |bits| in the top word.
  rnd->d[words - 1] &= mask;
  if (top != BN_RAND_TOP_ANY) {
    if (top == BN_RAND_TOP_TWO && bits > 1) {
      if (bit == 0) {
        // The top two bits straddle a word boundary: the highest bit is the
        // sole bit of the top word; the second-highest tops the word below.
        rnd->d[words - 1] |= 1;
        rnd->d[words - 2] |= kOne << (BN_BITS2 - 1);
      } else {
        rnd->d[words - 1] |= kThree << (bit - 1);
      }
    } else {
      // BN_RAND_TOP_ONE (or TOP_TWO with bits == 1): set only the top bit.
      rnd->d[words - 1] |= kOne << bit;
    }
  }
  if (bottom == BN_RAND_BOTTOM_ODD) {
    rnd->d[0] |= 1;
  }
  rnd->neg = 0;
  rnd->width = words;
  return 1;
}
// BN_pseudo_rand is an alias for |BN_rand|; the two entry points share one
// implementation here.
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) {
  return BN_rand(rnd, bits, top, bottom);
}
// bn_less_than_word_mask returns a mask of all ones if the number represented
// by |len| words at |a| is less than |b| and zero otherwise. It performs this
// computation in time independent of the value of |a|. |b| is assumed public.
static crypto_word_t bn_less_than_word_mask(const BN_ULONG *a, size_t len,
                                            BN_ULONG b) {
  if (b == 0) {
    // No non-negative value is less than zero.
    return CONSTTIME_FALSE_W;
  }
  if (len == 0) {
    // An empty |a| represents zero, which is less than any non-zero |b|.
    return CONSTTIME_TRUE_W;
  }
  // |a| < |b| iff a[1..len-1] are all zero and a[0] < b.
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  crypto_word_t mask = 0;
  // OR all high words together rather than branching on each, so the scan
  // does not leak which word (if any) is non-zero.
  for (size_t i = 1; i < len; i++) {
    mask |= a[i];
  }
  // |mask| is now zero iff a[1..len-1] are all zero.
  mask = constant_time_is_zero_w(mask);
  mask &= constant_time_lt_w(a[0], b);
  return mask;
}
// bn_in_range_words returns one if min_inclusive <= |a| < |max_exclusive|,
// where |a| and |max_exclusive| are |len| words long. The comparison is
// constant-time in the value of |a|; |min_inclusive| is assumed public.
int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive,
                      const BN_ULONG *max_exclusive, size_t len) {
  // In range iff |a| is not below the minimum and is below the maximum.
  const crypto_word_t not_below_min =
      ~bn_less_than_word_mask(a, len, min_inclusive);
  const crypto_word_t below_max = bn_less_than_words(a, max_exclusive, len);
  return not_below_min & below_max;
}
// bn_range_to_mask normalizes the range [min_inclusive, max_exclusive) into
// the number of significant |*out_words| of |max_exclusive| and a bitmask
// |*out_mask| covering its most significant word. Returns one on success and
// zero (with BN_R_INVALID_RANGE) if the range is empty.
static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask,
                            size_t min_inclusive, const BN_ULONG *max_exclusive,
                            size_t len) {
  // The magnitude of |max_exclusive| is assumed public, so it may be
  // normalized in variable time.
  size_t words = len;
  while (words > 0 && max_exclusive[words - 1] == 0) {
    words--;
  }

  // The range is empty if |max_exclusive| is zero or does not exceed
  // |min_inclusive|.
  if (words == 0 || (words == 1 && max_exclusive[0] <= min_inclusive)) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE);
    return 0;
  }

  // Smear the most significant set bit of the top word downwards, yielding a
  // mask with every bit at or below it set. Doubling the shift each round
  // covers all BN_BITS2 bits.
  BN_ULONG mask = max_exclusive[words - 1];
  for (unsigned shift = 1; shift < BN_BITS2; shift <<= 1) {
    mask |= mask >> shift;
  }

  *out_words = words;
  *out_mask = mask;
  return 1;
}
// bn_rand_range_words sets the |len| words at |out| to a uniformly random
// value in [min_inclusive, max_exclusive) by rejection sampling, mixing
// |additional_data| into the DRBG. It returns one on success and zero on
// error.
int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,
                        const BN_ULONG *max_exclusive, size_t len,
                        const uint8_t additional_data[RAND_PRED_RESISTANCE_LEN]) {
  // This function implements the equivalent of steps 4 through 7 of FIPS 186-4
  // appendices B.4.2 and B.5.2. When called in those contexts, |max_exclusive|
  // is n and |min_inclusive| is one.

  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  int ret = 0;

  // Compute the bit length of |max_exclusive| (step 1), in terms of a number of
  // |words| worth of entropy to fill and a mask of bits to clear in the top
  // word.
  size_t words;
  BN_ULONG mask;
  if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive, len)) {
    goto end;
  }

  // Fill any unused words with zero.
  OPENSSL_memset(out + words, 0, (len - words) * sizeof(BN_ULONG));

  // Bound the number of sampling attempts so a broken DRBG cannot loop
  // forever; hitting this limit is treated as an error.
  unsigned count = 100;
  do {
    if (!--count) {
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS);
      goto end;
    }

    // Steps 4 and 5. Use |words| and |mask| together to obtain a string of N
    // bits, where N is the bit length of |max_exclusive|.
    RAND_bytes_with_user_prediction_resistance((uint8_t *)out, words * sizeof(BN_ULONG),
                                               additional_data);
    out[words - 1] &= mask;

    // If out >= max_exclusive or out < min_inclusive, retry. This implements
    // the equivalent of steps 6 and 7 without leaking the value of |out|. The
    // result of this comparison may be treated as public. It only reveals how
    // many attempts were needed before we found a value in range. This is
    // independent of the final secret output, and has a distribution that
    // depends only on |min_inclusive| and |max_exclusive|, both of which are
    // public.
  } while (!constant_time_declassify_int(
      bn_in_range_words(out, min_inclusive, max_exclusive, words)));

  ret = 1;

end:
  FIPS_service_indicator_unlock_state();
  return ret;
}
// BN_rand_range_ex sets |r| to a uniformly random value in
// [min_inclusive, max_exclusive). It returns one on success and zero on
// error.
int BN_rand_range_ex(BIGNUM *r, BN_ULONG min_inclusive,
                     const BIGNUM *max_exclusive) {
  static const uint8_t kDefaultAdditionalData[RAND_PRED_RESISTANCE_LEN] = {0};
  const int width = max_exclusive->width;

  // Make room for the result, then sample it word-by-word.
  if (!bn_wexpand(r, width)) {
    return 0;
  }
  if (!bn_rand_range_words(r->d, min_inclusive, max_exclusive->d, width,
                           kDefaultAdditionalData)) {
    return 0;
  }

  r->neg = 0;
  r->width = width;
  return 1;
}
// bn_rand_secret_range sets |r| to a random value in
// [min_inclusive, max_exclusive) in a single pass, without revealing (via
// timing) whether the uniform sample had to be adjusted. |*out_is_uniform| is
// set to one if the sample was used as-is and zero if it was forced into
// range (and is therefore slightly biased). Returns one on success and zero
// on error.
int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive,
                         const BIGNUM *max_exclusive) {
  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  int ret = 0;
  size_t words;
  BN_ULONG mask;
  if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive->d,
                        max_exclusive->width) ||
      !bn_wexpand(r, words)) {
    goto end;
  }

  assert(words > 0);
  assert(mask != 0);
  // The range must be large enough for bit tricks to fix invalid values.
  if (words == 1 && min_inclusive > mask >> 1) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE);
    goto end;
  }

  // Select a uniform random number with num_bits(max_exclusive) bits.
  AWSLC_ABORT_IF_NOT_ONE(RAND_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)));
  r->d[words - 1] &= mask;

  // Check, in constant-time, if the value is in range.
  *out_is_uniform =
      bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words);
  crypto_word_t in_range = *out_is_uniform;
  in_range = 0 - in_range;  // all-ones if in range, zero otherwise

  // If the value is not in range, force it to be in range: OR-ing
  // |min_inclusive| into the low word guarantees r >= min_inclusive (the
  // |words == 1| check above ensures the AND below cannot clear those bits),
  // and clearing the top bit of the top word (|mask| >> 1) brings r below
  // |max_exclusive|, whose top word has that bit set by construction of
  // |mask|. Both operations are selected in constant time.
  r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive);
  r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1);
  declassify_assert(
      bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words));

  r->neg = 0;
  r->width = (int)words;
  ret = 1;

end:
  FIPS_service_indicator_unlock_state();
  return ret;
}
// BN_rand_range sets |r| to a uniformly random value in [0, range). It is
// |BN_rand_range_ex| with a minimum of zero.
int BN_rand_range(BIGNUM *r, const BIGNUM *range) {
  return BN_rand_range_ex(r, /*min_inclusive=*/0, range);
}
// BN_pseudo_rand_range is identical to |BN_rand_range|; it is retained only
// for API compatibility.
int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) {
  return BN_rand_range(r, range);
}

View File

@@ -0,0 +1,130 @@
// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2012, Intel Corporation. All Rights Reserved.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// SPDX-License-Identifier: Apache-2.0
#include "rsaz_exp.h"
#if defined(RSAZ_ENABLED)
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// rsaz_one is 1 in RSAZ's representation: one 29-bit limb per 64-bit word,
// 36 limbs padded up to 40.
alignas(64) static const BN_ULONG rsaz_one[40] = {
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// rsaz_two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is
// 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22): limb index 2 carries bit 22 of
// that limb, i.e. bit 2*29 + 22 = 80 of the full value.
alignas(64) static const BN_ULONG rsaz_two80[40] = {
    0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// RSAZ_1024_mod_exp_avx2 computes |base_norm|^|exponent| mod |m_norm| using a
// fixed 5-bit-window exponentiation built on the AVX2 RSAZ assembly
// primitives, writing the fully-reduced result to |result_norm|. See
// rsaz_exp.h for the full input contract.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0,
                            BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) {
  OPENSSL_STATIC_ASSERT(MOD_EXP_CTIME_ALIGN % 64 == 0,
                        MOD_EXP_CTIME_ALIGN_is_too_small)
  assert((uintptr_t)storage % 64 == 0);

  BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s;
  // Note |R2| aliases |table_s|.
  //
  // Choose the layout of the three 40-word (320-byte) scratch values so that
  // |m| does not straddle a 4096-byte page boundary. NOTE(review): this
  // requirement is inferred from the "should not cross page" comments below —
  // confirm against crypto/bn/asm/rsaz-avx2.pl.
  if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
    result = storage;
    a_inv = storage + 40;
    m = storage + 40 * 2;  // should not cross page
  } else {
    m = storage;  // should not cross page
    result = storage + 40;
    a_inv = storage + 40 * 2;
  }

  // Convert the modulus, base, and RR into RSAZ's base-2^29 representation.
  rsaz_1024_norm2red_avx2(m, m_norm);
  rsaz_1024_norm2red_avx2(a_inv, base_norm);
  rsaz_1024_norm2red_avx2(R2, RR);

  // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix,
  // giving R = 2^(36*29) = 2^1044.
  rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
  // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052
  rsaz_1024_mul_avx2(R2, R2, rsaz_two80, m, k0);
  // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2

  // table[0] = 1
  // table[1] = a_inv^1
  rsaz_1024_mul_avx2(result, R2, rsaz_one, m, k0);
  rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 0);
  rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

  // Seed the power-of-two entries by repeated squaring.
  // table[2] = a_inv^2
  rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 2);
  // table[4] = a_inv^4
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 4);
  // table[8] = a_inv^8
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 8);
  // table[16] = a_inv^16
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 16);

  // Fill in the remaining 31 entries: each odd index multiplies the previous
  // entry by a_inv, and doubling an index squares its entry.
  for (int i = 3; i < 32; i += 2) {
    // table[i] = table[i-1] * a_inv = a_inv^i
    rsaz_1024_gather5_avx2(result, table_s, i - 1);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, i);
    for (int j = 2 * i; j < 32; j *= 2) {
      // table[j] = table[j/2]^2 = a_inv^j
      rsaz_1024_sqr_avx2(result, result, m, k0, 1);
      rsaz_1024_scatter5_avx2(table_s, result, j);
    }
  }

  // Load the first window: the top five exponent bits (1019..1023), read from
  // the most significant byte of the little-endian exponent.
  const uint8_t *p_str = (const uint8_t *)exponent;
  int wvalue = p_str[127] >> 3;
  rsaz_1024_gather5_avx2(result, table_s, wvalue);

  int index = 1014;  // least significant bit of the next 5-bit window
  while (index > -1) {  // Loop for the remaining 203 windows.
    rsaz_1024_sqr_avx2(result, result, m, k0, 5);

    // Extract the window via an unaligned 16-bit load; |memcpy| avoids
    // strict-aliasing and alignment issues.
    uint16_t wvalue_16;
    memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16));
    wvalue = wvalue_16;
    wvalue = (wvalue >> (index % 8)) & 31;
    index -= 5;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  }

  // Square four times and fold in the final 4-bit window (exponent bits 0..3).
  rsaz_1024_sqr_avx2(result, result, m, k0, 4);
  wvalue = p_str[0] & 15;
  rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

  // Convert from Montgomery.
  rsaz_1024_mul_avx2(result, result, rsaz_one, m, k0);

  rsaz_1024_red2norm_avx2(result_norm, result);
  // |rsaz_1024_red2norm_avx2| may return a value up to and including the
  // modulus (see rsaz_exp.h), so reduce once more.
  BN_ULONG scratch[16];
  bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, scratch, 16);

  // Wipe the scratch buffer; it held secret intermediate values.
  OPENSSL_cleanse(storage, MOD_EXP_CTIME_STORAGE_LEN * sizeof(BN_ULONG));
}
#endif // RSAZ_ENABLED

View File

@@ -0,0 +1,324 @@
// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2012, Intel Corporation. All Rights Reserved.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
#define OPENSSL_HEADER_BN_RSAZ_EXP_H
#include <openssl/bn.h>
#include "internal.h"
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
!defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
#define RSAZ_ENABLED
// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
// temporary buffer that must be aligned to |MOD_EXP_CTIME_ALIGN| bytes.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
const BN_ULONG exponent[16],
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
BN_ULONG k0,
BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
// rsaz_avx2_capable returns one if the CPU supports the AVX2 instructions
// required by the RSAZ code path, and zero otherwise.
OPENSSL_INLINE int rsaz_avx2_capable(void) {
  return CRYPTO_is_AVX2_capable() ? 1 : 0;
}
// rsaz_avx2_preferred returns one if the RSAZ-AVX2 path is expected to be the
// fastest available modular-exponentiation implementation on this CPU.
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
  // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
  // .Lmulx4x_enter and .Lpowerx5_enter branches.
  const int mulx_preferred = CRYPTO_is_BMI1_capable() &&
                             CRYPTO_is_BMI2_capable() &&
                             CRYPTO_is_ADX_capable();
  return !mulx_preferred && CRYPTO_is_AVX2_capable();
}
// Assembly functions.
// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in
// 64-bit integers. This requires 36 limbs but padded up to 40.
//
// See crypto/bn/asm/rsaz-avx2.pl for further details.
// rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation
// and writes the result to |red|.
void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);
// rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|.
// Inputs and outputs are in Montgomery form, using RSAZ's representation. |k|
// is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);
// rsaz_1024_sqr_avx2 computes |a|^(2^|count|) mod |n| (i.e. |count| successive
// squarings) and writes the result to |ret|. Inputs and outputs are in
// Montgomery form, using RSAZ's representation. |k| is -|n|^-1 mod 2^64 or
// |n0| from |BN_MONT_CTX|.
void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
const BN_ULONG n[40], BN_ULONG k, int count);
// rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
// non-negative and at most 31. It is treated as public. Note the table only
// uses 18 |BN_ULONG|s per entry instead of 40. It packs two 29-bit limbs into
// each |BN_ULONG| and only stores 36 limbs rather than the padded 40.
void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
int i);
// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|. |i|
// must be non-negative and at most 31. It is treated as secret. |tbl| must be
// aligned to 32 bytes.
void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
int i);
// rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation
// and writes the result to |norm|. The result will be <= the modulus.
//
// WARNING: The result of this operation may not be fully reduced. |norm| may be
// the modulus instead of zero. This function should be followed by a call to
// |bn_reduce_once|.
void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#define RSAZ_512_ENABLED
// Dual Montgomery modular exponentiation using prime moduli of the
// same bit size, optimized with AVX512 ISA.
//
// Computes res|i| = base|i| ^ exp|i| mod m|i|.
//
// Input and output parameters for each exponentiation are independent and
// denoted here by index |i|, i = 1..2.
//
// Input and output are all in regular 2^64 radix.
//
// Each moduli shall be |modlen| bit size.
//
// Supported cases:
// - 2x1024
// - 2x1536
// - 2x2048
//
// [out] res|i| - result of modular exponentiation: array of qword values
// in regular (2^64) radix. Size of array shall be enough
// to hold |modlen| bits.
// [in] base|i| - base
// [in] exp|i| - exponent
// [in] m|i| - moduli
// [in] rr|i| - Montgomery parameter RR = R^2 mod m|i|
// [in] k0_|i| - Montgomery parameter k0 = -1/m|i| mod 2^64
// [in] modlen - moduli bit size
//
// \return 0 in case of failure,
// 1 in case of success.
//
// NB: This function does not do any checks on its arguments, its
// caller, `BN_mod_exp_mont_consttime_x2`, checks args. It should be
// the function used directly.
int RSAZ_mod_exp_avx512_x2(uint64_t *res1,
const uint64_t *base1,
const uint64_t *exponent1,
const uint64_t *m1,
const uint64_t *RR1,
uint64_t k0_1,
uint64_t *res2,
const uint64_t *base2,
const uint64_t *exponent2,
const uint64_t *m2,
const uint64_t *RR2,
uint64_t k0_2,
int modlen);
// Naming convention for the following functions:
//
// * amm: Almost Montgomery Multiplication
// * ams: Almost Montgomery Squaring
// * 52xZZ: data represented as array of ZZ digits in 52-bit radix
// * _x1_/_x2_: 1 or 2 independent inputs/outputs
// * ifma256: uses 256-bit wide IFMA ISA (AVX512_IFMA256)
//
//
// Almost Montgomery Multiplication (AMM) for 20-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high
// bits zeroed. |k0| is a Montgomery coefficient, which is here k0 =
// -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 1040 > 1024 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x20_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 20-digit number in radix
// 2^52
//
// See description of rsaz_amm52x20_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
void rsaz_amm52x20_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent
// base values and two independent moduli. The precomputed powers of
// the base values are stored contiguously in the table.
//
// Extracted value (output) is 2 20 digit numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x20_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
// Almost Montgomery Multiplication (AMM) for 30-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high
// bits zeroed
//
// NOTE: the function uses zero-padded data - 2 high QWs is a padding.
//
// |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 1560 > 1536 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x30_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 30-digit number in radix
// 2^52
//
// See description of rsaz_amm52x30_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
//
// NOTE: the function uses zero-padded data - 2 high QWs is a padding.
void rsaz_amm52x30_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent
// base values. |red_table_idx1| and |red_table_idx2| are
// corresponding power indexes.
//
// Extracted value (output) is 2 (30 + 2) digits numbers in 2^52
// radix. (2 high QW is zero padding)
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x30_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
// Almost Montgomery Multiplication (AMM) for 40-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high
// bits zeroed. |k0| is a Montgomery coefficient, which is here k0 =
// -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 2080 > 2048 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x40_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 40-digit number in radix
// 2^52
//
// See description of rsaz_amm52x40_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
void rsaz_amm52x40_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers base^i, where
// i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent base values.
// |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
//
// Extracted value (output) is 2 40 digits numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x40_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
#endif // !MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H

View File

@@ -0,0 +1,616 @@
// Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2020-2021, Intel Corporation. All Rights Reserved.
//
// Originally written by Sergey Kirillov and Andrey Matyukov. Special
// thanks to Ilya Albrekht for his valuable hints.
//
// Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
#ifdef RSAZ_512_ENABLED
#include <openssl/crypto.h>
#include <assert.h>
#include "../../internal.h"
#include "rsaz_exp.h"
// Internal radix
# define DIGIT_SIZE (52)
// 52-bit mask
# define DIGIT_MASK ((uint64_t)0xFFFFFFFFFFFFF)
# define BITS2WORD8_SIZE(x) (((x) + 7) >> 3)
# define BITS2WORD64_SIZE(x) (((x) + 63) >> 6)
// Number of registers required to hold |digits_num| amount of qword
// digits
# define NUMBER_OF_REGISTERS(digits_num, register_size) \
(((digits_num) * 64 + (register_size) - 1) / (register_size))
OPENSSL_INLINE uint64_t get_digit(const uint8_t *in, int in_len);
OPENSSL_INLINE void put_digit(uint8_t *out, int out_len, uint64_t digit);
static void to_words52(uint64_t *out, int out_len, const uint64_t *in,
int in_bitsize);
static void from_words52(uint64_t *bn_out, int out_bitsize, const uint64_t *in);
OPENSSL_INLINE void set_bit(uint64_t *a, int idx);
// number_of_digits returns ceil(bitsize / digit_size): the number of
// |digit_size|-bit digits needed to hold a |bitsize|-bit value.
OPENSSL_INLINE int number_of_digits(int bitsize, int digit_size)
{
    const int rounded_up = bitsize + digit_size - 1;
    return rounded_up / digit_size;
}
// Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of
// the same bit size using Almost Montgomery Multiplication, optimized with
// AVX512_IFMA256 ISA.
//
// The parameter w (window size) = 5.
//
// [out] res - result of modular exponentiation: 2x{20,30,40} qword
// values in 2^52 radix.
// [in] base - base (2x{20,30,40} qword values in 2^52 radix)
// [in] exp - array of 2 pointers to {16,24,32} qword values in 2^64 radix.
// Exponent is not converted to redundant representation.
// [in] m - moduli (2x{20,30,40} qword values in 2^52 radix)
// [in] rr - Montgomery parameter for 2 moduli:
// RR(1024) = 2^2080 mod m.
// RR(1536) = 2^3120 mod m.
// RR(2048) = 2^4160 mod m.
// (2x{20,30,40} qword values in 2^52 radix)
// [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64
//
// \return (void).
static int rsaz_mod_exp_x2_ifma256(uint64_t *res, const uint64_t *base,
const uint64_t *exp[2], const uint64_t *m,
const uint64_t *rr, const uint64_t k0[2],
int modlen);
// NB: This function does not do any checks on its arguments, its
// caller `BN_mod_exp_mont_consttime_x2`, checks args. It should be
// the function used directly.
// RSAZ_mod_exp_avx512_x2 performs two independent modular exponentiations,
// res|i| = base|i|^exp|i| mod m|i| for i = 1, 2, using the AVX512_IFMA256
// Almost Montgomery Multiplication routines. All values are in regular 2^64
// radix; see rsaz_exp.h for the full contract. Returns one on success and
// zero on failure.
int RSAZ_mod_exp_avx512_x2(uint64_t *res1,
                           const uint64_t *base1,
                           const uint64_t *exp1,
                           const uint64_t *m1,
                           const uint64_t *rr1,
                           uint64_t k0_1,
                           uint64_t *res2,
                           const uint64_t *base2,
                           const uint64_t *exp2,
                           const uint64_t *m2,
                           const uint64_t *rr2,
                           uint64_t k0_2,
                           int modlen)
{
#ifdef BORINGSSL_DISPATCH_TEST
    BORINGSSL_function_hit[8] = 1;
#endif
    typedef void (*AMM)(uint64_t *res, const uint64_t *a,
                        const uint64_t *b, const uint64_t *m, uint64_t k0);
    int ret = 0;

    // Number of word-size (uint64_t) digits to store values in
    // redundant representation.
    int red_digits = number_of_digits(modlen + 2, DIGIT_SIZE);

    // n = modlen, d = DIGIT_SIZE, s = d * ceil((n+2)/d) > n
    // k = 4 * (s - n) = bitlen_diff
    //
    // Given the Montgomery domain conversion value RR = R^2 mod m[i]
    // = 2^2n mod m[i] and that for the larger representation in s
    // bits, RR' = R'^2 mod m[i] = 2^2s mod m[i], bitlen_diff is
    // needed to convert from RR to RR' as explained below in its
    // calculation.
    int bitlen_diff = 4 * (DIGIT_SIZE * red_digits - modlen);

    // Number of YMM registers required to store a value
    int num_ymm_regs = NUMBER_OF_REGISTERS(red_digits, 256);
    // Capacity of the register set (in qwords = 64-bits) to store a
    // value
    int regs_capacity = num_ymm_regs * 4;

    // The following 7 values are in redundant representation and are
    // to be stored contiguously in storage_aligned as needed by the
    // function rsaz_mod_exp_x2_ifma256.
    uint64_t *base1_red, *m1_red, *rr1_red;
    uint64_t *base2_red, *m2_red, *rr2_red;
    uint64_t *coeff_red;
    uint64_t *storage = NULL;
    uint64_t *storage_aligned = NULL;
    int storage_len_bytes = 7 * regs_capacity * sizeof(uint64_t)
                           + 64; // alignment

    const uint64_t *exp[2] = {0};
    uint64_t k0[2] = {0};
    // AMM = Almost Montgomery Multiplication
    AMM amm = NULL;

    // Select the single-input AMM routine for the modulus width; only
    // 1024-, 1536-, and 2048-bit moduli are supported.
    switch (modlen) {
    case 1024:
        amm = rsaz_amm52x20_x1_ifma256;
        break;
    case 1536:
        amm = rsaz_amm52x30_x1_ifma256;
        break;
    case 2048:
        amm = rsaz_amm52x40_x1_ifma256;
        break;
    default:
        goto err;
    }

    storage = (uint64_t *)OPENSSL_malloc(storage_len_bytes);
    if (storage == NULL)
        goto err;
    storage_aligned = (uint64_t *)align_pointer(storage, 64);

    // Memory layout for red(undant) representations
    base1_red = storage_aligned;
    base2_red = storage_aligned + 1 * regs_capacity;
    m1_red = storage_aligned + 2 * regs_capacity;
    m2_red = storage_aligned + 3 * regs_capacity;
    rr1_red = storage_aligned + 4 * regs_capacity;
    rr2_red = storage_aligned + 5 * regs_capacity;
    coeff_red = storage_aligned + 6 * regs_capacity;

    // Convert base_i, m_i, rr_i, from regular to 52-bit radix
    to_words52(base1_red, regs_capacity, base1, modlen);
    to_words52(base2_red, regs_capacity, base2, modlen);
    to_words52(m1_red, regs_capacity, m1, modlen);
    to_words52(m2_red, regs_capacity, m2, modlen);
    to_words52(rr1_red, regs_capacity, rr1, modlen);
    to_words52(rr2_red, regs_capacity, rr2, modlen);

    // Based on the definition of n and s above, we have
    // R = 2^n mod m; RR = R^2 mod m
    // R' = 2^s mod m; RR' = R'^2 mod m
    // To obtain R'^2 from R^2:
    // - Let t = AMM(RR, RR) = R^4 / R' mod m -- (1)
    // - Note that R'^4 = R^4 * 2^{4*(s-n)} mod m
    // - Let k = 4 * (s - n)
    // - We have AMM(t, 2^k) = R^4 * 2^{4*(s-n)} / R'^2 mod m -- (2)
    //                       = R'^4 / R'^2 mod m
    //                       = R'^2 mod m
    // For example, for n = 1024, s = 1040, k = 64,
    // RR = 2^2048 mod m, RR' = 2^2080 mod m
    OPENSSL_memset(coeff_red, 0, red_digits * sizeof(uint64_t));
    // coeff_red = 2^k = 1 << bitlen_diff taking into account the
    // redundant representation in digits of DIGIT_SIZE bits: digit index
    // bitlen_diff / DIGIT_SIZE occupies a full 64-bit word.
    set_bit(coeff_red, 64 * (int)(bitlen_diff / DIGIT_SIZE) + bitlen_diff % DIGIT_SIZE);

    amm(rr1_red, rr1_red, rr1_red, m1_red, k0_1);   // (1) for m1
    amm(rr1_red, rr1_red, coeff_red, m1_red, k0_1); // (2) for m1

    amm(rr2_red, rr2_red, rr2_red, m2_red, k0_2);   // (1) for m2
    amm(rr2_red, rr2_red, coeff_red, m2_red, k0_2); // (2) for m2

    exp[0] = exp1;
    exp[1] = exp2;

    k0[0] = k0_1;
    k0[1] = k0_2;

    // Compute res|i| = base|i| ^ exp|i| mod m|i| in parallel in
    // their contiguous form. The dual result is written contiguously
    // starting at |rr1_red|, so it covers |rr2_red| as well.
    ret = rsaz_mod_exp_x2_ifma256(rr1_red, base1_red, exp, m1_red, rr1_red,
                                  k0, modlen);
    if (!ret)
        goto err;

    // Convert rr_i back to regular radix
    from_words52(res1, modlen, rr1_red);
    from_words52(res2, modlen, rr2_red);

    // bn_reduce_once_in_place expects number of uint64_t, not bit
    // size
    modlen /= sizeof(uint64_t) * 8;
    bn_reduce_once_in_place(res1, 0, m1, storage, modlen);
    bn_reduce_once_in_place(res2, 0, m2, storage, modlen);

err:
    // |storage| held secret intermediate values; wipe before freeing.
    if (storage != NULL) {
        OPENSSL_cleanse(storage, storage_len_bytes);
        OPENSSL_free(storage);
    }
    return ret;
}
int rsaz_mod_exp_x2_ifma256(uint64_t *out,
const uint64_t *base,
const uint64_t *exp[2],
const uint64_t *m,
const uint64_t *rr,
const uint64_t k0[2],
int modlen)
{
typedef void (*DAMM)(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
typedef void (*DEXTRACT)(uint64_t *res, const uint64_t *red_table,
int red_table_idx, int tbl_idx);
int ret = 0;
int idx;
// Exponent window size
int exp_win_size = 5;
int two_to_exp_win_size = 1U << exp_win_size;
int exp_win_mask = two_to_exp_win_size - 1;
// Number of digits (64-bit words) in redundant representation to
// handle modulus bits
int red_digits = 0;
// Number of digits (64-bit words) to store the two exponents,
// found in `exp`.
int exp_digits = 0;
uint64_t *storage = NULL;
uint64_t *storage_aligned = NULL;
int storage_len_bytes = 0;
// Red(undant) result Y and multiplier X
uint64_t *red_Y = NULL; // [2][red_digits]
uint64_t *red_X = NULL; // [2][red_digits]
/* Pre-computed table of base powers */
uint64_t *red_table = NULL; // [two_to_exp_win_size][2][red_digits]
// Expanded exponent
uint64_t *expz = NULL; // [2][exp_digits + 1]
// Dual AMM
DAMM damm = NULL;
// Extractor from red_table
DEXTRACT extract = NULL;
// Squaring is done using multiplication now. That can be a subject of
// optimization in future.
# define DAMS(r,a,m,k0) damm((r),(a),(a),(m),(k0))
switch (modlen) {
case 1024:
red_digits = 20;
exp_digits = 16;
damm = rsaz_amm52x20_x2_ifma256;
extract = extract_multiplier_2x20_win5;
break;
case 1536:
// Extended with 2 digits padding to avoid mask ops in high YMM register
red_digits = 30 + 2;
exp_digits = 24;
damm = rsaz_amm52x30_x2_ifma256;
extract = extract_multiplier_2x30_win5;
break;
case 2048:
red_digits = 40;
exp_digits = 32;
damm = rsaz_amm52x40_x2_ifma256;
extract = extract_multiplier_2x40_win5;
break;
default:
goto err;
}
// allocate space for 2x num digits, aligned because the data in
// the vectors need to be 64-bit aligned.
storage_len_bytes = (2 * red_digits // red_Y
+ 2 * red_digits // red_X
+ 2 * red_digits * two_to_exp_win_size // red_table
+ 2 * (exp_digits + 1)) // expz
* sizeof(uint64_t)
+ 64; // alignment
storage = (uint64_t *)OPENSSL_malloc(storage_len_bytes);
if (storage == NULL)
goto err;
OPENSSL_cleanse(storage, storage_len_bytes);
storage_aligned = (uint64_t *)align_pointer(storage, 64);
red_Y = storage_aligned;
red_X = red_Y + 2 * red_digits;
red_table = red_X + 2 * red_digits;
expz = red_table + 2 * red_digits * two_to_exp_win_size;
// Compute table of powers base^i mod m,
// i = 0, ..., (2^EXP_WIN_SIZE) - 1
// using the dual multiplication. Each table entry contains
// base1^i mod m1, then base2^i mod m2.
red_X[0 * red_digits] = 1;
red_X[1 * red_digits] = 1;
damm(&red_table[0 * 2 * red_digits], (const uint64_t*)red_X, rr, m, k0);
damm(&red_table[1 * 2 * red_digits], base, rr, m, k0);
for (idx = 1; idx < (int)(two_to_exp_win_size / 2); idx++) {
DAMS(&red_table[(2 * idx + 0) * 2 * red_digits],
&red_table[(1 * idx) * 2 * red_digits], m, k0);
damm(&red_table[(2 * idx + 1) * 2 * red_digits],
&red_table[(2 * idx) * 2 * red_digits],
&red_table[1 * 2 * red_digits], m, k0);
}
// Copy and expand exponents
memcpy(&expz[0 * (exp_digits + 1)], exp[0], exp_digits * sizeof(uint64_t));
expz[1 * (exp_digits + 1) - 1] = 0;
memcpy(&expz[1 * (exp_digits + 1)], exp[1], exp_digits * sizeof(uint64_t));
expz[2 * (exp_digits + 1) - 1] = 0;
// Exponentiation
//
// This is Algorithm 3 in iacr 2011-239 which is cited below as
// well.
//
// Rather than compute base^{exp} in one shot, the powers of
// base^i for i = [0..2^{exp_win_size}) are precomputed and stored
// in `red_table`. Each window of the exponent is then used as an
// index to look up the power in the table, and then that result
// goes through a "series of squaring", which repositions it with
// respect to where it appears in the complete exponent. That
// result is then multiplied by the previous result.
//
// The `extract` routine does the lookup, `DAMS` wraps the `damm`
// routine to set up squaring, while `damm` is the AMM
// routine. That is what you find happening in each iteration of
// this loop—the stepping through the exponent one
// `win_exp_size`-bit window at a time.
{
const int rem = modlen % exp_win_size;
const uint64_t table_idx_mask = exp_win_mask;
int exp_bit_no = modlen - rem;
int exp_chunk_no = exp_bit_no / 64;
int exp_chunk_shift = exp_bit_no % 64;
uint64_t red_table_idx_1, red_table_idx_2;
// `rem` is { 1024, 1536, 2048 } % 5 which is { 4, 1, 3 }
// respectively.
//
// If this assertion ever fails then we should set this easy
// fix exp_bit_no = modlen - exp_win_size
assert(rem == 4 || rem == 1 || rem == 3);
// Find the location of the 5-bit window in the exponent which
// is stored in 64-bit digits. Left pad it with 0s to form a
// 64-bit digit to become an index in the precomputed table.
// The window location in the exponent is identified by its
// least significant bit `exp_bit_no`.
#define EXP_CHUNK(i) (exp_chunk_no) + ((i) * (exp_digits + 1))
#define EXP_CHUNK1(i) (exp_chunk_no) + 1 + ((i) * (exp_digits + 1))
// Process 1-st exp window - just init result
red_table_idx_1 = expz[EXP_CHUNK(0)];
red_table_idx_2 = expz[EXP_CHUNK(1)];
// The function operates with fixed moduli sizes divisible by
// 64, thus table index here is always in supported range [0,
// EXP_WIN_SIZE).
red_table_idx_1 >>= exp_chunk_shift;
red_table_idx_2 >>= exp_chunk_shift;
extract(&red_Y[0 * red_digits], (const uint64_t*)red_table,
(int)red_table_idx_1, (int)red_table_idx_2);
// Process other exp windows
for (exp_bit_no -= exp_win_size; exp_bit_no >= 0; exp_bit_no -= exp_win_size) {
// Extract pre-computed multiplier from the table
{
uint64_t T;
exp_chunk_no = exp_bit_no / 64;
exp_chunk_shift = exp_bit_no % 64;
{
red_table_idx_1 = expz[EXP_CHUNK(0)];
T = expz[EXP_CHUNK1(0)];
red_table_idx_1 >>= exp_chunk_shift;
// Get additional bits from then next quadword
// when 64-bit boundaries are crossed.
if (exp_chunk_shift > 64 - exp_win_size) {
T <<= (64 - exp_chunk_shift);
red_table_idx_1 ^= T;
}
red_table_idx_1 &= table_idx_mask;
}
{
red_table_idx_2 = expz[EXP_CHUNK(1)];
T = expz[EXP_CHUNK1(1)];
red_table_idx_2 >>= exp_chunk_shift;
// Get additional bits from then next quadword
// when 64-bit boundaries are crossed.
if (exp_chunk_shift > 64 - exp_win_size) {
T <<= (64 - exp_chunk_shift);
red_table_idx_2 ^= T;
}
red_table_idx_2 &= table_idx_mask;
}
extract(&red_X[0 * red_digits], (const uint64_t*)red_table,
(int)red_table_idx_1, (int)red_table_idx_2);
}
// The number of squarings is equal to the window size.
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
damm((uint64_t*)red_Y, (const uint64_t*)red_Y, (const uint64_t*)red_X, m, k0);
}
}
// NB: After the last AMM of exponentiation in Montgomery domain, the result
// may be (modlen + 1), but the conversion out of Montgomery domain
// performs an AMM(x,1) which guarantees that the final result is less than
// |m|, so no conditional subtraction is needed here. See [1] for details.
//
// [1] Gueron, S. Efficient software implementations of modular exponentiation.
// DOI: 10.1007/s13389-012-0031-5
// Convert exponentiation result out of Montgomery form but still
// in the redundant DIGIT_SIZE-bit representation.
memset(red_X, 0, 2 * red_digits * sizeof(uint64_t));
red_X[0 * red_digits] = 1;
red_X[1 * red_digits] = 1;
damm(out, (const uint64_t*)red_Y, (const uint64_t*)red_X, m, k0);
ret = 1;
err:
if (storage != NULL) {
// Clear whole storage
OPENSSL_cleanse(storage, storage_len_bytes);
OPENSSL_free(storage);
}
#undef DAMS
return ret;
}
// Assemble a 64-bit value from the |in_len| little-endian bytes at |in|.
// |in_len| must be at most 8.
OPENSSL_INLINE uint64_t get_digit(const uint8_t *in, int in_len)
{
    uint64_t digit = 0;
    int i;

    assert(in != NULL);
    assert(in_len <= 8);

    for (i = 0; i < in_len; i++) {
        digit |= ((uint64_t)in[i]) << (8 * i);
    }
    return digit;
}
// Convert an array of 64-bit words |in|, spanning |in_bitsize| bits, into the
// redundant base-2^52 representation: each output word carries DIGIT_SIZE
// (52) payload bits so the IFMA multiply/add instructions have headroom for
// carries. Exactly |out_len| digits are written to |out|; digits beyond the
// converted input are zeroed.
//
// NOTE(review): the byte-wise repacking (8-byte memcpy, then advancing the
// byte cursor by 6 or 7) assumes a little-endian representation of |in|.
static void to_words52(uint64_t *out, int out_len,
                       const uint64_t *in, int in_bitsize)
{
    // View the input as bytes without casting away const; the input is never
    // written through |in_str|.
    const uint8_t *in_str = NULL;
    assert(out != NULL);
    assert(in != NULL);
    // Check destination buffer capacity
    assert(out_len >= number_of_digits(in_bitsize, DIGIT_SIZE));
    in_str = (const uint8_t *)in;
    // Main loop: consume 104 bits (13 bytes) per iteration, producing two
    // 52-bit digits.
    for (; in_bitsize >= (2 * DIGIT_SIZE); in_bitsize -= (2 * DIGIT_SIZE), out += 2) {
        uint64_t digit;
        memcpy(&digit, in_str, sizeof(digit));
        out[0] = digit & DIGIT_MASK;
        in_str += 6;
        memcpy(&digit, in_str, sizeof(digit));
        // The second digit starts 4 bits into this 8-byte load.
        out[1] = (digit >> 4) & DIGIT_MASK;
        in_str += 7;
        out_len -= 2;
    }
    // Tail: fewer than 104 bits remain. |get_digit| is used so we never read
    // past the end of the input.
    if (in_bitsize > DIGIT_SIZE) {
        uint64_t digit = get_digit(in_str, 7);
        out[0] = digit & DIGIT_MASK;
        in_str += 6;
        in_bitsize -= DIGIT_SIZE;
        digit = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
        out[1] = digit >> 4;
        out += 2;
        out_len -= 2;
    } else if (in_bitsize > 0) {
        out[0] = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
        out++;
        out_len--;
    }
    // Zero-fill any remaining output digits.
    while (out_len > 0) {
        *out = 0;
        out_len--;
        out++;
    }
}
// Store the low |out_len| bytes of |digit| into |out| in little-endian
// order. |out_len| must be at most 8.
OPENSSL_INLINE void put_digit(uint8_t *out, int out_len, uint64_t digit)
{
    int i;

    assert(out != NULL);
    assert(out_len <= 8);

    for (i = 0; i < out_len; i++) {
        out[i] = (uint8_t)(digit >> (8 * i));
    }
}
// Convert array of words in redundant (base=2^52) representation to
// array of words in regular (base=2^64) one. This is because the
// multiply/add instruction uses 52-bit representations to leave room
// for carries.
//
// |out| receives BITS2WORD64_SIZE(out_bitsize) words; |in| supplies 52-bit
// digits. NOTE(review): the overlapping 8-byte memcpy stores below rely on a
// little-endian byte layout of |out| — confirm if porting.
static void from_words52(uint64_t *out, int out_bitsize, const uint64_t *in)
{
    int i;
    int out_len = BITS2WORD64_SIZE(out_bitsize);
    assert(out != NULL);
    assert(in != NULL);
    // Clear the destination first; the byte-wise stores below only touch the
    // bytes actually covered by the input digits.
    for (i = 0; i < out_len; i++)
        out[i] = 0;
    {
        uint8_t *out_str = (uint8_t *)out;
        // Main loop: emit 104 bits (13 bytes) per iteration from two 52-bit
        // digits. Each 8-byte store is partially overwritten by the next one
        // (the cursor advances only 6 then 7 bytes).
        for (; out_bitsize >= (2 * DIGIT_SIZE);
             out_bitsize -= (2 * DIGIT_SIZE), in += 2) {
            uint64_t digit;
            digit = in[0];
            memcpy(out_str, &digit, sizeof(digit));
            out_str += 6;
            // Bits 48..51 of in[0] join the low bits of in[1], shifted into
            // position 4.
            digit = digit >> 48 | in[1] << 4;
            memcpy(out_str, &digit, sizeof(digit));
            out_str += 7;
        }
        // Tail: fewer than 104 bits remain. |put_digit| bounds the stores so
        // we never write past the end of |out|.
        if (out_bitsize > DIGIT_SIZE) {
            put_digit(out_str, 7, in[0]);
            out_str += 6;
            out_bitsize -= DIGIT_SIZE;
            put_digit(out_str, BITS2WORD8_SIZE(out_bitsize),
                      (in[1] << 4 | in[0] >> 48));
        } else if (out_bitsize) {
            put_digit(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]);
        }
    }
}
// Set the bit at index |idx| (counting from the least-significant bit of
// a[0]) in the word array |a|. No bounds checking is performed; the caller
// must ensure |idx| is valid.
OPENSSL_INLINE void set_bit(uint64_t *a, int idx)
{
    assert(a != NULL);
    a[idx / BN_BITS2] |= ((uint64_t)1) << (idx % BN_BITS2);
}
#endif

View File

@@ -0,0 +1,311 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/type_check.h>
#include "internal.h"
// BN_lshift sets |r| to |a| shifted left by |n| bits. |r| and |a| may alias.
// Returns one on success and zero on allocation failure or if |n| is
// negative.
int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) {
  int i, nw, lb, rb;
  BN_ULONG *t, *f;
  BN_ULONG l;
  if (n < 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  r->neg = a->neg;
  // Split the shift into |nw| whole words and |lb| remaining bits.
  nw = n / BN_BITS2;
  if (!bn_wexpand(r, a->width + nw + 1)) {
    return 0;
  }
  lb = n % BN_BITS2;
  // |rb| is only used in the |lb| != 0 branch, so the shift by |rb| below is
  // never a (undefined) full-width shift.
  rb = BN_BITS2 - lb;
  f = a->d;
  t = r->d;
  t[a->width + nw] = 0;
  if (lb == 0) {
    // Word-aligned shift. Copy from the top down so the loop is safe when
    // |r| aliases |a|.
    for (i = a->width - 1; i >= 0; i--) {
      t[nw + i] = f[i];
    }
  } else {
    // General shift, also top-down for in-place safety. Each source word
    // contributes its high |lb| bits to the word above (merged with |=,
    // since that word already holds bits from the previous iteration) and
    // its remaining bits to the current word.
    for (i = a->width - 1; i >= 0; i--) {
      l = f[i];
      t[nw + i + 1] |= l >> rb;
      t[nw + i] = l << lb;
    }
  }
  // Zero the |nw| low words vacated by the shift.
  OPENSSL_memset(t, 0, nw * sizeof(t[0]));
  r->width = a->width + nw + 1;
  bn_set_minimal_width(r);
  return 1;
}
// BN_lshift1 sets |r| to |a| shifted left by one bit (doubled). |r| and |a|
// may alias. Returns one on success and zero on allocation failure.
int BN_lshift1(BIGNUM *r, const BIGNUM *a) {
  BN_ULONG *ap, *rp, t, c;
  int i;
  if (r != a) {
    r->neg = a->neg;
    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
    r->width = a->width;
  } else {
    // In-place: |r| already has |a|'s width and sign; just reserve room for
    // a possible carry word.
    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
  }
  ap = a->d;
  rp = r->d;
  c = 0;  // Carry out of the previous word.
  for (i = 0; i < a->width; i++) {
    t = *(ap++);
    *(rp++) = (t << 1) | c;
    c = t >> (BN_BITS2 - 1);
  }
  // A final carry grows the result by one word.
  if (c) {
    *rp = 1;
    r->width++;
  }
  return 1;
}
// bn_rshift_words writes |a| >> |shift| into |r|, where both are arrays of
// |num| words. |r| and |a| may alias: the loop runs from low to high indices,
// so each source word is consumed before it could be overwritten.
void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift,
                     size_t num) {
  const size_t word_shift = shift / BN_BITS2;
  const unsigned bit_shift = shift % BN_BITS2;

  // Shifting out the entire array yields zero.
  if (word_shift >= num) {
    OPENSSL_memset(r, 0, num * sizeof(BN_ULONG));
    return;
  }

  const size_t kept = num - word_shift;
  if (bit_shift == 0) {
    // Word-aligned shift is a plain move.
    OPENSSL_memmove(r, a + word_shift, kept * sizeof(BN_ULONG));
  } else {
    size_t i;
    for (i = 0; i + 1 < kept; i++) {
      r[i] = (a[i + word_shift] >> bit_shift) |
             (a[i + word_shift + 1] << (BN_BITS2 - bit_shift));
    }
    r[kept - 1] = a[num - 1] >> bit_shift;
  }

  // Zero the vacated high words.
  OPENSSL_memset(r + kept, 0, word_shift * sizeof(BN_ULONG));
}
// BN_rshift sets |r| to |a| shifted right by |n| bits. |r| and |a| may
// alias. Returns one on success and zero on allocation failure or if |n|
// is negative.
int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
  if (n < 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (!bn_wexpand(r, a->width)) {
    return 0;
  }
  bn_rshift_words(r->d, a->d, n, a->width);
  r->neg = a->neg;
  r->width = a->width;
  bn_set_minimal_width(r);
  return 1;
}
// bn_rshift_secret_shift behaves like |BN_rshift| but treats the shift
// amount |n| as secret: its memory access pattern and running time depend
// only on |a->width| (which is public), not on |n| or the value of |a|.
// Returns one on success and zero on error.
int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a, unsigned n,
                           BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (tmp == NULL ||
      !BN_copy(r, a) ||
      !bn_wexpand(tmp, r->width)) {
    goto err;
  }
  // Shift conditionally by powers of two: for each bit of |n|, compute the
  // shifted value unconditionally into |tmp| and use a constant-time select
  // to either keep or discard it, so no branch depends on |n|.
  unsigned max_bits = BN_BITS2 * r->width;
  for (unsigned i = 0; (max_bits >> i) != 0; i++) {
    // |mask| is all-ones when bit |i| of |n| is set, all-zeros otherwise.
    BN_ULONG mask = (n >> i) & 1;
    mask = 0 - mask;
    bn_rshift_words(tmp->d, r->d, 1u << i, r->width);
    bn_select_words(r->d, mask, tmp->d /* apply shift */,
                    r->d /* ignore shift */, r->width);
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_rshift1_words writes |a| >> 1 into |r|; both are arrays of |num|
// words. |r| and |a| may alias, since each source word is read before the
// destination slot below it is written.
void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num) {
  size_t i;
  if (num == 0) {
    return;
  }
  for (i = 1; i < num; i++) {
    r[i - 1] = (a[i - 1] >> 1) | (a[i] << (BN_BITS2 - 1));
  }
  r[num - 1] = a[num - 1] >> 1;
}
// BN_rshift1 sets |r| to |a| shifted right by one bit (halved, rounding
// toward zero in magnitude). |r| and |a| may alias. Returns one on success
// and zero on allocation failure.
int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
  if (!bn_wexpand(r, a->width)) {
    return 0;
  }
  bn_rshift1_words(r->d, a->d, a->width);
  r->neg = a->neg;
  r->width = a->width;
  bn_set_minimal_width(r);
  return 1;
}
// BN_set_bit sets bit |n| of |a| to one, growing |a| as needed. Returns one
// on success and zero on allocation failure or if |n| is negative.
int BN_set_bit(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word >= a->width) {
    // Grow |a| and zero-fill the newly exposed words.
    if (!bn_wexpand(a, word + 1)) {
      return 0;
    }
    OPENSSL_memset(a->d + a->width, 0,
                   (word + 1 - a->width) * sizeof(BN_ULONG));
    a->width = word + 1;
  }

  a->d[word] |= ((BN_ULONG)1) << bit;
  return 1;
}
// BN_clear_bit sets bit |n| of |a| to zero. Returns one on success and zero
// if |n| is negative or beyond the width of |a| (in which case the bit was
// already zero).
int BN_clear_bit(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word >= a->width) {
    return 0;
  }

  a->d[word] &= ~(((BN_ULONG)1) << bit);
  bn_set_minimal_width(a);
  return 1;
}
// bn_is_bit_set_words returns one if bit |bit| of the |num|-word array |a|
// is set and zero otherwise (including when |bit| is out of range).
int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit) {
  const size_t word = bit / BN_BITS2;
  if (word >= num) {
    return 0;
  }
  return (int)((a[word] >> (bit % BN_BITS2)) & 1);
}
// BN_is_bit_set returns one if bit |n| of |a| is set and zero otherwise
// (including when |n| is negative or out of range).
int BN_is_bit_set(const BIGNUM *a, int n) {
  return n < 0 ? 0 : bn_is_bit_set_words(a->d, a->width, (size_t)n);
}
// BN_mask_bits truncates |a| to its lowest |n| bits (|a| mod 2^n, keeping
// the sign flag untouched). Returns one on success and zero if |n| is
// negative. If |a| already fits in |n| bits it is left unchanged.
int BN_mask_bits(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word < a->width) {
    if (bit == 0) {
      // Cut on a word boundary: simply drop the high words.
      a->width = word;
    } else {
      // Keep the low |bit| bits of the boundary word.
      a->d[word] &= ~(BN_MASK2 << bit);
      a->width = word + 1;
    }
    bn_set_minimal_width(a);
  }
  return 1;
}
// bn_count_low_zero_bits_word returns the number of low (trailing) zero
// bits in |l|, computed in constant time by a binary search over halves of
// the word. Note that for |l| == 0 the result is BN_BITS2 - 1, not
// BN_BITS2; the sole caller, |BN_count_low_zero_bits|, masks out the
// zero-word case so this value is never used.
static int bn_count_low_zero_bits_word(BN_ULONG l) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(sizeof(int) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(BN_BITS2 == sizeof(BN_ULONG) * 8,
                        BN_ULONG_has_padding_bits)
  // C has very bizarre rules for types smaller than an int.
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) >= sizeof(int),
                        BN_ULONG_gets_promoted_to_int)
  crypto_word_t mask;
  int bits = 0;
#if BN_BITS2 > 32
  // Check if the lower half of |l| is all zero. Shifting left by
  // BN_BITS2 - 32 discards the upper half, so the result is zero exactly
  // when the low 32 bits are zero.
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 32));
  // If the lower half is all zeros, it is included in the bit count and we
  // count the upper half. Otherwise, we count the lower half.
  bits += 32 & mask;
  l = constant_time_select_w(mask, l >> 32, l);
#endif
  // The remaining blocks are analogous iterations at lower powers of two.
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 16));
  bits += 16 & mask;
  l = constant_time_select_w(mask, l >> 16, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 8));
  bits += 8 & mask;
  l = constant_time_select_w(mask, l >> 8, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 4));
  bits += 4 & mask;
  l = constant_time_select_w(mask, l >> 4, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 2));
  bits += 2 & mask;
  l = constant_time_select_w(mask, l >> 2, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 1));
  bits += 1 & mask;
  return bits;
}
// BN_count_low_zero_bits returns the number of low (trailing) zero bits in
// |bn|, or zero if |bn| is zero. It runs in time dependent only on
// |bn->width| (public), not on the value of |bn|.
int BN_count_low_zero_bits(const BIGNUM *bn) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(sizeof(int) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  int ret = 0;
  crypto_word_t saw_nonzero = 0;
  // Scan every word unconditionally; |first_nonzero| is all-ones only for
  // the lowest non-zero word, so exactly one iteration contributes to |ret|.
  for (int i = 0; i < bn->width; i++) {
    crypto_word_t nonzero = ~constant_time_is_zero_w(bn->d[i]);
    crypto_word_t first_nonzero = ~saw_nonzero & nonzero;
    saw_nonzero |= nonzero;
    int bits = bn_count_low_zero_bits_word(bn->d[i]);
    ret |= first_nonzero & (i * BN_BITS2 + bits);
  }
  // If got to the end of |bn| and saw no non-zero words, |bn| is zero. |ret|
  // will then remain zero.
  return ret;
}

View File

@@ -0,0 +1,450 @@
// Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> and Bodo Moeller for the OpenSSL project.
// Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
  // Compute a square root of |a| mod |p| using the Tonelli/Shanks algorithm
  // (cf. Henri Cohen, "A Course in Algebraic Computational Number Theory",
  // algorithm 1.5.1). |p| must be prime, otherwise an error or
  // an incorrect "result" will be returned.
  //
  // The answer is written to |in| if non-NULL, otherwise a freshly allocated
  // BIGNUM, and returned. NULL is returned on error, including when |a| is
  // not a square mod |p|.
  BIGNUM *ret = in;
  int err = 1;
  int r;
  BIGNUM *A, *b, *q, *t, *x, *y;
  int e, i, j;
  if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
    if (BN_abs_is_word(p, 2)) {
      // p = 2: every residue is its own square root; it equals the low bit
      // of |a|.
      if (ret == NULL) {
        ret = BN_new();
      }
      if (ret == NULL ||
          !BN_set_word(ret, BN_is_bit_set(a, 0))) {
        if (ret != in) {
          BN_free(ret);
        }
        return NULL;
      }
      return ret;
    }
    OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
    return NULL;
  }
  if (BN_is_zero(a) || BN_is_one(a)) {
    // 0 and 1 are their own square roots.
    if (ret == NULL) {
      ret = BN_new();
    }
    if (ret == NULL ||
        !BN_set_word(ret, BN_is_one(a))) {
      if (ret != in) {
        BN_free(ret);
      }
      return NULL;
    }
    return ret;
  }
  BN_CTX_start(ctx);
  A = BN_CTX_get(ctx);
  b = BN_CTX_get(ctx);
  q = BN_CTX_get(ctx);
  t = BN_CTX_get(ctx);
  x = BN_CTX_get(ctx);
  y = BN_CTX_get(ctx);
  if (y == NULL) {
    goto end;
  }
  if (ret == NULL) {
    ret = BN_new();
  }
  if (ret == NULL) {
    goto end;
  }
  // A = a mod p
  if (!BN_nnmod(A, a, p, ctx)) {
    goto end;
  }
  // now write |p| - 1 as 2^e*q where q is odd
  e = 1;
  while (!BN_is_bit_set(p, e)) {
    e++;
  }
  // we'll set q later (if needed)
  if (e == 1) {
    // The easy case: (|p|-1)/2 is odd, so 2 has an inverse
    // modulo (|p|-1)/2, and square roots can be computed
    // directly by modular exponentiation.
    // We have
    //     2 * (|p|+1)/4 == 1   (mod (|p|-1)/2),
    // so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
    if (!BN_rshift(q, p, 2)) {
      goto end;
    }
    q->neg = 0;
    if (!BN_add_word(q, 1) ||
        !BN_mod_exp_mont(ret, A, q, p, ctx, NULL)) {
      goto end;
    }
    err = 0;
    goto vrfy;
  }
  if (e == 2) {
    // |p| == 5  (mod 8)
    //
    // In this case 2 is always a non-square since
    // Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
    // So if a really is a square, then 2*a is a non-square.
    // Thus for
    //      b := (2*a)^((|p|-5)/8),
    //      i := (2*a)*b^2
    // we have
    //     i^2 = (2*a)^((1 + (|p|-5)/4)*2)
    //         = (2*a)^((p-1)/2)
    //         = -1;
    // so if we set
    //      x := a*b*(i-1),
    // then
    //     x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
    //         = a^2 * b^2 * (-2*i)
    //         = a*(-i)*(2*a*b^2)
    //         = a*(-i)*i
    //         = a.
    //
    // (This is due to A.O.L. Atkin,
    // <URL:http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
    // November 1992.)
    // t := 2*a
    if (!bn_mod_lshift1_consttime(t, A, p, ctx)) {
      goto end;
    }
    // b := (2*a)^((|p|-5)/8)
    if (!BN_rshift(q, p, 3)) {
      goto end;
    }
    q->neg = 0;
    if (!BN_mod_exp_mont(b, t, q, p, ctx, NULL)) {
      goto end;
    }
    // y := b^2
    if (!BN_mod_sqr(y, b, p, ctx)) {
      goto end;
    }
    // t := (2*a)*b^2 - 1 (this is |i - 1| in the derivation above)
    if (!BN_mod_mul(t, t, y, p, ctx) ||
        !BN_sub_word(t, 1)) {
      goto end;
    }
    // x = a*b*t
    if (!BN_mod_mul(x, A, b, p, ctx) ||
        !BN_mod_mul(x, x, t, p, ctx)) {
      goto end;
    }
    if (!BN_copy(ret, x)) {
      goto end;
    }
    err = 0;
    goto vrfy;
  }
  // e > 2, so we really have to use the Tonelli/Shanks algorithm.
  // First, find some y that is not a square.
  if (!BN_copy(q, p)) {
    goto end;  // use 'q' as temp
  }
  q->neg = 0;
  i = 2;
  do {
    // For efficiency, try small numbers first;
    // if this fails, try random numbers.
    if (i < 22) {
      if (!BN_set_word(y, i)) {
        goto end;
      }
    } else {
      if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) {
        goto end;
      }
      if (BN_ucmp(y, p) >= 0) {
        if (!(p->neg ? BN_add : BN_sub)(y, y, p)) {
          goto end;
        }
      }
      // now 0 <= y < |p|
      if (BN_is_zero(y)) {
        if (!BN_set_word(y, i)) {
          goto end;
        }
      }
    }
    r = bn_jacobi(y, q, ctx);  // here 'q' is |p|
    if (r < -1) {
      // bn_jacobi signals an error with a value below -1.
      goto end;
    }
    if (r == 0) {
      // Jacobi symbol 0 means gcd(y, p) > 1, so |p| cannot be prime.
      OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
      goto end;
    }
  } while (r == 1 && ++i < 82);
  if (r != -1) {
    // Many rounds and still no non-square -- this is more likely
    // a bug than just bad luck.
    // Even if p is not prime, we should have found some y
    // such that r == -1.
    OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS);
    goto end;
  }
  // Here's our actual 'q':
  if (!BN_rshift(q, q, e)) {
    goto end;
  }
  // Now that we have some non-square, we can find an element
  // of order 2^e by computing its q'th power.
  if (!BN_mod_exp_mont(y, y, q, p, ctx, NULL)) {
    goto end;
  }
  if (BN_is_one(y)) {
    OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
    goto end;
  }
  // Now we know that (if p is indeed prime) there is an integer
  // k, 0 <= k < 2^e, such that
  //
  //      a^q * y^k == 1   (mod p).
  //
  // As a^q is a square and y is not, k must be even.
  // q+1 is even, too, so there is an element
  //
  //     X := a^((q+1)/2) * y^(k/2),
  //
  // and it satisfies
  //
  //     X^2 = a^q * a     * y^k
  //         = a,
  //
  // so it is the square root that we are looking for.
  // t := (q-1)/2  (note that q is odd)
  if (!BN_rshift1(t, q)) {
    goto end;
  }
  // x := a^((q-1)/2)
  if (BN_is_zero(t)) {  // special case: p = 2^e + 1
    if (!BN_nnmod(t, A, p, ctx)) {
      goto end;
    }
    if (BN_is_zero(t)) {
      // special case: a == 0  (mod p)
      BN_zero(ret);
      err = 0;
      goto end;
    } else if (!BN_one(x)) {
      goto end;
    }
  } else {
    if (!BN_mod_exp_mont(x, A, t, p, ctx, NULL)) {
      goto end;
    }
    if (BN_is_zero(x)) {
      // special case: a == 0  (mod p)
      BN_zero(ret);
      err = 0;
      goto end;
    }
  }
  // b := a*x^2  (= a^q)
  if (!BN_mod_sqr(b, x, p, ctx) ||
      !BN_mod_mul(b, b, A, p, ctx)) {
    goto end;
  }
  // x := a*x  (= a^((q+1)/2))
  if (!BN_mod_mul(x, x, A, p, ctx)) {
    goto end;
  }
  while (1) {
    // Now b is a^q * y^k for some even k (0 <= k < 2^E
    // where E refers to the original value of e, which we
    // don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
    //
    // We have  a*b = x^2,
    //    y^2^(e-1) = -1,
    //    b^2^(e-1) = 1.
    if (BN_is_one(b)) {
      // k has been reduced to 0, so x is the root.
      if (!BN_copy(ret, x)) {
        goto end;
      }
      err = 0;
      goto vrfy;
    }
    // Find the smallest i, 0 < i < e, such that b^(2^i) = 1
    for (i = 1; i < e; i++) {
      if (i == 1) {
        if (!BN_mod_sqr(t, b, p, ctx)) {
          goto end;
        }
      } else {
        if (!BN_mod_mul(t, t, t, p, ctx)) {
          goto end;
        }
      }
      if (BN_is_one(t)) {
        break;
      }
    }
    // If not found, a is not a square or p is not a prime.
    if (i >= e) {
      OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
      goto end;
    }
    // t := y^2^(e - i - 1)
    if (!BN_copy(t, y)) {
      goto end;
    }
    for (j = e - i - 1; j > 0; j--) {
      if (!BN_mod_sqr(t, t, p, ctx)) {
        goto end;
      }
    }
    // Update the invariant: y := t^2, x := x*t, b := b*y.
    if (!BN_mod_mul(y, t, t, p, ctx) ||
        !BN_mod_mul(x, x, t, p, ctx) ||
        !BN_mod_mul(b, b, y, p, ctx)) {
      goto end;
    }
    // e decreases each iteration, so this loop will terminate.
    assert(i < e);
    e = i;
  }
vrfy:
  if (!err) {
    // Verify the result. The input might have been not a square.
    if (!BN_mod_sqr(x, ret, p, ctx)) {
      err = 1;
    }
    if (!err && 0 != BN_cmp(x, A)) {
      OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
      err = 1;
    }
  }
end:
  if (err) {
    if (ret != in) {
      BN_clear_free(ret);
    }
    ret = NULL;
  }
  BN_CTX_end(ctx);
  return ret;
}
// BN_sqrt sets |out_sqrt| to the integer square root of |in| and returns
// one, but only if |in| is a perfect square. Otherwise it fails with
// |BN_R_NOT_A_SQUARE| (or |BN_R_NEGATIVE_NUMBER| for negative input).
// |out_sqrt| may alias |in|.
int BN_sqrt(BIGNUM *out_sqrt, const BIGNUM *in, BN_CTX *ctx) {
  BIGNUM *estimate, *tmp, *delta, *last_delta, *tmp2;
  int ok = 0, last_delta_valid = 0;
  if (in->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (BN_is_zero(in)) {
    BN_zero(out_sqrt);
    return 1;
  }
  BN_CTX_start(ctx);
  if (out_sqrt == in) {
    // Aliased output: iterate in a temporary so |in| stays intact, and copy
    // out at the end.
    estimate = BN_CTX_get(ctx);
  } else {
    estimate = out_sqrt;
  }
  tmp = BN_CTX_get(ctx);
  last_delta = BN_CTX_get(ctx);
  delta = BN_CTX_get(ctx);
  if (estimate == NULL || tmp == NULL || last_delta == NULL || delta == NULL) {
    goto err;
  }
  // We estimate that the square root of an n-bit number is 2^{n/2}.
  if (!BN_lshift(estimate, BN_value_one(), BN_num_bits(in)/2)) {
    goto err;
  }
  // This is Newton's method for finding a root of the equation |estimate|^2 -
  // |in| = 0.
  for (;;) {
    // |estimate| = 1/2 * (|estimate| + |in|/|estimate|)
    if (!BN_div(tmp, NULL, in, estimate, ctx) ||
        !BN_add(tmp, tmp, estimate) ||
        !BN_rshift1(estimate, tmp) ||
        // |tmp| = |estimate|^2
        !BN_sqr(tmp, estimate, ctx) ||
        // |delta| = |in| - |tmp|
        !BN_sub(delta, in, tmp)) {
      OPENSSL_PUT_ERROR(BN, ERR_R_BN_LIB);
      goto err;
    }
    delta->neg = 0;
    // The difference between |in| and |estimate| squared is required to always
    // decrease. This ensures that the loop always terminates, but I don't have
    // a proof that it always finds the square root for a given square.
    if (last_delta_valid && BN_cmp(delta, last_delta) >= 0) {
      break;
    }
    last_delta_valid = 1;
    // Swap |last_delta| and |delta| by pointer rather than copying.
    tmp2 = last_delta;
    last_delta = delta;
    delta = tmp2;
  }
  // |tmp| still holds |estimate|^2 from the final iteration; equality with
  // |in| decides whether |in| was a perfect square.
  if (BN_cmp(tmp, in) != 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
    goto err;
  }
  ok = 1;
err:
  if (ok && out_sqrt == in && !BN_copy(out_sqrt, estimate)) {
    ok = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}