chore: checkpoint before Python removal

2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions


@@ -0,0 +1,733 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# January 2007.
#
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; the ABI and instruction syntax are identical.
#
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
#
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
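#
# For cross-checking, a minimal reference model of what bn_mul_mont
# computes: the Montgomery product a*b*R^-1 mod n, with R = 2^(32*num)
# and n0 = -n^-1 mod 2^32. This BigInt sketch is ours (not part of the
# upstream generator) and is never called; it only documents the
# contract of the assembly emitted below.

use Math::BigInt;

sub mont_mul_ref {
    my ($a, $b, $n, $n0, $num) = @_;   # Math::BigInt inputs, a,b < n
    my $mask = Math::BigInt->new(2)->bpow(32)->bsub(1);
    my $t = $a->copy->bmul($b);
    for (1 .. $num) {
        # m = (t mod 2^32) * n0 mod 2^32, then t = (t + m*n) / 2^32
        my $m = ($t->copy->band($mask) * $n0)->band($mask);
        $t->badd($m * $n)->brsft(32);
    }
    $t->bsub($n) if $t->bcmp($n) >= 0; # final conditional subtraction
    return $t;                         # == a*b*R^-1 mod n
}
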
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
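# Stack frame built by bn_mul_mont_nohw below, growing downwards: tp[0..num]
# at sp..sp+4*num (one extra dword), then the ten registers saved by
# stmdb {r4-r12,lr}, then the {r0,r2} pair pushed on entry, then the
# caller's stack arguments &n0 and num; that is how rp/bp/n0/num end up
# at offsets 12*4..15*4 above &tp[num-1].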
$code=<<___;
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global bn_mul_mont_nohw
.type bn_mul_mont_nohw,%function
.align 5
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global bn_mul8x_mont_neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large


@@ -0,0 +1,574 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
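# A BigInt reference model of the routine above (our illustrative sketch,
# not used by the generator): r[] += a[]*w word by word, returning the
# final carry word; this is the contract shared by the SSE2 and integer paths.
use Math::BigInt;

sub bn_mul_add_words_ref {
    my ($rp, $ap, $w) = @_;            # array refs of 32-bit BigInt words
    my $base = Math::BigInt->new(2)->bpow(32);
    my $c = Math::BigInt->bzero;
    for my $i (0 .. $#$ap) {
        my $t = $rp->[$i] + $ap->[$i] * $w + $c;
        $rp->[$i] = $t->copy->bmod($base);  # low word back into r[i]
        $c = $t->brsft(32);                 # high part becomes the carry
    }
    return $c;
}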
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
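# bn_div_words above is just the hardware 64/32 divide: it returns
# floor(((h << 32) | l) / d). A BigInt model of ours, illustrative only;
# like the div instruction it assumes h < d so the quotient fits 32 bits.
use Math::BigInt;

sub bn_div_words_ref {
    my ($h, $l, $d) = @_;              # word-sized inputs, $d != 0
    return (Math::BigInt->new($h)->blsft(32) + $l)->bdiv($d);
}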
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}


@@ -0,0 +1,111 @@
#!/usr/bin/env perl
# Copyright (c) 2023, Google Inc.
# SPDX-License-Identifier: Apache-2.0
use strict;
my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;
my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");
my $code = <<____;
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Ladd_tail
.Ladd_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
adcs $a0, $a0, $b0
adcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Ladd_loop
.Ladd_tail:
cbz $num, .Ladd_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
adcs $a0, $a0, $b0
str $a0, [$rp], #8
.Ladd_exit:
cset x0, cs
ret
.cfi_endproc
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Lsub_tail
.Lsub_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
sbcs $a0, $a0, $b0
sbcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Lsub_loop
.Lsub_tail:
cbz $num, .Lsub_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
sbcs $a0, $a0, $b0
str $a0, [$rp], #8
.Lsub_exit:
cset x0, cc
ret
.cfi_endproc
.size bn_sub_words,.-bn_sub_words
____
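# A BigInt reference model of the two routines above (our illustrative
# sketch, never emitted or called). Note why the assembly can loop at all:
# ADCS/SBCS thread the carry through every limb, and SUB and CBNZ were
# chosen precisely because they do not write the flags.
use Math::BigInt;

sub bn_add_words_ref {
    my ($rp, $ap, $bp) = @_;           # array refs of 64-bit BigInt limbs
    my $base = Math::BigInt->new(2)->bpow(64);
    my $c = Math::BigInt->bzero;
    for my $i (0 .. $#$ap) {
        my $t = $ap->[$i] + $bp->[$i] + $c;
        $rp->[$i] = $t->copy->bmod($base);
        $c = $t->brsft(64);            # carry out of this limb
    }
    return $c;                         # matches "cset x0, cs"
}
# bn_sub_words is the same loop with subtraction; it returns 1 exactly
# when the final subtraction borrowed ("cset x0, cc" above).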
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return-value pointer
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return-value pointer
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
# words, and 1 to load the return-value pointer
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
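# The loops above walk the partial products column by column (comba
# order): every a[j]*b[i] with i+j == k is accumulated into the rotating
# (c0,c1,c2) triple before word k of the result is stored. A BigInt
# model of that order follows; it is our illustrative sketch only.
use Math::BigInt;

sub bn_mul_comba_ref {
    my ($ap, $bp) = @_;                # array refs of 32-bit words, same length
    my $base = Math::BigInt->new(2)->bpow(32);
    my $acc  = Math::BigInt->bzero;    # stands in for the c0/c1/c2 triple
    my @r;
    for my $k (0 .. 2*$#$ap) {         # one output word per column
        for my $j (0 .. $#$ap) {
            my $i = $k - $j;
            next if $i < 0 || $i > $#$bp;
            $acc += $ap->[$j] * $bp->[$i];
        }
        push @r, $acc->copy->bmod($base);  # emit word k
        $acc->brsft(32);                   # carry rolls into column k+1
    }
    return (@r, $acc);                 # 2*num words in total
}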
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}


@@ -0,0 +1,698 @@
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov.
# Special thanks to Ilya Albrekht for his valuable hints.
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# | | OpenSSL 3.0.0-alpha9 | this | Unit |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659 | 1 015 625 | cycles/sign |
# | | 611 | 1280 / +109% | sign/s |
# |---------+----------------------+--------------+-------------|
#
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23)));
# Registers mapping for normalization.
my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26)));
sub amm52x20_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $zero, $_R2
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}
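# Taken together over all 20 digits of |b|, the steps emitted above
# compute an almost-Montgomery multiplication in radix 2^52. A BigInt
# model (our illustrative sketch, never called):
use Math::BigInt;

sub amm52_ref {
    my ($a, $b, $m, $k0, $digits) = @_;    # BigInts; $digits is 20 here
    my $mask52 = Math::BigInt->new(2)->bpow(52)->bsub(1);
    my $t = Math::BigInt->bzero;
    for my $i (0 .. $digits-1) {
        my $bi = $b->copy->brsft(52*$i)->band($mask52);          # digit b[i]
        $t->badd($a * $bi);                                      # t += a*b[i]
        my $y = ($t->copy->band($mask52) * $k0)->band($mask52);  # y = t*k0 mod 2^52
        $t->badd($m * $y);                 # low digit of t becomes zero
        $t->brsft(52);                     # exact division by 2^52
    }
    # Result is a*b*2^(-52*$digits) mod m, possibly plus one extra
    # multiple of m: "almost" Montgomery skips the final subtraction.
    return $t;
}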
# Normalization routine: propagates carry bits and brings the bignum
# qwords back to the normalized 2^52 representation.
#
# Uses %r8-14,%e[bcd]x
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of R0..R2
# Save them to LSB of QWs in T0..T2
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
# "Shift left" T0..T2 by 1 QW
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..R2 QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
# Sum R0..R2 with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
# Now handle carry bits from this addition
# Get mask of QWs which 52-bit parts overflow...
vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt)
vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r14d # k1
kmovb %k2, %r13d # k1h
kmovb %k3, %r12d # k2
kmovb %k4, %r11d # k2h
kmovb %k5, %r10d # k3
# ...or saturated
vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq
vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r9d # k4
kmovb %k2, %r8d # k4h
kmovb %k3, %ebx # k5
kmovb %k4, %ecx # k5h
kmovb %k5, %edx # k6
# Get mask of QWs where carries shall be propagated to.
# Merge 4-bit masks to 8-bit values to use add with carry.
shl \$4, %r13b
or %r13b, %r14b
shl \$4, %r11b
or %r11b, %r12b
add %r14b, %r14b
adc %r12b, %r12b
adc %r10b, %r10b
shl \$4, %r8b
or %r8b,%r9b
shl \$4, %cl
or %cl, %bl
add %r9b, %r14b
adc %bl, %r12b
adc %dl, %r10b
xor %r9b, %r14b
xor %bl, %r12b
xor %dl, %r10b
kmovb %r14d, %k1
shr \$4, %r14b
kmovb %r14d, %k2
kmovb %r12d, %k3
shr \$4, %r12b
kmovb %r12d, %k4
kmovb %r10d, %k5
# Add carries according to the obtained mask
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
___
}
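# The vector code above propagates every carry in parallel, using compare
# masks plus an 8-bit add-with-carry trick to chain them across lanes.
# Sequentially, normalization is just this (our illustrative sketch):
use Math::BigInt;

sub norm52_ref {
    my @d = @_;                            # digits, possibly >= 2^52 each
    my $mask52 = Math::BigInt->new(2)->bpow(52)->bsub(1);
    my $c = Math::BigInt->bzero;
    for my $i (0 .. $#d) {
        my $t = $d[$i] + $c;
        $d[$i] = $t->copy->band($mask52);  # keep the low 52 bits
        $c = $t->brsft(52);                # push the rest into the next digit
    }
    return @d;                             # normalized radix-2^52 digits
}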
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x20_x1_ifma256
.type rsaz_amm52x20_x1_ifma256,\@function,5
.align 32
rsaz_amm52x20_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x1_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 20 digits unrolled by 4
mov \$5, $iter
.align 32
.Lloop5:
___
foreach my $idx (0..3) {
&amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop5
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
# const BN_ULONG m[2][20],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x20_x2_ifma256
.type rsaz_amm52x20_x2_ifma256,\@function,5
.align 32
rsaz_amm52x20_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x2_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$20, $iter
.align 32
.Lloop20:
___
&amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
# 20*8 = offset of the next dimension in the two-dimensional array
&amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop20
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
&amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R0_1, `5*32`($res)
vmovdqu64 $R0_1h, `6*32`($res)
vmovdqu64 $R1_1, `7*32`($res)
vmovdqu64 $R1_1h, `8*32`($res)
vmovdqu64 $R2_1, `9*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x2_ifma256, .-rsaz_amm52x20_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x20_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x20_win5
.type extract_multiplier_2x20_win5,\@abi-omnipotent
extract_multiplier_2x20_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
foreach (0..9) {
my $mask = $_<5?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*20*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x20_win5, .-extract_multiplier_2x20_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
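# The scan above is constant time: all 32 table entries are read, and the
# wanted pair is kept via a compare mask (vpcmpq) and a masked blend
# (vpblendmq) rather than an indexed load. A scalar model of the selection
# (our illustrative sketch; the ternary here stands in for the mask the
# assembly computes without branching):
sub extract_ref {
    my ($table, $idx) = @_;        # $table: array ref; $idx: secret index
    my $out = 0;
    for my $i (0 .. $#$table) {
        my $mask = ($i == $idx) ? ~0 : 0;  # all-ones only for the wanted slot
        $out |= $table->[$i] & $mask;      # every entry is touched
    }
    return $out;
}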
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_def_handler,\@abi-omnipotent
.align 16
rsaz_def_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_def_handler,.-rsaz_def_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x20_x1_ifma256
.rva .LSEH_end_rsaz_amm52x20_x1_ifma256
.rva .LSEH_info_rsaz_amm52x20_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x20_x2_ifma256
.rva .LSEH_end_rsaz_amm52x20_x2_ifma256
.rva .LSEH_info_rsaz_amm52x20_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x20_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x1_ifma256_body,.Lrsaz_amm52x20_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x20_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x2_ifma256_body,.Lrsaz_amm52x20_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x20_x1_ifma256
.globl rsaz_amm52x20_x2_ifma256
.globl extract_multiplier_2x20_win5
.type rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
rsaz_amm52x20_x1_ifma256:
rsaz_amm52x20_x2_ifma256:
extract_multiplier_2x20_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,854 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637 | 2 866 593 | cycles/sign |
# | | 203.2 | 453.5 / +123% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple clang uses a different version numbering scheme; see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
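# In scalar terms, each call performs one digit step of almost
# Montgomery multiplication in radix 2^52 (a sketch, using t for the
# full 64-bit products):
#   t   = a[0]*b[i];  acc += lo64(t);  hi  = hi64(t) + CF
#   yi  = (acc * k0) & (2^52 - 1)
#   t   = yi * m[0];  acc += lo64(t);  hi += hi64(t) + CF
#   acc = (acc >> 52) | (hi << 12)   # i.e. (acc + hi*2^64) >> 52
# while the vpmadd52{l,h}uq blocks below do the same for the remaining
# digits four at a time.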
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $zero, $_R3h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}
# Normalization routine: handles carry bits and brings bignum qwords into
# normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
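# In scalar terms the first stage computes, per digit (a sketch):
#   carry[j] = R[j] >> 52
#   R[j]     = (R[j] & mask52) + carry[j-1]    # carry[-1] = 0
# The sum can itself reach 2^52 again; the mask arithmetic further
# below propagates those remaining +1 carries branchlessly.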
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
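# r14b..r11b now hold, per 4-digit group, the mask of digits that must
# be incremented: ((overflow_mask << 1) + saturated_mask) ^ saturated_mask
# ripples a +1 through runs of saturated (all-ones) digits, a branchless
# analogue of scalar carry propagation. The increment itself is done by
# subtracting mask52 and re-masking to 52 bits (x - (2^52-1) = x + 1 mod 2^52).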
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
shr \$4,%r11b
kmovb %r11d,%k1
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x30_x1_ifma256
.type rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
rsaz_amm52x30_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 30 digits: 7 iterations unrolled by 4, plus 2 peeled iterations below
mov \$7, $iter
.align 32
.Lloop7:
___
foreach my $idx (0..3) {
&amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop7
___
&amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
&amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
# const BN_ULONG a[2][32],
# const BN_ULONG b[2][32],
# const BN_ULONG m[2][32],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x30_x2_ifma256
.type rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
rsaz_amm52x30_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$30, $iter
.align 32
.Lloop30:
___
&amm52x30_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
# 32*8 = offset of the next dimension in the two-dimensional array
&amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop30
___
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
&amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R0_1, `8*32`($res)
vmovdqu64 $R0_1h, `9*32`($res)
vmovdqu64 $R1_1, `10*32`($res)
vmovdqu64 $R1_1h, `11*32`($res)
vmovdqu64 $R2_1, `12*32`($res)
vmovdqu64 $R2_1h, `13*32`($res)
vmovdqu64 $R3_1, `14*32`($res)
vmovdqu64 $R3_1h, `15*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x2_ifma256, .-rsaz_amm52x30_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x30_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
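# Constant-time lookup: rather than index red_table directly, scan all
# 2^5 entries and blend in the rows whose index matches. A scalar sketch:
#   for (i = 0; i < (1 << 5); i++)
#     if (i == idx) Y = red_table[i];  # done via masked vpblendmq, no branch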
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x30_win5
.type extract_multiplier_2x30_win5,\@abi-omnipotent
extract_multiplier_2x30_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*32*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
foreach (1..15) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
foreach (0..15) {
my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*32*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..15) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x30_win5, .-extract_multiplier_2x30_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x30_x1_ifma256
.rva .LSEH_end_rsaz_amm52x30_x1_ifma256
.rva .LSEH_info_rsaz_amm52x30_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x30_x2_ifma256
.rva .LSEH_end_rsaz_amm52x30_x2_ifma256
.rva .LSEH_info_rsaz_amm52x30_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x30_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x1_ifma256_body,.Lrsaz_amm52x30_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x30_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x2_ifma256_body,.Lrsaz_amm52x30_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x30_x1_ifma256
.globl rsaz_amm52x30_x2_ifma256
.globl extract_multiplier_2x30_win5
.type rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
rsaz_amm52x30_x1_ifma256:
rsaz_amm52x30_x2_ifma256:
extract_multiplier_2x30_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,915 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa4096 | 14 301 430 | 5 813 953 | cycles/sign |
# | | 90.9 | 223.6 / +146% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple clang uses a different version numbering scheme; see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29)));
sub amm52x40_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $_R4, $_R3h
valignq \$1, $_R4, $_R4h, $_R4
valignq \$1, $_R4h, $zero, $_R4h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h
___
}
# Normalization routine: handles carry bits and brings bignum qwords into
# normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
sub amm52x40_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
vpsrlq \$52, $_R4, $T4
vpsrlq \$52, $_R4h, $T4h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T4, $T4h, $T4h
valignq \$3, $T3h, $T4, $T4
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
vpaddq $T4, $_R4, $_R4
vpaddq $T4h, $_R4h, $_R4h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
vpcmpuq \$6,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%r10d
kmovb %k2,%r9d
shl \$4,%r9b
or %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
vpcmpuq \$0,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%ebx
kmovb %k2,%eax
shl \$4,%al
or %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
xor %bl,%r10b
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
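# All seven opmask registers were consumed by the first batch of
# vpsubq above, so the masks for R3h/R4/R4h are materialized here by
# reusing k1..k3.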
shr \$4,%r11b
kmovb %r11d,%k1
kmovb %r10d,%k2
shr \$4,%r10b
kmovb %r10d,%k3
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpsubq .Lmask52x4(%rip), $_R4, ${_R4}{%k2}
vpsubq .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x40_x1_ifma256
.type rsaz_amm52x40_x1_ifma256,\@function,5
.align 32
rsaz_amm52x40_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x1_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 40 digits unrolled by 4
mov \$10, $iter
.align 32
.Lloop10:
___
foreach my $idx (0..3) {
&amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop10
___
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
# const BN_ULONG a[2][40],
# const BN_ULONG b[2][40],
# const BN_ULONG m[2][40],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x40_x2_ifma256
.type rsaz_amm52x40_x2_ifma256,\@function,5
.align 32
rsaz_amm52x40_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x2_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
vmovdqa64 $zero, $R4_1
vmovdqa64 $zero, $R4_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$40, $iter
.align 32
.Lloop40:
___
&amm52x40_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)");
# 40*8 = offset of the next dimension in the two-dimensional array
&amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop40
___
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
&amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vmovdqu64 $R0_1, `10*32`($res)
vmovdqu64 $R0_1h, `11*32`($res)
vmovdqu64 $R1_1, `12*32`($res)
vmovdqu64 $R1_1h, `13*32`($res)
vmovdqu64 $R2_1, `14*32`($res)
vmovdqu64 $R2_1h, `15*32`($res)
vmovdqu64 $R3_1, `16*32`($res)
vmovdqu64 $R3_1h, `17*32`($res)
vmovdqu64 $R4_1, `18*32`($res)
vmovdqu64 $R4_1h, `19*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x2_ifma256, .-rsaz_amm52x40_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x40_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
sub get_table_value_consttime() {
my ($_idx,$_offset) = @_;
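# Scans the entire table once, blending matching rows into t0..t9; it is
# invoked twice below (once per index), with $red_tbl restored from %r10
# in between so both passes walk the same 2^5 entries in constant time.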
$code.=<<___;
vpxorq $cur_idx, $cur_idx, $cur_idx
.align 32
.Lloop_$_offset:
vpcmpq \$0, $cur_idx, $_idx, %k1 # mask of (idx == cur_idx)
___
foreach (0..9) {
$code.=<<___;
vmovdqu64 `$_offset+${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{%k1} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*40*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop_$_offset
___
}
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x40_win5
.type extract_multiplier_2x40_win5,\@abi-omnipotent
extract_multiplier_2x40_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*40*8`($red_tbl), %rax # holds end of the tbl
# backup red_tbl address
movq $red_tbl, %r10
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
___
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
&get_table_value_consttime($idx1, 0);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(0+$_)*32`($out) \n";
}
$code.="movq %r10, $red_tbl \n";
&get_table_value_consttime($idx2, 40*8);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(10+$_)*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x40_win5, .-extract_multiplier_2x40_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x40_x1_ifma256
.rva .LSEH_end_rsaz_amm52x40_x1_ifma256
.rva .LSEH_info_rsaz_amm52x40_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x40_x2_ifma256
.rva .LSEH_end_rsaz_amm52x40_x2_ifma256
.rva .LSEH_info_rsaz_amm52x40_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x40_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x1_ifma256_body,.Lrsaz_amm52x40_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x40_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x2_ifma256_body,.Lrsaz_amm52x40_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x40_x1_ifma256
.globl rsaz_amm52x40_x2_ifma256
.globl extract_multiplier_2x40_win5
.type rsaz_amm52x40_x1_ifma256,\@abi-omnipotent
rsaz_amm52x40_x1_ifma256:
rsaz_amm52x40_x2_ifma256:
extract_multiplier_2x40_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large

View File

@@ -0,0 +1,628 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging a 2K window between the stack
# pointer and the ap argument [np is also a position-sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~the same
# time].
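# i.e. after the arithmetic below the new stack pointer is congruent
# to the ap argument modulo 2048 but differs from it in bit 11, so the
# two land in opposite 2K halves of any 4K window.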
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on the stack being "wired" to
# physical memory in a strictly sequential manner, i.e. if a stack
# allocation spans two pages, then a reference to the farthest one can
# be punished by SEGV. But page walking does good even on
# other OSes, because it guarantees that a villain thread hits
# the guard page before it can do damage to an innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
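# Dedicated squaring: cross products ap[i]*ap[j], i<j, are doubled on
# the fly via lea(,%eax,2), with $sbit carrying bit 31 of the previous
# doubled word into the next one.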
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,534 @@
/* x86_64 BIGNUM accelerator version 0.1, December 2002.
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL project.
* SPDX-License-Identifier: Apache-2.0
*
* Q. Version 0.1? It doesn't sound like Andy; he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features its own ABI, which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference:
* Win64 implements a different ABI for AMD64 than Linux does.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For reference: the IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
#include <openssl/bn.h>
// TODO(davidben): Get this file working on MSVC x64.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
(defined(__GNUC__) || defined(__clang__))
#include "../internal.h"
#undef mul
#undef mul_add
// "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
#define mul_add(r, a, word, carry) \
do { \
register BN_ULONG high, low; \
__asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
__asm__("addq %2,%0; adcq $0,%1" \
: "+r"(carry), "+d"(high) \
: "a"(low) \
: "cc"); \
__asm__("addq %2,%0; adcq $0,%1" \
: "+m"(r), "+d"(high) \
: "r"(carry) \
: "cc"); \
(carry) = high; \
} while (0)
#define mul(r, a, word, carry) \
do { \
register BN_ULONG high, low; \
__asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
__asm__("addq %2,%0; adcq $0,%1" \
: "+r"(carry), "+d"(high) \
: "a"(low) \
: "cc"); \
(r) = (carry); \
(carry) = high; \
} while (0)
#undef sqr
#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
BN_ULONG w) {
BN_ULONG c1 = 0;
if (num == 0) {
return c1;
}
while (num & ~3) {
mul_add(rp[0], ap[0], w, c1);
mul_add(rp[1], ap[1], w, c1);
mul_add(rp[2], ap[2], w, c1);
mul_add(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul_add(rp[0], ap[0], w, c1);
if (--num == 0) {
return c1;
}
mul_add(rp[1], ap[1], w, c1);
if (--num == 0) {
return c1;
}
mul_add(rp[2], ap[2], w, c1);
return c1;
}
return c1;
}
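// Example (illustrative): with ap = {2, 3}, num = 2 and w = 5, the loop
// above adds {10, 15} into rp word by word and returns the carry out of
// rp[1] (zero here).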
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
BN_ULONG w) {
BN_ULONG c1 = 0;
if (num == 0) {
return c1;
}
while (num & ~3) {
mul(rp[0], ap[0], w, c1);
mul(rp[1], ap[1], w, c1);
mul(rp[2], ap[2], w, c1);
mul(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul(rp[0], ap[0], w, c1);
if (--num == 0) {
return c1;
}
mul(rp[1], ap[1], w, c1);
if (--num == 0) {
return c1;
}
mul(rp[2], ap[2], w, c1);
}
return c1;
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
if (n == 0) {
return;
}
while (n & ~3) {
sqr(r[0], r[1], a[0]);
sqr(r[2], r[3], a[1]);
sqr(r[4], r[5], a[2]);
sqr(r[6], r[7], a[3]);
a += 4;
r += 8;
n -= 4;
}
if (n) {
sqr(r[0], r[1], a[0]);
if (--n == 0) {
return;
}
sqr(r[2], r[3], a[1]);
if (--n == 0) {
return;
}
sqr(r[4], r[5], a[2]);
}
}
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t n) {
BN_ULONG ret;
size_t i = 0;
if (n == 0) {
return 0;
}
__asm__ volatile (
" subq %0,%0 \n" // clear carry
" jmp 1f \n"
".p2align 4 \n"
"1:"
" movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" dec %1 \n"
" jnz 1b \n"
" sbbq %0,%0 \n"
: "=&r"(ret), "+&c"(n), "+&r"(i)
: "r"(rp), "r"(ap), "r"(bp)
: "cc", "memory");
return ret & 1;
}
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t n) {
BN_ULONG ret;
size_t i = 0;
if (n == 0) {
return 0;
}
__asm__ volatile (
" subq %0,%0 \n" // clear borrow
" jmp 1f \n"
".p2align 4 \n"
"1:"
" movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" dec %1 \n"
" jnz 1b \n"
" sbbq %0,%0 \n"
: "=&r"(ret), "+&c"(n), "+&r"(i)
: "r"(rp), "r"(ap), "r"(bp)
: "cc", "memory");
return ret & 1;
}
// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0)
// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
// Keep in mind that carrying into the high part of a multiplication result
// cannot overflow, because the high word of a product is never all-ones.
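// Illustrative only: a portable sketch of the same three-word
// accumulation using the compiler's 128-bit type, which GCC and clang
// provide on x86_64. The helper name mul_add_c_ref is ours and nothing
// below uses it; it is here purely to document the carry chain.
__attribute__((unused)) static inline void mul_add_c_ref(
    BN_ULONG a, BN_ULONG b, BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2) {
  unsigned __int128 t = (unsigned __int128)a * b;
  BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
  *c0 += lo;          // add the low word
  hi += (*c0 < lo);   // carry into hi; cannot overflow, as noted above
  *c1 += hi;          // add the high word
  *c2 += (*c1 < hi);  // propagate any carry into the top word
}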
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG t1, t2; \
__asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
__asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
: "+&r"(c0), "+r"(c1), "+r"(c2) \
: "r"(t1), "r"(t2) \
: "cc"); \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG t1, t2; \
__asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
__asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
: "+&r"(c0), "+r"(c1), "+r"(c2) \
: "r"(t1), "r"(t2) \
: "cc"); \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG t1, t2; \
__asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
__asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
: "+&r"(c0), "+r"(c1), "+r"(c2) \
: "r"(t1), "r"(t2) \
: "cc"); \
__asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
: "+&r"(c0), "+r"(c1), "+r"(c2) \
: "r"(t1), "r"(t2) \
: "cc"); \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
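// Because sqr_add_c2 doubles the cross term a[i]*a[j], the comba squaring
// routines below get away with roughly half the multiplications of a
// general comba multiplication of the same size.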
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[4], b[0], c2, c3, c1);
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
mul_add_c(a[0], b[4], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[0], b[5], c3, c1, c2);
mul_add_c(a[1], b[4], c3, c1, c2);
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
mul_add_c(a[4], b[1], c3, c1, c2);
mul_add_c(a[5], b[0], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[6], b[0], c1, c2, c3);
mul_add_c(a[5], b[1], c1, c2, c3);
mul_add_c(a[4], b[2], c1, c2, c3);
mul_add_c(a[3], b[3], c1, c2, c3);
mul_add_c(a[2], b[4], c1, c2, c3);
mul_add_c(a[1], b[5], c1, c2, c3);
mul_add_c(a[0], b[6], c1, c2, c3);
r[6] = c1;
c1 = 0;
mul_add_c(a[0], b[7], c2, c3, c1);
mul_add_c(a[1], b[6], c2, c3, c1);
mul_add_c(a[2], b[5], c2, c3, c1);
mul_add_c(a[3], b[4], c2, c3, c1);
mul_add_c(a[4], b[3], c2, c3, c1);
mul_add_c(a[5], b[2], c2, c3, c1);
mul_add_c(a[6], b[1], c2, c3, c1);
mul_add_c(a[7], b[0], c2, c3, c1);
r[7] = c2;
c2 = 0;
mul_add_c(a[7], b[1], c3, c1, c2);
mul_add_c(a[6], b[2], c3, c1, c2);
mul_add_c(a[5], b[3], c3, c1, c2);
mul_add_c(a[4], b[4], c3, c1, c2);
mul_add_c(a[3], b[5], c3, c1, c2);
mul_add_c(a[2], b[6], c3, c1, c2);
mul_add_c(a[1], b[7], c3, c1, c2);
r[8] = c3;
c3 = 0;
mul_add_c(a[2], b[7], c1, c2, c3);
mul_add_c(a[3], b[6], c1, c2, c3);
mul_add_c(a[4], b[5], c1, c2, c3);
mul_add_c(a[5], b[4], c1, c2, c3);
mul_add_c(a[6], b[3], c1, c2, c3);
mul_add_c(a[7], b[2], c1, c2, c3);
r[9] = c1;
c1 = 0;
mul_add_c(a[7], b[3], c2, c3, c1);
mul_add_c(a[6], b[4], c2, c3, c1);
mul_add_c(a[5], b[5], c2, c3, c1);
mul_add_c(a[4], b[6], c2, c3, c1);
mul_add_c(a[3], b[7], c2, c3, c1);
r[10] = c2;
c2 = 0;
mul_add_c(a[4], b[7], c3, c1, c2);
mul_add_c(a[5], b[6], c3, c1, c2);
mul_add_c(a[6], b[5], c3, c1, c2);
mul_add_c(a[7], b[4], c3, c1, c2);
r[11] = c3;
c3 = 0;
mul_add_c(a[7], b[5], c1, c2, c3);
mul_add_c(a[6], b[6], c1, c2, c3);
mul_add_c(a[5], b[7], c1, c2, c3);
r[12] = c1;
c1 = 0;
mul_add_c(a[6], b[7], c2, c3, c1);
mul_add_c(a[7], b[6], c2, c3, c1);
r[13] = c2;
c2 = 0;
mul_add_c(a[7], b[7], c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[3], b[3], c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
sqr_add_c2(a, 4, 0, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 5, 0, c3, c1, c2);
sqr_add_c2(a, 4, 1, c3, c1, c2);
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
sqr_add_c2(a, 4, 2, c1, c2, c3);
sqr_add_c2(a, 5, 1, c1, c2, c3);
sqr_add_c2(a, 6, 0, c1, c2, c3);
r[6] = c1;
c1 = 0;
sqr_add_c2(a, 7, 0, c2, c3, c1);
sqr_add_c2(a, 6, 1, c2, c3, c1);
sqr_add_c2(a, 5, 2, c2, c3, c1);
sqr_add_c2(a, 4, 3, c2, c3, c1);
r[7] = c2;
c2 = 0;
sqr_add_c(a, 4, c3, c1, c2);
sqr_add_c2(a, 5, 3, c3, c1, c2);
sqr_add_c2(a, 6, 2, c3, c1, c2);
sqr_add_c2(a, 7, 1, c3, c1, c2);
r[8] = c3;
c3 = 0;
sqr_add_c2(a, 7, 2, c1, c2, c3);
sqr_add_c2(a, 6, 3, c1, c2, c3);
sqr_add_c2(a, 5, 4, c1, c2, c3);
r[9] = c1;
c1 = 0;
sqr_add_c(a, 5, c2, c3, c1);
sqr_add_c2(a, 6, 4, c2, c3, c1);
sqr_add_c2(a, 7, 3, c2, c3, c1);
r[10] = c2;
c2 = 0;
sqr_add_c2(a, 7, 4, c3, c1, c2);
sqr_add_c2(a, 6, 5, c3, c1, c2);
r[11] = c3;
c3 = 0;
sqr_add_c(a, 6, c1, c2, c3);
sqr_add_c2(a, 7, 5, c1, c2, c3);
r[12] = c1;
c1 = 0;
sqr_add_c2(a, 7, 6, c2, c3, c1);
r[13] = c2;
c2 = 0;
sqr_add_c(a, 7, c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2
#endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__)

File diff suppressed because it is too large

File diff suppressed because it is too large