chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "528B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$code=<<___;
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch armv7-a
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
___
{
# NEON register allocation: accumulator/input live in q0-q3, temporaries in
# q8-q11 (note: the list 8..12 supplies one more name than is consumed),
# and the twisted-H halves plus bit-mask constants in d26-d31.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
# clmul64x64 appends to $code a 64x64->128-bit carry-less multiplication
# $r = $a * $b built from 8-bit polynomial multiplies (vmull.p8), following
# the Câmara-Gouvêa-López-Dahab paper cited in the header. $a and $b are
# 64-bit d-registers; $r is a 128-bit q-register. $t0-$t3 are clobbered.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
# Emit gcm_init_neon (computes "twisted H"), gcm_gmult_neon (single-block
# multiply) and the header of gcm_ghash_neon, which share the .Lgmult_neon
# tail emitted further below.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
# 128x128-bit multiply via Karatsuba: three 64x64 carry-less multiplies.
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
# Karatsuba recombination followed by the two-phase GF(2^128) reduction
# (same scheme as reduction_avx in ghash-x86_64.pl), loop control, and the
# write-back of Xi.
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Post-process the accumulated perlasm line by line. The substitutions are
# `or`-chained, so each line receives at most one of the rewrites below:
# q-register half references (qN#lo/qN#hi) become d-registers; failing that,
# "ret" becomes "bx lr"; failing that, a pre-existing literal "bx lr" is
# emitted as its raw encoding so the file assembles with -march=armv4.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,290 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
# NEON, the low and high halves of the 128-bit register q0 are accessible as
# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
# vN. Where the 32-bit version would use the upper half, this file must keep
# halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
# vshl.i64 q0, q0, #1
#
# in 64-bit, it would be written:
#
# shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
# polynomial and is conditioned on the PMULL extension. This file emulates the
# latter with the former.
use strict;
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
my $flavour = shift;
my $output = shift;
# For a real flavour, pipe our output through arm-xlate.pl so the perlasm
# below is translated into flavour-specific assembly; a "void"/empty flavour
# writes the untranslated perlasm directly to $output.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# AArch64 argument registers x0-x3 and the SIMD register assignments used
# throughout the generated code.
my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));
my $code = "";
# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
# Arguments are SIMD register names (strings); the code is appended to $code.
# The "//" comments inside the heredoc are emitted into the assembly output.
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
ext $t0.8b, $a.8b, $a.8b, #1 // A1
pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
ext $r.8b, $b.8b, $b.8b, #1 // B1
pmull $r.8h, $a.8b, $r.8b // E = A*B1
ext $t1.8b, $a.8b, $a.8b, #2 // A2
pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
ext $t3.8b, $b.8b, $b.8b, #2 // B2
pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
ext $t2.8b, $a.8b, $a.8b, #3 // A3
eor $t0.16b, $t0.16b, $r.16b // L = E + F
pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
ext $r.8b, $b.8b, $b.8b, #3 // B3
eor $t1.16b, $t1.16b, $t3.16b // M = G + H
pmull $r.8h, $a.8b, $r.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
// vand \$t0#hi, \$t0#hi, \$k48
// veor \$t0#lo, \$t0#lo, \$t0#hi
//
// veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
// vand \$t1#hi, \$t1#hi, \$k32
// veor \$t1#lo, \$t1#lo, \$t1#hi
//
// veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
// vand \$t2#hi, \$t2#hi, \$k16
// veor \$t2#lo, \$t2#lo, \$t2#hi
//
// veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 \$t3#hi, #0
//
// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext $t3.8b, $b.8b, $b.8b, #4 // B4
eor $t2.16b, $t2.16b, $r.16b // N = I + J
pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
pmull $r.8h, $a.8b, $b.8b // D = A*B
ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
eor $t0.16b, $t0.16b, $t1.16b
eor $t2.16b, $t2.16b, $t3.16b
eor $r.16b, $r.16b, $t0.16b
eor $r.16b, $r.16b, $t2.16b
___
}
# Emit gcm_init_neon (computes "twisted H"), gcm_gmult_neon (single-block
# multiply), and the header of gcm_ghash_neon, whose shared multiply tail
# (.Lgmult_neon) is emitted further below.
$code .= <<___;
#include <openssl/arm_arch.h>
.text
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {$t1.2d}, [x1] // load H
movi $t3.16b, #0xe1
shl $t3.2d, $t3.2d, #57 // 0xc2.0
ext $INlo.16b, $t1.16b, $t1.16b, #8
ushr $t2.2d, $t3.2d, #63
dup $t1.4s, $t1.s[1]
ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
ushr $t2.2d, $INlo.2d, #63
sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
and $t2.16b, $t2.16b, $t0.16b
shl $INlo.2d, $INlo.2d, #1
ext $t2.16b, $t2.16b, $t2.16b, #8
and $t0.16b, $t0.16b, $t1.16b
orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
st1 {$Hlo.2d}, [x0] // store Htable[0]
ret
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$INlo.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $INlo.16b, $INlo.16b // byteswap Xi
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
mov $len, #16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$Xl.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $Xl.16b, $Xl.16b // byteswap Xi
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
.Loop_neon:
ld1 {$INlo.16b}, [$inp], #16 // load inp
rev64 $INlo.16b, $INlo.16b // byteswap inp
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
.Lgmult_neon:
// Split the input into $INlo and $INhi. (The upper halves are unused,
// so it is okay to leave them alone.)
ins $INhi.d[0], $INlo.d[1]
___
# 128x128-bit multiply via Karatsuba: three emulated 64x64 carry-less
# multiplies over the low, combined, and high halves.
&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo
$code .= <<___;
eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
___
&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi
# Karatsuba recombination, the two-phase GF(2^128) reduction (equivalent of
# reduction_avx from ghash-x86_64.pl), loop control, write-back of Xi, and
# the constant pool (.Lmasks) referenced above.
$code .= <<___;
ext $t0.16b, $Xl.16b, $Xh.16b, #8
eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
eor $Xm.16b, $Xm.16b, $Xh.16b
eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins $Xh.d[0], $Xm.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl $t1.2d, $Xl.2d, #57 // 1st phase
shl $t2.2d, $Xl.2d, #62
eor $t2.16b, $t2.16b, $t1.16b //
shl $t1.2d, $Xl.2d, #63
eor $t2.16b, $t2.16b, $t1.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor $t2.16b, $t2.16b, $Xm.16b
ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
ushr $t2.2d, $Xl.2d, #1 // 2nd phase
eor $Xh.16b, $Xh.16b,$Xl.16b
eor $Xl.16b, $Xl.16b,$t2.16b //
ushr $t2.2d, $t2.2d, #6
ushr $Xl.2d, $Xl.2d, #1 //
eor $Xl.16b, $Xl.16b, $Xh.16b //
eor $Xl.16b, $Xl.16b, $t2.16b //
subs $len, $len, #16
bne .Loop_neon
rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
st1 {$Xl.16b}, [$Xi]
ret
.size gcm_ghash_neon,.-gcm_ghash_neon
.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align 2
___
# Post-process the accumulated perlasm: expand any `...` escapes via eval
# and emit each finished line to the (possibly redirected) STDOUT.
for my $line (split /\n/, $code) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;
    print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
# SPDX-License-Identifier: ISC
# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and indexes them up into the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
# hhhhhh
# * xxxxxx
# ============
# 666666
# 555555
# 444444
# 333333
# 222222
# 111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
# hhhhhh
# * xxxxxx
# ============
# 224466
# 113355
# 224466
# 113355
# 224466
# 113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a byte
# as first. Bytes are little-endian, and bits are big-endian. We reverse the
# bytes in XMM registers for a consistent bit and byte ordering, but this means
# the least significant bit is the most significant coefficient and vice versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move from MSB to the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Locate the shared x86 perlasm framework relative to this script and pull
# in x86asm.pl, which provides the &function_begin/&mov/... emitters below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT, ">$output";
&asm_init($ARGV[0]);
# 32-bit register assignments for the (Xi, Htable, in, len) arguments.
my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
# Labels for the constant pool emitted at the bottom of the file.
&static_label("reverse_bytes");
&static_label("low4_mask");
# Counter used to generate unique loop labels per process_rows invocation.
my $call_counter = 0;
# process_rows emits assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
# low and high halves of the input. The result so far is passed in xmm2. xmm3
# must be zero. On output, $Htable is advanced to the next row and xmm2 is
# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
#
# (A stray "____" bareword line — apparently a leftover heredoc terminator
# from the x86_64 version of this script — used to sit at the end of this
# sub; it has been removed. It served no purpose and would be a compile
# error under "use strict".)
sub process_rows {
my ($rows) = @_;
$call_counter++;
# Shifting whole XMM registers by bits is complex. psrldq shifts by
# bytes, and psrlq shifts the two 64-bit halves separately. Each row
# produces 8 bits of carry, and the reduction needs an additional 7-bit
# shift. This must fit in 64 bits so reduction can use psrlq. This
# allows up to 7 rows at a time.
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
&mov("eax", $rows);
&set_label("loop_row_$call_counter");
&movdqa("xmm4", &QWP(0, $Htable));
&lea($Htable, &DWP(16, $Htable));
# Right-shift xmm2 and xmm3 by 8 bytes.
&movdqa("xmm6", "xmm2");
&palignr("xmm6", "xmm3", 1);
&movdqa("xmm3", "xmm6");
&psrldq("xmm2", 1);
# Load the next table row and index the low and high bits of the input.
# Note the low (respectively, high) half corresponds to more
# (respectively, less) significant coefficients.
&movdqa("xmm5", "xmm4");
&pshufb("xmm4", "xmm0");
&pshufb("xmm5", "xmm1");
# Add the high half (xmm5) without shifting.
&pxor("xmm2", "xmm5");
# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
# add into the carry register (xmm3).
&movdqa("xmm5", "xmm4");
&psllq("xmm5", 60);
&movdqa("xmm6", "xmm5");
&pslldq("xmm6", 8);
&pxor("xmm3", "xmm6");
# Next, add into xmm2.
&psrldq("xmm5", 8);
&pxor("xmm2", "xmm5");
&psrlq("xmm4", 4);
&pxor("xmm2", "xmm4");
&sub("eax", 1);
&jnz(&label("loop_row_$call_counter"));
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
# x^7, so we shift and XOR four times.
&pxor("xmm2", "xmm3"); # x^0 = 0
&psrlq("xmm3", 1);
&pxor("xmm2", "xmm3"); # x^1 = x
&psrlq("xmm3", 1);
&pxor("xmm2", "xmm3"); # x^(1+1) = x^2
&psrlq("xmm3", 5);
&pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7
&pxor("xmm3", "xmm3");
}
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
&function_begin("gcm_gmult_ssse3");
&mov($Xi, &wparam(0));
&mov($Htable, &wparam(1));
&movdqu("xmm0", &QWP(0, $Xi));
# Materialize a PC-relative base in eax (32-bit PIC idiom) so the constant
# pool labels below can be addressed position-independently.
&call(&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
# Reverse input bytes to deserialize.
&pshufb("xmm0", "xmm7");
# Split each byte into low (xmm0) and high (xmm1) halves.
&movdqa("xmm1", "xmm2");
&pandn("xmm1", "xmm0");
&psrld("xmm1", 4);
&pand("xmm0", "xmm2");
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
# that, due to bit reversal, xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
# We must reduce at least once every 7 rows, so divide into three
# chunks.
&process_rows(5);
&process_rows(5);
&process_rows(6);
# Store the result. Reverse bytes to serialize.
&pshufb("xmm2", "xmm7");
&movdqu(&QWP(0, $Xi), "xmm2");
# Zero any registers which contain secrets.
&pxor("xmm0", "xmm0");
&pxor("xmm1", "xmm1");
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
&pxor("xmm4", "xmm4");
&pxor("xmm5", "xmm5");
&pxor("xmm6", "xmm6");
&function_end("gcm_gmult_ssse3");
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
# size_t len);
&function_begin("gcm_ghash_ssse3");
&mov($Xi, &wparam(0));
&mov($Htable, &wparam(1));
&mov($in, &wparam(2));
&mov($len, &wparam(3));
&movdqu("xmm0", &QWP(0, $Xi));
# Materialize a PC-relative base in ebx (32-bit PIC idiom) so the constant
# pool labels below can be addressed position-independently.
&call(&label("pic_point"));
&set_label("pic_point");
&blindpop("ebx");
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
# This function only processes whole blocks.
&and($len, -16);
# Reverse input bytes to deserialize. We maintain the running
# total in xmm0.
&pshufb("xmm0", "xmm7");
# Iterate over each block. On entry to each iteration, xmm3 is zero.
&pxor("xmm3", "xmm3");
&set_label("loop_ghash");
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
# Incorporate the next block of input.
&movdqu("xmm1", &QWP(0, $in));
&pshufb("xmm1", "xmm7"); # Reverse bytes.
&pxor("xmm0", "xmm1");
# Split each byte into low (xmm0) and high (xmm1) halves.
&movdqa("xmm1", "xmm2");
&pandn("xmm1", "xmm0");
&psrld("xmm1", 4);
&pand("xmm0", "xmm2");
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
# that, due to bit reversal, xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
&pxor("xmm2", "xmm2");
# xmm3 is already zero at this point.
# We must reduce at least once every 7 rows, so divide into three
# chunks.
&process_rows(5);
&process_rows(5);
&process_rows(6);
&movdqa("xmm0", "xmm2");
# Rewind $Htable for the next iteration.
&lea($Htable, &DWP(-256, $Htable));
# Advance input and continue.
&lea($in, &DWP(16, $in));
&sub($len, 16);
&jnz(&label("loop_ghash"));
# Reverse bytes and store the result.
&pshufb("xmm0", "xmm7");
&movdqu(&QWP(0, $Xi), "xmm0");
# Zero any registers which contain secrets.
&pxor("xmm0", "xmm0");
&pxor("xmm1", "xmm1");
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
&pxor("xmm4", "xmm4");
&pxor("xmm5", "xmm5");
&pxor("xmm6", "xmm6");
&function_end("gcm_ghash_ssse3");
# Constant pool referenced (PC-relative) by both functions above.
# reverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
&set_label("reverse_bytes", 16);
&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
# low4_mask is an XMM mask which selects the low four bits of each byte.
&set_label("low4_mask", 16);
&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
# Flush the accumulated assembly to the output file opened above.
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,342 @@
#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
#
# SPDX-License-Identifier: ISC
# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and indexes them up into the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
# hhhhhh
# * xxxxxx
# ============
# 666666
# 555555
# 444444
# 333333
# 222222
# 111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
# hhhhhh
# * xxxxxx
# ============
# 224466
# 113355
# 224466
# 113355
# 224466
# 113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a byte
# as first. Bytes are little-endian, and bits are big-endian. We reverse the
# bytes in XMM registers for a consistent bit and byte ordering, but this means
# the least significant bit is the most significant coefficient and vice versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move from MSB to the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
use strict;
# This is a perlasm source file: it emits x86_64 GHASH (SSSE3) assembly by
# piping generated text through the x86_64-xlate.pl translator.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
my $flavour = shift;
my $output = shift;
# Windows x64 uses a different calling convention and SEH unwind directives.
my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the translator relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Check that the pipe to the translator actually opened; previously a failed
# open was silently ignored and all output went to a dead handle.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT = *OUT;
# Argument registers: Win64 passes the first four integer arguments in
# rcx/rdx/r8/r9, SysV in rdi/rsi/rdx/rcx.
my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
("%rdi", "%rsi", "%rdx", "%rcx");
# Function header and prologue for gcm_gmult_ssse3.
my $code = <<____;
.text
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type gcm_gmult_ssse3, \@abi-omnipotent
.globl gcm_gmult_ssse3
.align 16
gcm_gmult_ssse3:
.cfi_startproc
.seh_startproc
_CET_ENDBR
____
# On Windows, xmm6-xmm15 are callee-saved; save the two we clobber.
$code .= <<____ if ($win64);
subq \$40, %rsp
.seh_allocstack 40
movdqa %xmm6, (%rsp)
.seh_savexmm128 %xmm6, 0
movdqa %xmm10, 16(%rsp)
.seh_savexmm128 %xmm10, 16
____
# Load Xi and the shared constants, then split the input into 4-bit halves
# ready for the table lookups emitted by process_rows below.
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm2
# Reverse input bytes to deserialize.
pshufb %xmm10, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm2, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm2, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
____
# Each call to process_rows emits a uniquely-labelled loop; the counter keeps
# the assembler labels distinct across calls.
my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
sub process_rows {
my ($rows) = @_;
$call_counter++;
# Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
# and psrlq shifts the two 64-bit halves separately. Each row produces 8
# bits of carry, and the reduction needs an additional 7-bit shift. This
# must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
# at a time.
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
return <<____;
movq \$$rows, %rax
.Loop_row_$call_counter:
movdqa ($Htable), %xmm4
leaq 16($Htable), $Htable
# Right-shift %xmm2 and %xmm3 by 8 bits, i.e. one byte. (The palignr and
# psrldq immediates count bytes, so the shift amount here is 1.)
movdqa %xmm2, %xmm6
palignr \$1, %xmm3, %xmm6
movdqa %xmm6, %xmm3
psrldq \$1, %xmm2
# Load the next table row and index the low and high bits of the input.
# Note the low (respectively, high) half corresponds to more
# (respectively, less) significant coefficients.
movdqa %xmm4, %xmm5
pshufb %xmm0, %xmm4
pshufb %xmm1, %xmm5
# Add the high half (%xmm5) without shifting.
pxor %xmm5, %xmm2
# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
# add into the carry register (%xmm3).
movdqa %xmm4, %xmm5
psllq \$60, %xmm5
movdqa %xmm5, %xmm6
pslldq \$8, %xmm6
pxor %xmm6, %xmm3
# Next, add into %xmm2.
psrldq \$8, %xmm5
pxor %xmm5, %xmm2
psrlq \$4, %xmm4
pxor %xmm4, %xmm2
subq \$1, %rax
jnz .Loop_row_$call_counter
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
# x^7, so we shift and XOR four times.
pxor %xmm3, %xmm2 # x^0 = 0
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^1 = x
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^(1+1) = x^2
psrlq \$5, %xmm3
pxor %xmm3, %xmm2 # x^(1+1+5) = x^7
pxor %xmm3, %xmm3
____
}
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
# Serialize the result and clear registers that held key/input material.
$code .= <<____;
# Store the result. Reverse bytes to serialize.
pshufb %xmm10, %xmm2
movdqu %xmm2, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
# Restore the Win64 callee-saved XMM registers spilled in the prologue.
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
addq \$40, %rsp
____
$code .= <<____;
ret
.cfi_endproc
.seh_endproc
.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
____
# gcm_ghash_ssse3: the streaming variant. Same per-block core as gmult, but
# iterates over |len| bytes of |in|, XOR-ing each block into the running Xi.
$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
# size_t len);
.type gcm_ghash_ssse3, \@abi-omnipotent
.globl gcm_ghash_ssse3
.align 16
gcm_ghash_ssse3:
.cfi_startproc
.seh_startproc
_CET_ENDBR
____
# Win64 prologue: this function additionally clobbers xmm11, so three
# callee-saved XMM registers are spilled.
$code .= <<____ if ($win64);
subq \$56, %rsp
.seh_allocstack 56
movdqa %xmm6, (%rsp)
.seh_savexmm128 %xmm6, 0
movdqa %xmm10, 16(%rsp)
.seh_savexmm128 %xmm10, 16
movdqa %xmm11, 32(%rsp)
.seh_savexmm128 %xmm11, 32
____
# Per-call setup and the top of the per-block loop.
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm11
# This function only processes whole blocks.
andq \$-16, $len
# Reverse input bytes to deserialize. We maintain the running
# total in %xmm0.
pshufb %xmm10, %xmm0
# Iterate over each block. On entry to each iteration, %xmm3 is zero.
pxor %xmm3, %xmm3
.Loop_ghash:
# Incorporate the next block of input.
movdqu ($in), %xmm1
pshufb %xmm10, %xmm1 # Reverse bytes.
pxor %xmm1, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm11, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm11, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
# %xmm3 is already zero at this point.
____
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
# Loop bottom: fold the product back into the running total, rewind the
# table pointer, and advance through the input.
$code .= <<____;
movdqa %xmm2, %xmm0
# Rewind $Htable for the next iteration.
leaq -256($Htable), $Htable
# Advance input and continue.
leaq 16($in), $in
subq \$16, $len
jnz .Loop_ghash
# Reverse bytes and store the result.
pshufb %xmm10, %xmm0
movdqu %xmm0, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
# Win64 epilogue: restore spilled XMM registers.
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
movdqa 32(%rsp), %xmm11
addq \$56, %rsp
____
# Function trailer plus the two shared constants in .rodata.
$code .= <<____;
ret
.cfi_endproc
.seh_endproc
.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
.section .rodata
.align 16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.text
____
# Emit everything through the xlate pipe opened above; closing flushes it.
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,684 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# March, May, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
# code paths: vanilla x86 and vanilla SSE. Former will be executed on
# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
# Atom 105/105 16.8 53
# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
# which is actually more relevant, because assembler code is
# position-independent;
# (***) see comment in non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
# in particular, see comment at the end of the file...
# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close is it to theoretical limit? The pclmulqdq
# instruction latency appears to be 14 cycles and there can't be more
# than 2 of them executing at any given time. This means that single
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
# pre- and post-processing. Then multiplication has to be followed by
# modulo-reduction. Given that aggregated reduction method [see
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
# white paper by Intel] allows you to perform reduction only once in
# a while we can assume that asymptotic performance can be estimated
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
# and Naggr is the aggregation factor.
#
# Before we proceed to this implementation let's have closer look at
# the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies Tmod is estimated as ~19
# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
# processed byte. As implied, this is quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
# the result accounts even for pre-computing of degrees of the hash
# key H, but its portion is negligible at 16KB buffer size.
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
# 2.16. How is it possible that measured performance is better than
# optimistic theoretical estimate? There is one thing Intel failed
# to recognize. By serializing GHASH with CTR in same subroutine
# former's performance is really limited to above (Tmul + Tmod/Naggr)
# equation. But if GHASH procedure is detached, the modulo-reduction
# can be interleaved with Naggr-1 multiplications at instruction level
# and under ideal conditions even disappear from the equation. So that
# optimistic theoretical estimate for this implementation is ...
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
# bank allows to interleave reduction and multiplication better.
#
# Does it make sense to increase Naggr? To start with it's virtually
# impossible in 32-bit mode, because of limited register bank
# capacity. Otherwise improvement has to be weighed against slower
# setup, as well as code size and complexity increase. As even
# optimistic estimate doesn't promise 30% performance improvement,
# there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
# January 2010
#
# Tweaked to optimize transitions between integer and FP operations
# on same XMM register, PCLMULQDQ subroutine was measured to process
# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
# The minor regression on Westmere is outweighed by ~15% improvement
# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
#####################################################################
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
# 32-bit mode and 1.89 in 64-bit.
# February 2013
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9. Resulting performance is 1.96 cycles per byte on
# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
# Setup for the 32-bit x86 generator, written against the x86asm.pl DSL
# (hence the &-style instruction subs used throughout).
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=$ARGV[1];
# Three-arg open with an explicit mode and error check: the previous two-arg
# form ("open STDOUT,\">$output\"") interpolated the path into the mode string
# and ignored failure, silently discarding all generated assembly.
open STDOUT, '>', $output or die "can't open $output: $!";
&asm_init($ARGV[0]);
$x86only=0;
$sse2=1;
if (!$x86only) {{{
if ($sse2) {{
######################################################################
# PCLMULQDQ version.
# Integer registers used for the argument block.
$Xip="eax";
$Htbl="edx";
$const="ecx";
$inp="esi";
$len="ebx";
# XMM register roles: accumulator pair, hash key, scratch, and the
# lookahead pair used for 2x aggregation.
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
($Xn,$Xhn)=("xmm6","xmm7");
# Forward-declare the label holding the byte-swap mask and polynomial.
&static_label("bswap");
# Emit a 64x64 carry-less multiplication of $Xi by $Hkey into the 256-bit
# pair $Xhi:$Xi, using Karatsuba (three pclmulqdq instead of four). If $HK is
# given it already holds Hkey.lo^Hkey.hi; otherwise it is computed into $T2.
# Clobbers $T1 and (when $HK is undefined) $T2.
sub clmul64x64_T2 { # minimal "register" pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
&pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); # T1 = Xi.lo^Xi.hi (Karatsuba middle term input)
&pxor ($T2,$Hkey) if (!defined($HK));
$HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); ####### lo*lo
&pclmulqdq ($Xhi,$Hkey,0x11); ####### hi*hi
&pclmulqdq ($T1,$HK,0x00); ####### middle product
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); # T1 = middle term
&movdqa ($T2,$T1); #
&psrldq ($T1,8);
&pslldq ($T2,8); # split middle term across the 128-bit halves
&pxor ($Xhi,$T1);
&pxor ($Xi,$T2); #
}
# Alternative Karatsuba 64x64 carry-less multiply with three temporaries.
# Same contract as clmul64x64_T2 (result in $Xhi:$Xi) but always derives the
# middle-term operands itself; clobbers $T1, $T2 and $T3.
sub clmul64x64_T3 {
# Even though this subroutine offers visually better ILP, it
# was empirically found to be a tad slower than above version.
# At least in gcm_ghash_clmul context. But it's just as well,
# because loop modulo-scheduling is possible only thanks to
# minimized "register" pressure...
my ($Xhi,$Xi,$Hkey)=@_;
&movdqa ($T1,$Xi); #
&movdqa ($Xhi,$Xi);
&pclmulqdq ($Xi,$Hkey,0x00); ####### lo*lo
&pclmulqdq ($Xhi,$Hkey,0x11); ####### hi*hi
&pshufd ($T2,$T1,0b01001110); #
&pshufd ($T3,$Hkey,0b01001110);
&pxor ($T2,$T1); # Xi.lo^Xi.hi
&pxor ($T3,$Hkey); # Hkey.lo^Hkey.hi
&pclmulqdq ($T2,$T3,0x00); ####### middle product
&pxor ($T2,$Xi); #
&pxor ($T2,$Xhi); # T2 = middle term
&movdqa ($T3,$T2); #
&psrldq ($T2,8);
&pslldq ($T3,8); # split middle term across the halves
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
}
if (1) { # Algorithm 9 with <<1 twist.
# Reduction is shorter and uses only two
# temporary registers, which makes it better
# candidate for interleaving with 64x64
# multiplication. Pre-modulo-scheduled loop
# was found to be ~20% faster than Algorithm 5
# below. Algorithm 9 was therefore chosen for
# further optimization...
# Emit the modulo-reduction of the 256-bit product $Xhi:$Xi back to 128 bits,
# folding the result into $Xi. Uses the <<1-twisted representation, in two
# shift/XOR phases. Clobbers $T1 and $T2.
sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
&movdqa ($T2,$Xi); #
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
&psrlq ($Xi,1);
&pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
}
# gcm_init_clmul(Htbl, Xip): compute the <<1-twisted H and H^2 from the raw
# hash key at Xip, plus the Karatsuba "salt", and store them at Htbl.
&function_begin_B("gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# <<1 twist
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
&movdqa ($T1,$Hkey);
&psllq ($Hkey,1);
&pxor ($T3,$T3); #
&psrlq ($T1,63);
&pcmpgtd ($T3,$T2); # broadcast carry bit
&pslldq ($T1,8);
&por ($Hkey,$T1); # H<<=1
# magic reduction
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
&pshufd ($T1,$Hkey,0b01001110);
&pshufd ($T2,$Xi,0b01001110);
&pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&palignr ($T2,$T1,8); # low part is H.lo^H.hi
&movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
# gcm_gmult_clmul(Xip, Htbl): multiply Xi by H once and write it back,
# byte-swapping on load/store via the bswap mask.
&function_begin_B("gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&movups ($T2,&QWP(32,$Htbl)); # reuse precomputed Karatsuba "salt"
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("gcm_gmult_clmul");
# gcm_ghash_clmul(Xip, Htbl, inp, len): streamed GHASH with 2x aggregated
# reduction. This is the prologue plus the software-pipelined loop preamble
# that processes the first two blocks.
&function_begin("gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail")); # exactly one block
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
&lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&nop ();
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
# Main loop: the reduction of the previous pair of blocks (reduction_alg9,
# inlined) is interleaved at instruction level with the H*Ii+1 multiplication
# of the next pair, hiding pclmulqdq latency.
&set_label("mod_loop",32);
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
&pshufb ($Xhn,$T3);
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&pshufb ($Xn,$T3);
&pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
&movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
&movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
&movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
# Semicolon added: the original "&pxor ($Xi,$Xhi) #" without a terminator
# only worked because the following "&pclmulqdq" parsed as a binary
# bitwise-and between the two calls' return values.
&pxor ($Xi,$Xhi); #
&pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
# even_tail: finish the final pair of blocks left in flight by the pipelined
# loop; odd_tail: handle a single trailing block.
&set_label("even_tail");
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&xorps ($Xhi,$Xhn);
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&pxor ($T1,$Xhi); #
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
&test ($len,$len);
&jnz (&label("done"));
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
} else { # Algorithm 5. Kept for reference purposes.
# Emit the modulo-reduction for the untwisted representation (Algorithm 5):
# first shifts the 256-bit product $Xhi:$Xi left by one, then reduces in two
# phases. Folds the result into $Xi; clobbers $T1, $T2 and $T3.
sub reduction_alg5 { # 19/16 times faster than Intel version
my ($Xhi,$Xi)=@_;
# <<1
&movdqa ($T1,$Xi); #
&movdqa ($T2,$Xhi);
&pslld ($Xi,1);
&pslld ($Xhi,1); #
&psrld ($T1,31);
&psrld ($T2,31); #
&movdqa ($T3,$T1);
&pslldq ($T1,4);
&psrldq ($T3,12); #
&pslldq ($T2,4);
&por ($Xhi,$T3); #
&por ($Xi,$T1);
&por ($Xhi,$T2); #
# 1st phase
&movdqa ($T1,$Xi);
&movdqa ($T2,$Xi);
&movdqa ($T3,$Xi); #
&pslld ($T1,31);
&pslld ($T2,30);
&pslld ($Xi,25); #
&pxor ($T1,$T2);
&pxor ($T1,$Xi); #
&movdqa ($T2,$T1); #
&pslldq ($T1,12);
&psrldq ($T2,4); #
&pxor ($T3,$T1);
# 2nd phase
&pxor ($Xhi,$T3); #
&movdqa ($Xi,$T3);
&movdqa ($T1,$T3);
&psrld ($Xi,1); #
&psrld ($T1,2);
&psrld ($T3,7); #
&pxor ($Xi,$T1);
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
&pxor ($Xi,$Xhi); #
}
# Algorithm-5 variant of gcm_init_clmul (reference only; the if(1) branch
# above is the one actually emitted): stores untwisted H and H^2.
&function_begin_B("gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&ret ();
&function_end_B("gcm_init_clmul");
# Algorithm-5 variant of gcm_gmult_clmul (reference only): single multiply
# of Xi by H with byte-swap on load/store.
&function_begin_B("gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($Xn,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$Xn);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&pshufb ($Xi,$Xn);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("gcm_gmult_clmul");
# Algorithm-5 variant of gcm_ghash_clmul (reference only): streamed GHASH
# with 2x aggregation but no instruction-level interleaving of the reduction.
&function_begin("gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail")); # exactly one block
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp)); # i+=2
&jbe (&label("even_tail"));
&set_label("mod_loop");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
#######
&movdqa ($T3,&QWP(0,$const));
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp));
&ja (&label("mod_loop"));
&set_label("even_tail");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&test ($len,$len);
&jnz (&label("done"));
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
}
# Shared constants: 16-byte byte-reversal mask for pshufb, followed at
# offset 16 by the 0x1c2 reduction polynomial constant.
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
}} # $sse2
}}} # !$x86only
# Emit the ident string, flush the generated assembly, and make sure a write
# failure on close is not silently ignored.
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# A question was raised about the choice of vanilla MMX. Or rather why wasn't
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
# performance than *corresponding* SSE2 one even on contemporary CPUs.
# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
# implementation featuring full range of lookup-table sizes, but with
# per-invocation lookup table setup. Latter means that table size is
# chosen depending on how much data is to be hashed in every given call,
# more data - larger table. Best reported result for Core2 is ~4 cycles
# per processed byte out of 64KB block. This number accounts even for
# 64KB table setup overhead. As discussed in gcm128.c we choose to be
# more conservative in respect to lookup table sizes, but how do the
# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
# on same platform. As also discussed in gcm128.c, next in line "8-bit
# Shoup's" or "4KB" method should deliver twice the performance of
# "256B" one, in other words not worse than ~6 cycles per byte. It
# should also be noted that in SSE2 case improvement can be "super-
# linear," i.e. more than twice, mostly because >>8 maps to single
# instruction on SSE2 register. This is unlike "4-bit" case when >>4
# maps to same amount of instructions in both MMX and SSE2 cases.
# Bottom line is that switch to SSE2 is considered to be justifiable
# only in case we choose to implement "8-bit" method...

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,669 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# GHASH for for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# Setup for the PowerISA v2.07 generator: select ABI-dependent mnemonics from
# the flavour, locate ppc-xlate.pl, and pipe the output through it.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
$UCMP="cmpld";
$SHRI="srdi";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
$UCMP="cmplw";
$SHRI="srwi";
} else { die "nonsense $flavour"; }
$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
# Use low-precedence "or" here: with "|| die" the || bound to the (always
# true) command string, so a failed open could never trigger the die.
open OUT,"| $^X \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Register assignments: argument block in r3-r6, working set in v0-v19.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
# gcm_init_p8, first part: derive the twisted H from the raw hash key and
# store the reduction constant, H and H^2 (split into lo/full/hi forms) into
# the per-key table. Heredoc text is emitted verbatim as assembly.
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $IN,$H,$t1 # twisted H
vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
li r8,0x40
stvx_u $H, r9,r3
li r9,0x50
stvx_u $Hh,r10,r3
li r10,0x60
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $IN1,$Xl,$t1
vsldoi $H2,$IN1,$IN1,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $H2l,r8,r3 # save H^2
li r8,0x70
stvx_u $H2,r9,r3
li r9,0x80
stvx_u $H2h,r10,r3
li r10,0x90
___
{
# gcm_init_p8, second part: compute and store H^3 and H^4 (for the 4x loop)
# with two reductions carried out in parallel, then restore vrsave and return.
# $t4-$t6 alias vector temporaries whose values are dead at this point.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vsldoi $t4,$Xm1,$zero,8
vsldoi $t5,$zero,$Xm1,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vxor $Xl1,$Xl1,$t4
vxor $Xh1,$Xh1,$t5
vsldoi $Xl,$Xl,$Xl,8
vsldoi $Xl1,$Xl1,$Xl1,8
vxor $Xl,$Xl,$t2
vxor $Xl1,$Xl1,$t6
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vpmsumd $Xl1,$Xl1,$xC2
vxor $t1,$t1,$Xh
vxor $t5,$t5,$Xh1
vxor $Xl,$Xl,$t1
vxor $Xl1,$Xl1,$t5
vsldoi $H,$Xl,$Xl,8
vsldoi $H2,$Xl1,$Xl1,8
vsldoi $Hl,$zero,$H,8
vsldoi $Hh,$H,$zero,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $Hl,r8,r3 # save H^3
li r8,0xa0
stvx_u $H,r9,r3
li r9,0xb0
stvx_u $Hh,r10,r3
li r10,0xc0
stvx_u $H2l,r8,r3 # save H^4
stvx_u $H2,r9,r3
stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
# gcm_gmult_p8 (single multiplication by H) and the scalar/2x path of
# gcm_ghash_p8; inputs >= 64 bytes branch to the 4x code emitted later.
# Heredoc text is emitted verbatim as assembly.
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
li r8,0x40
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
li r9,0x50
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
li r10,0x60
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
${UCMP}i $len,64
bge Lgcm_ghash_p8_4x
lvx_u $IN,0,$inp
addi $inp,$inp,16
subic. $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
beq Lshort
lvx_u $H2l,r8,$Htbl # load H^2
li r8,16
lvx_u $H2, r9,$Htbl
add r9,$inp,$len # end of input
lvx_u $H2h,r10,$Htbl
be?b Loop_2x
.align 5
Loop_2x:
lvx_u $IN1,0,$inp
le?vperm $IN1,$IN1,$IN1,$lemask
subic $len,$len,32
vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
subfe r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
add $inp,$inp,r0
vxor $Xl,$Xl,$Xl1
vxor $Xm,$Xm,$Xm1
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh1
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,r8,$inp
addi $inp,$inp,32
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
$UCMP r9,$inp
bgt Loop_2x # done yet?
cmplwi $len,0
bne Leven
Lshort:
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
Leven:
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
___
{
# 4x-aggregated GHASH inner routine for POWER8 (.gcm_ghash_p8_4x): consumes
# four input blocks per Loop_4x iteration using the precomputed powers
# H..H^4 from $Htbl, with 1..3 leftover blocks handled by Lone/Ltwo/Lthree.
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
$Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
# $IN0 aliases $IN from the 2x path above.
my $IN0=$IN;
# Packed H^2|H pair and the permute masks that build it reuse the registers
# the 2x path used for Hl/Hh/H2l/H2h (they are not live simultaneously).
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
li r10,0x60
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME-4`($sp) # save vrsave
mtspr 256,r0 # preserve all AltiVec registers
lvsl $t0,0,r8 # 0x0001..0e0f
#lvx_u $H2l,r8,$Htbl # load H^2
li r8,0x70
lvx_u $H2, r9,$Htbl
li r9,0x80
vspltisb $t1,8 # 0x0808..0808
#lvx_u $H2h,r10,$Htbl
li r10,0x90
lvx_u $H3l,r8,$Htbl # load H^3
li r8,0xa0
lvx_u $H3, r9,$Htbl
li r9,0xb0
lvx_u $H3h,r10,$Htbl
li r10,0xc0
lvx_u $H4l,r8,$Htbl # load H^4
li r8,0x10
lvx_u $H4, r9,$Htbl
li r9,0x20
lvx_u $H4h,r10,$Htbl
li r10,0x30
vsldoi $t2,$zero,$t1,8 # 0x0000..0808
vaddubm $hiperm,$t0,$t2 # 0x0001..1617
vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
$SHRI $len,$len,4 # this allows to use sign bit
# as carry
lvx_u $IN0,0,$inp # load input
lvx_u $IN1,r8,$inp
subic. $len,$len,8
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
vxor $Xh,$IN0,$Xl
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vperm $H21l,$H2,$H,$hiperm
vperm $t0,$IN2,$IN3,$loperm
vperm $H21h,$H2,$H,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
vxor $Xm2,$Xm2,$Xm1
vxor $Xl3,$Xl3,$Xl1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh3,$Xh3,$Xh1
blt Ltail_4x
Loop_4x:
lvx_u $IN0,0,$inp
lvx_u $IN1,r8,$inp
subic. $len,$len,4
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
le?vperm $IN0,$IN0,$IN0,$lemask
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vxor $Xh,$Xh,$Xh3
vperm $t0,$IN2,$IN3,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xl,$Xl,$xC2
vxor $Xl3,$Xl3,$Xl1
vxor $Xh3,$Xh3,$Xh1
vxor $Xh,$Xh,$IN0
vxor $Xm2,$Xm2,$Xm1
vxor $Xh,$Xh,$t1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh,$Xh,$Xl
bge Loop_4x
Ltail_4x:
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh3
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
addic. $len,$len,4
beq Ldone_4x
lvx_u $IN0,0,$inp
${UCMP}i $len,2
li $len,-4
blt Lone
lvx_u $IN1,r8,$inp
beq Ltwo
Lthree:
lvx_u $IN2,r9,$inp
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
vxor $Xh,$IN0,$Xl
vmr $H4l,$H3l
vmr $H4, $H3
vmr $H4h,$H3h
vperm $t0,$IN1,$IN2,$loperm
vperm $t1,$IN1,$IN2,$hiperm
vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
vxor $Xm3,$Xm3,$Xm2
b Ltail_4x
.align 4
Ltwo:
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
vxor $Xh,$IN0,$Xl
vperm $t0,$zero,$IN1,$loperm
vperm $t1,$zero,$IN1,$hiperm
vsldoi $H4l,$zero,$H2,8
vmr $H4, $H2
vsldoi $H4h,$H2,$zero,8
vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
b Ltail_4x
.align 4
Lone:
le?vperm $IN0,$IN0,$IN0,$lemask
vsldoi $H4l,$zero,$H,8
vmr $H4, $H
vsldoi $H4h,$H,$zero,8
vxor $Xh,$IN0,$Xl
vxor $Xl3,$Xl3,$Xl3
vxor $Xm3,$Xm3,$Xm3
vxor $Xh3,$Xh3,$Xh3
b Ltail_4x
Ldone_4x:
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtspr 256,$vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,0,0x80,0,4,0
.long 0
___
}
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Post-process the generated text: evaluate `...` constant expressions at
# script time, then resolve the le?/be? endian-conditional prefixes — for
# the selected endianness the prefix is stripped, for the other the whole
# instruction is commented out.  Note the s///-or chains are order-sensitive:
# a line carries at most one of the two prefixes.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
} else {
s/le\?/#le#/o or
s/be\?//o;
}
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,883 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# July 2014
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
#
# (*) presented for reference/comparison purposes;
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Locate the perlasm ARM translator relative to this script and pipe all
# generated output through it; $flavour selects 32- vs 64-bit translation.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# Symbolic names for the argument registers of the generated functions.
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
# Vector registers used by the common (non-4x) code paths.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit builds get the crypto-extension arch directive; 32-bit builds
# select NEON/ARM mode instead.
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
.code 32
#undef __thumb2__
___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only few of 16 slots of Htable[16] are used;
# data is opaque to outside world (which allows to
# optimize the code independently);
#
# The emitted code twists H (multiplies it by x modulo the GHASH
# polynomial, folding the carry with the 0xc2... constant), then squares
# the twisted H to obtain H^2 and stores H, (H.lo+H.hi | H^2.lo+H^2.hi)
# Karatsuba material, and H^2 in Htable[0..2].
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vext.8 $H, $H, $H, #8
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull2.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $t1,$Xl,$t2
vext.8 $H2,$t1,$t1,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl},[x0],#16 @ store Htable[1..2]
vst1.64 {$H2},[x0],#16 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
# 64-bit only: extend the table with H^3..H^8 (plus packed Karatsuba
# pre-processed values H34k/H56k/H78k) for the 4x-aggregated ghash path.
# Each step multiplies/squares previously computed powers and performs the
# same two-phase reduction as above, interleaved two results at a time
# (X* and Y* register sets).
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
$code.=<<___;
@ calculate H^3 and H^4
vpmull2.p64 $Xl,$H, $H2
vpmull2.p64 $Yl,$H2,$H2
vpmull.p64 $Xh,$H, $H2
vpmull.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0, $Xl,$t2 @ H^3
veor $t1, $Yl,$t3 @ H^4
vext.8 $H3,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H4,$t1,$t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H3
veor $t1,$t1,$H4
veor $t2,$t2,$H2
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
@ calculate H^5 and H^6
vpmull2.p64 $Xl,$H2, $H3
vpmull2.p64 $Yl,$H3,$H3
vpmull.p64 $Xh,$H2, $H3
vpmull.p64 $Yh,$H3,$H3
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t0,$t0
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0,$Xl,$t2 @ H^5
veor $t1,$Yl,$t3 @ H^6
vext.8 $H5, $t0, $t0,#8 @ Karatsuba pre-processing
vext.8 $H6, $t1, $t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H5
veor $t1,$t1,$H6
veor $t2,$t2,$H2
vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
@ calculate H^7 and H^8
vpmull2.p64 $Xl,$H2,$H5
vpmull2.p64 $Yl,$H2,$H6
vpmull.p64 $Xh,$H2,$H5
vpmull.p64 $Yh,$H2,$H6
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t1,$t2
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0,$Xl,$t2 @ H^7
veor $t1,$Yl,$t3 @ H^8
vext.8 $H7,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H8,$t1,$t1,#8
veor $t0,$t0,$H7
veor $t1,$t1,$H8
vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
___
}
$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
# Single-block GHASH multiplication: Xi = (Xi * H) mod P, implemented as a
# Karatsuba 128x128->256-bit carry-less multiply followed by the two-phase
# reduction with the 0xc2... constant.
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vext.8 $H,$H,$H,#8
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, but divisible by block size;
# output: next hash value Xi;
#
# The 64-bit build dispatches to the 4x-aggregated routine for inputs of
# 64 bytes or more; the 2x modulo-scheduled loop below handles the rest.
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
___
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
vext.8 $H2,$H2,$H2,#8
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
# Modulo-scheduled 2x loop: the H*Ii+1 product of the next pair is started
# while the current pair's reduction completes; $inc is zeroed just before
# the final iteration so loads never step past inp[len].
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
# Single-block tail (one leftover 16-byte block), then the common epilogue
# that byte-reverses and stores the updated Xi.
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
if ($flavour =~ /64/) { # 4x subroutine
# 64-bit only: 4x-aggregated GHASH (gcm_ghash_v8_4x). Processes four input
# blocks per Loop4x iteration against H..H^4 from the extended table, with
# 1..3 leftover blocks handled by the .Lone/.Ltwo/.Lthree tails.
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vext.8 $H2,$H2,$H2,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vext.8 $H3,$H3,$H3,#8
vext.8 $H4,$H4,$H4,#8
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
subs $len,$len,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1
subs $len,$len,#64
b.hs .Loop4x
.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
adds $len,$len,#64
b.eq .Ldone4x
cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8
vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0
.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
# Post-process $code: the module is written in (extended) 32-bit NEON
# syntax; the 64-bit branch mechanically rewrites it into AArch64 syntax.
if ($flavour =~ /64/) { ######## 64-bit code
# Translate a 32-bit "vmov qN#lo/hi,qM#lo/hi" operand pair into the AArch64
# "ins vD.d[i],vS.d[j]" form.  q0-q7 keep their number; q8 and up map to
# v16 and up (matching this module's 32->64-bit register assignment).
# Returns the rewritten instruction, or false if the operands don't match.
sub unvmov {
    my ($operands) = @_;
    if ($operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o) {
        my ($qd, $dhalf, $qs, $shalf) = ($1, $2, $3, $4);
        my $vd = ($qd < 8) ? $qd : $qd + 8;
        my $vs = ($qs < 8) ? $qs : $qs + 8;
        return sprintf "ins v%d.d[%d],v%d.d[%d]",
                       $vd, ($dhalf eq "lo") ? 0 : 1,
                       $vs, ($shalf eq "lo") ? 0 : 1;
    }
    return "";
}
# 64-bit output: rewrite each generated line from legacy NEON mnemonics to
# AArch64 ones.  The s///-or chain applies at most one mnemonic rewrite per
# line, then register names, commentary style and type suffixes are fixed
# up; order of the substitutions is significant.
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
} else { ######## 32-bit code
# Translate AArch64-style "vdup.32 qN,qM[lane]" operands into the 32-bit
# NEON form "vdup.32 qN,dD[l]": lane i of q-register M lives in
# d-register 2*M + i/2 at half-word index i%2.
# Returns the rewritten instruction, or false if the operands don't match.
sub unvdup32 {
    my ($operands) = @_;
    if ($operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o) {
        my ($qd, $qs, $lane) = ($1, $2, $3);
        return sprintf "vdup.32 q%d,d%d[%d]",
                       $qd, 2 * $qs + ($lane >> 1), $lane & 1;
    }
    return "";
}
# Hand-assemble pmull/pmull2.p64 for 32-bit builds: pre-ARMv8 assemblers
# don't know the mnemonic, so emit the raw instruction encoding as .byte
# data (little-endian, as ARMv7 instructions are always encoded), keeping
# the intended mnemonic as trailing commentary.
sub unvpmullp64 {
    my ($mnemonic, $operands) = @_;
    if ($operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
        my ($qd, $qn, $qm) = ($1, $2, $3);
        # Base opcode plus the split register fields (low 3 bits and the
        # "high bank" bit placed separately for each operand).
        my $word = 0xf2a00e00;
        $word |= ($qd & 7) << 13 | ($qd & 8) << 19;
        $word |= ($qn & 7) << 17 | ($qn & 8) << 4;
        $word |= ($qm & 7) << 1  | ($qm & 8) << 2;
        $word |= 0x00010001 if ($mnemonic =~ /2/);   # pmull2 variant
        return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                       map(($word >> $_) & 0xff, (0, 8, 16, 24)),
                       $mnemonic, $operands;
    }
    return "";
}
# 32-bit output: map AArch64-style register names and commentary back to
# the classic ARM/NEON forms, hand-assemble pmull via unvpmullp64(), and
# rewrite post-indexed addressing and branch syntax.  As above, the
# substitution order in the or-chain is significant.
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,115 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <assert.h>
#include <string.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// CBC-encrypt |len| bytes from |in| to |out| with |block| under |key|,
// chaining from and updating the IV in |ivec|.  A trailing partial block
// (len not a multiple of 16) is padded with bytes of the chaining value
// before encryption, matching the original OpenSSL semantics.
void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16],
                           block128_f block) {
  assert(key != NULL && ivec != NULL);
  if (len == 0) {
    // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
    return;
  }
  assert(in != NULL && out != NULL);

  const uint8_t *chain = ivec;
  // Full blocks: out[i] = E(in[i] ^ chain); ciphertext becomes the next
  // chaining value.
  for (; len >= 16; in += 16, out += 16, len -= 16) {
    CRYPTO_xor16(out, in, chain);
    (*block)(out, out, key);
    chain = out;
  }
  // Trailing partial block: xor the available bytes, carry the chaining
  // value through for the rest, then encrypt in place.
  if (len > 0) {
    size_t i = 0;
    for (; i < len && i < 16; ++i) {
      out[i] = in[i] ^ chain[i];
    }
    for (; i < 16; ++i) {
      out[i] = chain[i];
    }
    (*block)(out, out, key);
    chain = out;
  }
  OPENSSL_memcpy(ivec, chain, 16);
}
// CBC-decrypt |len| bytes from |in| to |out| with |block| under |key|,
// using and updating the IV in |ivec|.  |in| and |out| may alias only if
// |in| is not behind |out| (asserted below); two code paths are used
// depending on how far apart the buffers are.
void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block) {
assert(key != NULL && ivec != NULL);
if (len == 0) {
// Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
return;
}
assert(in != NULL && out != NULL);
const uintptr_t inptr = (uintptr_t) in;
const uintptr_t outptr = (uintptr_t) out;
// If |in| and |out| alias, |in| must be ahead.
assert(inptr >= outptr || inptr + len <= outptr);
size_t n;
alignas(16) uint8_t tmp[16];
if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) {
// If |out| is at least two blocks behind |in| or completely disjoint, there
// is no need to decrypt to a temporary block.
// Decrypt directly into |out|; the previous ciphertext block (still
// intact in |in|) serves as the next xor mask.
const uint8_t *iv = ivec;
while (len >= 16) {
(*block)(in, out, key);
CRYPTO_xor16(out, out, iv);
iv = in;
len -= 16;
in += 16;
out += 16;
}
OPENSSL_memcpy(ivec, iv, 16);
} else {
// Overlapping (or nearly overlapping) buffers: decrypt into |tmp| and
// save each ciphertext block into |ivec| word-by-word before the
// output write can clobber it.
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
block_cannot_be_evenly_divided_into_crypto_word_t)
while (len >= 16) {
(*block)(in, tmp, key);
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t c = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(tmp + n) ^
CRYPTO_load_word_le(ivec + n));
CRYPTO_store_word_le(ivec + n, c);
}
len -= 16;
in += 16;
out += 16;
}
}
// Trailing partial block: only the first |len| bytes are produced; the
// remaining ciphertext bytes are still folded into |ivec|.
while (len) {
uint8_t c;
(*block)(in, tmp, key);
for (n = 0; n < 16 && n < len; ++n) {
c = in[n];
out[n] = tmp[n] ^ ivec[n];
ivec[n] = c;
}
if (len <= 16) {
for (; n < 16; ++n) {
ivec[n] = in[n];
}
break;
}
len -= 16;
in += 16;
out += 16;
}
}

View File

@@ -0,0 +1,158 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
cfb_block_cannot_be_divided_into_size_t)
// CFB-128 encrypt (|enc| non-zero) or decrypt (|enc| zero) |len| bytes.
// |*num| is the offset into the current keystream block (0..15) carried
// between calls; |ivec| holds the shift register.  Encryption feeds
// ciphertext back into |ivec|; decryption feeds the incoming ciphertext.
void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
int enc, block128_f block) {
assert(in && out && key && ivec && num);
unsigned n = *num;
if (enc) {
// Drain leftover keystream bytes from a previous partial block.
while (n && len) {
*(out++) = ivec[n] ^= *(in++);
--len;
n = (n + 1) % 16;
}
// Whole blocks, word at a time; |n| is 0 on entry to each iteration.
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t tmp =
CRYPTO_load_word_le(ivec + n) ^ CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(ivec + n, tmp);
CRYPTO_store_word_le(out + n, tmp);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
// Trailing partial block: generate keystream, consume |len| bytes of it.
if (len) {
(*block)(ivec, ivec, key);
while (len--) {
out[n] = ivec[n] ^= in[n];
++n;
}
}
*num = n;
return;
} else {
// Decrypt path mirrors the above, but the feedback value is the raw
// ciphertext byte/word, saved before the xor in case in == out.
while (n && len) {
uint8_t c;
*(out++) = ivec[n] ^ (c = *(in++));
ivec[n] = c;
--len;
n = (n + 1) % 16;
}
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t t = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(ivec + n) ^ t);
CRYPTO_store_word_le(ivec + n, t);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block)(ivec, ivec, key);
while (len--) {
uint8_t c;
out[n] = ivec[n] ^ (c = in[n]);
ivec[n] = c;
++n;
}
}
*num = n;
return;
}
}
/* This expects a single block of size nbits for both in and out. Note that
it corrupts any extra bits in the last byte of out */
static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits,
const AES_KEY *key, uint8_t ivec[16], int enc,
block128_f block) {
int n, rem, num;
uint8_t ovec[16 * 2 + 1]; /* +1 because we dereference (but don't use) one
byte off the end */
if (nbits <= 0 || nbits > 128) {
return;
}
// fill in the first half of the new IV with the current IV
OPENSSL_memcpy(ovec, ivec, 16);
// construct the new IV
(*block)(ivec, ivec, key);
// number of whole/partial bytes covered by nbits
num = (nbits + 7) / 8;
if (enc) {
// encrypt the input; ciphertext bytes also go into ovec[16..] so they
// can be shifted into the IV below
for (n = 0; n < num; ++n) {
out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
}
} else {
// decrypt the input; the raw ciphertext is what feeds back
for (n = 0; n < num; ++n) {
out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
}
}
// shift ovec left...
// ... by nbits: whole bytes via memcpy when aligned, otherwise stitch
// each result byte from two adjacent source bytes (this is where the
// one-past-the-end dereference of ovec can occur).
rem = nbits % 8;
num = nbits / 8;
if (rem == 0) {
OPENSSL_memcpy(ivec, ovec + num, 16);
} else {
for (n = 0; n < 16; ++n) {
ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
}
}
// it is not necessary to cleanse ovec, since the IV is not secret
}
// CFB-1 mode: processes |bits| individual bits, packed MS-bit-first in the
// input.  Each bit is moved into the top of a one-byte scratch block, run
// through cfbr_encrypt_block(), and the resulting top bit is written back
// into the corresponding bit position of |out|.  |*num| must be zero.
void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits,
                             const AES_KEY *key, uint8_t ivec[16],
                             unsigned *num, int enc, block128_f block) {
  assert(in && out && key && ivec && num);
  assert(*num == 0);

  for (size_t i = 0; i < bits; ++i) {
    const unsigned shift = (unsigned)(7 - i % 8);
    uint8_t src[1], dst[1];
    // Bit i of the input, promoted to the MSB of a scratch byte.
    src[0] = (in[i / 8] & (1 << shift)) ? 0x80 : 0;
    cfbr_encrypt_block(src, dst, 1, key, ivec, enc, block);
    // Clear bit i of the output and replace it with the processed bit.
    out[i / 8] = (uint8_t)((out[i / 8] & ~(1u << shift)) |
                           ((dst[0] & 0x80) >> (unsigned)(i % 8)));
  }
}
// CFB-8 mode: encrypt/decrypt |length| bytes one byte at a time, running
// each byte through cfbr_encrypt_block() with an 8-bit feedback width.
// |*num| must be zero on entry.
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
                             size_t length, const AES_KEY *key,
                             unsigned char ivec[16], unsigned *num, int enc,
                             block128_f block) {
  assert(in && out && key && ivec && num);
  assert(*num == 0);

  for (size_t i = 0; i < length; ++i) {
    cfbr_encrypt_block(in + i, out + i, 8, key, ivec, enc, block);
  }
}

View File

@@ -0,0 +1,153 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
#include "../../internal.h"
// NOTE: the IV/counter CTR mode is big-endian. The code itself
// is endian-neutral.
// Increment the 128-bit big-endian counter in |counter| by one, with carry
// propagation from the last byte toward the first (wraps on overflow).
static void ctr128_inc(uint8_t *counter) {
  uint32_t carry = 1;
  for (int i = 15; i >= 0; --i) {
    carry += counter[i];
    counter[i] = (uint8_t)carry;
    carry >>= 8;
  }
}
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
ctr_block_cannot_be_divided_into_crypto_word_t)
// The input encrypted as though 128bit counter mode is being used. The extra
// state information to record how much of the 128bit block we have used is
// contained in *num, and the encrypted counter is kept in ecount_buf. Both
// *num and ecount_buf must be initialised with zeros before the first call to
// CRYPTO_ctr128_encrypt().
//
// This algorithm assumes that the counter is in the x lower bits of the IV
// (ivec), and that the application has full control over overflow and the rest
// of the IV. This implementation takes NO responsibility for checking that
// the counter doesn't overflow into the rest of the IV when incremented.
void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16],
                           uint8_t ecount_buf[16], unsigned int *num,
                           block128_f block) {
  unsigned int n;
  assert(key && ecount_buf && num);
  assert(len == 0 || (in && out));
  assert(*num < 16);
  n = *num;
  // Consume any leftover keystream bytes from a previous partial block.
  while (n && len) {
    *(out++) = *(in++) ^ ecount_buf[n];
    --len;
    n = (n + 1) % 16;
  }
  // Full 16-byte blocks: encrypt the counter, bump it, XOR into the output.
  while (len >= 16) {
    (*block)(ivec, ecount_buf, key);
    ctr128_inc(ivec);
    CRYPTO_xor16(out, in, ecount_buf);
    len -= 16;
    out += 16;
    in += 16;
    n = 0;
  }
  // Trailing partial block: generate one more keystream block and use only
  // the first |len| bytes; the remainder stays in |ecount_buf| for the next
  // call (tracked via |*num|).
  if (len) {
    (*block)(ivec, ecount_buf, key);
    ctr128_inc(ivec);
    while (len--) {
      out[n] = in[n] ^ ecount_buf[n];
      ++n;
    }
  }
  *num = n;
}
// Increments the upper 96 bits (bytes 0..11, big-endian) of a 128-bit
// counter by one, leaving the low 32 bits untouched. Used to propagate a
// carry out of the 32-bit counter word handled by ctr32-style kernels.
static void ctr96_inc(uint8_t *counter) {
  uint32_t carry = 1;
  for (int i = 11; i >= 0; --i) {
    carry += counter[i];
    counter[i] = (uint8_t)carry;
    carry >>= 8;
  }
}
// CTR mode using a |ctr128_f| bulk kernel that only increments the low
// 32 bits of the counter. This wrapper detects 32-bit counter overflow,
// splits the input at the overflow point, and carries into the upper 96
// bits itself.
void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
                                 const AES_KEY *key, uint8_t ivec[16],
                                 uint8_t ecount_buf[16], unsigned int *num,
                                 ctr128_f func) {
  unsigned int n, ctr32;
  assert(key && ecount_buf && num);
  assert(len == 0 || (in && out));
  assert(*num < 16);
  n = *num;
  // Consume leftover keystream bytes from a previous partial block.
  while (n && len) {
    *(out++) = *(in++) ^ ecount_buf[n];
    --len;
    n = (n + 1) % 16;
  }
  ctr32 = CRYPTO_load_u32_be(ivec + 12);
  while (len >= 16) {
    size_t blocks = len / 16;
    // 1<<28 is just a not-so-small yet not-so-large number...
    // Below condition is practically never met, but it has to
    // be checked for code correctness.
    if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) {
      blocks = (1U << 28);
    }
    // As (*func) operates on 32-bit counter, caller
    // has to handle overflow. 'if' below detects the
    // overflow, which is then handled by limiting the
    // amount of blocks to the exact overflow point...
    ctr32 += (uint32_t)blocks;
    if (ctr32 < blocks) {
      blocks -= ctr32;
      ctr32 = 0;
    }
    (*func)(in, out, blocks, key, ivec);
    // (*func) does not update ivec, caller does:
    CRYPTO_store_u32_be(ivec + 12, ctr32);
    // ... overflow was detected, propagate carry.
    if (ctr32 == 0) {
      ctr96_inc(ivec);
    }
    blocks *= 16;
    len -= blocks;
    out += blocks;
    in += blocks;
  }
  // Trailing partial block: run the kernel once on a zeroed buffer to
  // produce a block of raw keystream, then XOR byte-by-byte.
  if (len) {
    OPENSSL_memset(ecount_buf, 0, 16);
    (*func)(ecount_buf, ecount_buf, 1, key, ivec);
    ++ctr32;
    CRYPTO_store_u32_be(ivec + 12, ctr32);
    if (ctr32 == 0) {
      ctr96_inc(ivec);
    }
    while (len--) {
      out[n] = in[n] ^ ecount_buf[n];
      ++n;
    }
  }
  *num = n;
}

View File

@@ -0,0 +1,850 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/base.h>
#include <assert.h>
#include <string.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
static const size_t kSizeTWithoutLower4Bits = (size_t) -16;
#define GCM_MUL(ctx, Xi) gcm_gmult_nohw((ctx)->Xi, (ctx)->gcm_key.Htable)
#define GHASH(ctx, in, len) \
gcm_ghash_nohw((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
// GHASH_CHUNK is "stride parameter" missioned to mitigate cache
// trashing effect. In other words idea is to hash data while it's
// still in L1 cache after encryption pass...
#define GHASH_CHUNK (3 * 1024)
#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
// Shifts the 128-bit field element |V| right by one bit and, when a bit
// falls off the low end, folds in the GHASH reduction constant (top byte
// 0xe1). The two branches are equivalent; the 32-bit variant exists so the
// mask computation stays within native word size on 32-bit targets — the
// compile-time sizeof() test selects one branch with no runtime cost.
static inline void gcm_reduce_1bit(u128 *V) {
  if (sizeof(crypto_word_t) == 8) {
    // 0 - (V->hi & 1) is an all-ones mask iff the dropped bit was set.
    uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->hi & 1));
    V->hi = (V->lo << 63) | (V->hi >> 1);
    V->lo = (V->lo >> 1) ^ T;
  } else {
    uint32_t T = 0xe1000000U & (0 - (uint32_t)(V->hi & 1));
    V->hi = (V->lo << 63) | (V->hi >> 1);
    V->lo = (V->lo >> 1) ^ ((uint64_t)T << 32);
  }
}
// Builds the 16-entry multiple table used by the SSSE3 GHASH implementation:
// Htable[i] holds i*H for i in 0..15, computed from H, H/2, H/4, H/8 (the
// gcm_reduce_1bit halvings) and XOR combinations. The table is then
// transposed byte-wise into the layout the assembly expects.
void gcm_init_ssse3(u128 Htable[16], const uint64_t H[2]) {
  Htable[0].hi = 0;
  Htable[0].lo = 0;
  u128 V;
  V.hi = H[1];
  V.lo = H[0];
  // Powers of two entries: 8*H, 4*H, 2*H, 1*H (each halving divides by x).
  Htable[8] = V;
  gcm_reduce_1bit(&V);
  Htable[4] = V;
  gcm_reduce_1bit(&V);
  Htable[2] = V;
  gcm_reduce_1bit(&V);
  Htable[1] = V;
  // Remaining entries are XOR (GF(2) addition) of the power-of-two entries.
  Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
  V = Htable[4];
  Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
  Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
  Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
  V = Htable[8];
  Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
  Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
  Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
  Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
  Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
  Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
  Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
  // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i]
  // contains the i'th byte of j*H for all j.
  uint8_t *Hbytes = (uint8_t *)Htable;
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < i; j++) {
      uint8_t tmp = Hbytes[16*i + j];
      Hbytes[16*i + j] = Hbytes[16*j + i];
      Hbytes[16*j + i] = tmp;
    }
  }
}
#endif // GHASH_ASM_X86_64 || GHASH_ASM_X86
#ifdef GCM_FUNCREF
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi, (ctx)->gcm_key.Htable)
#undef GHASH
#define GHASH(ctx, in, len) \
(*gcm_ghash_p)((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
#endif // GCM_FUNCREF
#if defined(HW_GCM) && defined(OPENSSL_X86_64)
// x86-64 bulk AES-GCM encrypt: thin adapter giving |aesni_gcm_encrypt| the
// common hw_gcm_* signature. Returns the number of bytes processed (the
// assembly routine may process fewer than |len|; callers handle the rest).
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
}
// x86-64 bulk AES-GCM decrypt counterpart of |hw_gcm_encrypt| above; same
// adapter pattern and same partial-processing contract.
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
}
#endif // HW_GCM && X86_64
#if defined(HW_GCM) && defined(OPENSSL_AARCH64)
// AArch64 bulk AES-GCM encrypt. Processes only whole 16-byte blocks
// (|len| rounded down) and returns the number of bytes consumed; the caller
// handles any tail. Dispatches between the 8x-unrolled kernels (per AES key
// size) and the generic 4x kernel. NOTE(review): the kernels are given
// |len_blocks * 8| — apparently a length in bits — confirm against the
// assembly interface before changing.
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (!len_blocks) {
    return 0;
  }
  // The 8x-unrolled assembly implementation starts outperforming
  // the 4x-unrolled one starting around input length of 256 bytes
  // in the case of the EVP API.
  // In the case of the AEAD API, it can be used for all input lengths
  // but we are not identifying which API calls the code below.
  if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) {
    switch(key->rounds) {
      case 10:
        aesv8_gcm_8x_enc_128(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 12:
        aesv8_gcm_8x_enc_192(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 14:
        aesv8_gcm_8x_enc_256(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      default:
        // The subsequent logic after returning can process
        // the input or return an error.
        return 0;
        break;
    }
  } else {
    aes_gcm_enc_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable);
  }
  return len_blocks;
}
// AArch64 bulk AES-GCM decrypt; mirrors |hw_gcm_encrypt| above: whole
// blocks only, returns bytes consumed, 8x kernels for len >= 256 when the
// CPU supports them, otherwise the 4x kernel. See the encrypt-side note
// about the |len_blocks * 8| (bit-length) argument.
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (!len_blocks) {
    return 0;
  }
  // The 8x-unrolled assembly implementation starts outperforming
  // the 4x-unrolled one starting around input length of 256 bytes
  // in the case of the EVP API.
  // In the case of the AEAD API, it can be used for all input lengths
  // but we are not identifying which API calls the code below.
  if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) {
    switch(key->rounds) {
      case 10:
        aesv8_gcm_8x_dec_128(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 12:
        aesv8_gcm_8x_dec_192(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 14:
        aesv8_gcm_8x_dec_256(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      default:
        // The subsequent logic after returning can process
        // the input or return an error.
        return 0;
        break;
    }
  } else {
    aes_gcm_dec_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable);
  }
  return len_blocks;
}
#endif // HW_GCM && AARCH64
// Trampolines for GCM function pointers to avoid delocator issues with adr
// on AArch64. Without these wrappers, the function pointer calculations
// may require PC-relative offsets outside the addressable range.
#if defined(GHASH_ASM_ARM)
// Trampoline for |gcm_gmult_v8| (see the delocator note above).
static inline void gcm_gmult_v8_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
  gcm_gmult_v8(Xi, Htable);
}
// Trampoline for |gcm_ghash_v8|.
static inline void gcm_ghash_v8_wrapper(uint8_t Xi[16], const u128 Htable[16],
                                        const uint8_t *inp, size_t len) {
  gcm_ghash_v8(Xi, Htable, inp, len);
}
// Trampoline for |gcm_gmult_neon|.
static inline void gcm_gmult_neon_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
  gcm_gmult_neon(Xi, Htable);
}
// Trampoline for |gcm_ghash_neon|.
static inline void gcm_ghash_neon_wrapper(uint8_t Xi[16], const u128 Htable[16],
                                          const uint8_t *inp, size_t len) {
  gcm_ghash_neon(Xi, Htable, inp, len);
}
#endif
// Selects and initializes the best available GHASH implementation for this
// CPU. Derives H = E(K, 0^128) from |gcm_key|, fills |out_table| via the
// chosen init routine, and sets |*out_mult|/|*out_hash| to the matching
// gmult/ghash functions. |*out_is_avx| is set to 1 only for the x86-64
// AVX/AVX-512 paths. Falls through to the portable no-asm implementation
// when no accelerated path applies.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                       u128 out_table[16], int *out_is_avx,
                       const uint8_t gcm_key[16]) {
  *out_is_avx = 0;
  // H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values.
  uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
                   CRYPTO_load_u64_be(gcm_key + 8)};
#if defined(GHASH_ASM_X86_64)
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fastest x86-64 path: VAES/VPCLMULQDQ/AVX-512.
  if (crypto_gcm_avx512_enabled()) {
    gcm_init_avx512(out_table, H);
    *out_mult = gcm_gmult_avx512;
    *out_hash = gcm_ghash_avx512;
    *out_is_avx = 1;
    return;
  }
#endif
  if (crypto_gcm_clmul_enabled()) {
    // CLMUL with AVX+MOVBE gets the AVX-optimized variant.
    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
      gcm_init_avx(out_table, H);
      *out_mult = gcm_gmult_avx;
      *out_hash = gcm_ghash_avx;
      *out_is_avx = 1;
      return;
    }
    gcm_init_clmul(out_table, H);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
  if (CRYPTO_is_SSSE3_capable()) {
    gcm_init_ssse3(out_table, H);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_X86)
  if (crypto_gcm_clmul_enabled()) {
    gcm_init_clmul(out_table, H);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
  if (CRYPTO_is_SSSE3_capable()) {
    gcm_init_ssse3(out_table, H);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_ARM)
  // ARM paths go through the wrapper trampolines (see note above them).
  if (gcm_pmull_capable()) {
    gcm_init_v8(out_table, H);
    *out_mult = gcm_gmult_v8_wrapper;
    *out_hash = gcm_ghash_v8_wrapper;
    return;
  }
  if (gcm_neon_capable()) {
    gcm_init_neon(out_table, H);
    *out_mult = gcm_gmult_neon_wrapper;
    *out_hash = gcm_ghash_neon_wrapper;
    return;
  }
#elif defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    gcm_init_p8(out_table, H);
    *out_mult = gcm_gmult_p8;
    *out_hash = gcm_ghash_p8;
    return;
  }
#endif
  // Portable constant-time fallback.
  gcm_init_nohw(out_table, H);
  *out_mult = gcm_gmult_nohw;
  *out_hash = gcm_ghash_nohw;
}
// Initializes |gcm_key| for use with |aes_key|: computes the GHASH key
// H = E(K, 0^128) by encrypting an all-zero block, then picks the GHASH
// implementation via |CRYPTO_ghash_init|. |use_hw_gcm_crypt| is enabled
// only when the block function is hardware AES and the platform's fused
// AES-GCM path is available (PMULL on AArch64, AVX on other builds).
void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key, const AES_KEY *aes_key,
                            block128_f block, int block_is_hwaes) {
  OPENSSL_memset(gcm_key, 0, sizeof(*gcm_key));
  gcm_key->block = block;
  uint8_t ghash_key[16];
  OPENSSL_memset(ghash_key, 0, sizeof(ghash_key));
  (*block)(ghash_key, ghash_key, aes_key);
  int is_avx;
  CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable, &is_avx,
                    ghash_key);
#if defined(OPENSSL_AARCH64) && defined(GHASH_ASM_ARM)
  gcm_key->use_hw_gcm_crypt = (gcm_pmull_capable() && block_is_hwaes) ? 1 : 0;
#else
  gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
#endif
}
// Resets the GCM context and absorbs the IV. A 12-byte IV is used directly
// as Y0 with a counter of 1 (the standard GCM fast path); any other length
// is GHASHed together with a 64-bit bit-length block to derive Y0. Also
// precomputes EK0 = E(K, Y0), needed for the final tag, and leaves the
// counter pre-incremented for the first data block.
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
                         const uint8_t *iv, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif
  OPENSSL_memset(&ctx->Yi, 0, sizeof(ctx->Yi));
  OPENSSL_memset(&ctx->Xi, 0, sizeof(ctx->Xi));
  ctx->len.aad = 0;
  ctx->len.msg = 0;
  ctx->ares = 0;
  ctx->mres = 0;
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // AVX-512 builds have a dedicated IV setup routine.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled()) {
    gcm_setiv_avx512(key, ctx, iv, len);
    return;
  }
#endif
  uint32_t ctr;
  if (len == 12) {
    // 96-bit IV: Y0 = IV || 0^31 || 1.
    OPENSSL_memcpy(ctx->Yi, iv, 12);
    ctx->Yi[15] = 1;
    ctr = 1;
  } else {
    // Other lengths: Y0 = GHASH(IV padded to blocks || 0^64 || len(IV) in bits).
    uint64_t len0 = len;
    while (len >= 16) {
      CRYPTO_xor16(ctx->Yi, ctx->Yi, iv);
      GCM_MUL(ctx, Yi);
      iv += 16;
      len -= 16;
    }
    if (len) {
      for (size_t i = 0; i < len; ++i) {
        ctx->Yi[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }
    uint8_t len_block[16];
    OPENSSL_memset(len_block, 0, 8);
    CRYPTO_store_u64_be(len_block + 8, len0 << 3);
    CRYPTO_xor16(ctx->Yi, ctx->Yi, len_block);
    GCM_MUL(ctx, Yi);
    ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  }
  // EK0 is XORed into the tag at finish time.
  (*ctx->gcm_key.block)(ctx->Yi, ctx->EK0, key);
  ++ctr;
  CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
}
// Absorbs additional authenticated data into the GHASH state. May be called
// multiple times, but only before any message data. Returns 0 if message
// processing has already begun or the total AAD length exceeds 2^61 bytes,
// 1 on success. Partial-block residue is tracked in |ctx->ares|.
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  if (ctx->len.msg != 0) {
    // The caller must have finished the AAD before providing other input.
    return 0;
  }
  // Enforce GCM's AAD length bound and reject 64-bit overflow.
  uint64_t alen = ctx->len.aad + len;
  if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
    return 0;
  }
  ctx->len.aad = alen;
  unsigned n = ctx->ares;
  if (n) {
    // Fill out the partial block left by a previous call.
    while (n && len) {
      ctx->Xi[n] ^= *(aad++);
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      // Still mid-block; nothing more to hash yet.
      ctx->ares = n;
      return 1;
    }
  }
  // Process a whole number of blocks.
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, aad, len_blocks);
    aad += len_blocks;
    len -= len_blocks;
  }
  // Process the remainder.
  if (len != 0) {
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // .../aws-lc/crypto/fipsmodule/modes/gcm.c:428:18: error: writing 1 byte into
    // a region of size 0 [-Werror=stringop-overflow=]
    // 428 |       ctx->Xi[i] ^= aad[i];
    //     |       ~~~~~~~~~~~^~~~~~~~~
    if (len > 16) {
      abort();
      return 0;
    }
    n = (unsigned int)len;
    for (size_t i = 0; i < len; ++i) {
      ctx->Xi[i] ^= aad[i];
    }
  }
  ctx->ares = n;
  return 1;
}
// Encrypts |len| bytes in CTR mode and folds the ciphertext into the GHASH
// state, one block at a time via |ctx->gcm_key.block| (no stream/ctr32
// kernel). Returns 0 if the total message length would exceed GCM's
// 2^36 - 32 byte bound, 1 otherwise. Partial-block keystream is carried in
// |ctx->EKi| and indexed by |ctx->mres|.
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const uint8_t *in, uint8_t *out, size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream from the previous call's partial block.
    while (n && len) {
      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Encrypt in GHASH_CHUNK strides, hashing each stride while it is still
  // warm in cache (see the GHASH_CHUNK comment above).
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;
    while (j) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      j -= 16;
    }
    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
    len -= GHASH_CHUNK;
  }
  // Remaining whole blocks, then a single GHASH over all of them.
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    while (len >= 16) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      len -= 16;
    }
    GHASH(ctx, out - len_blocks, len_blocks);
  }
  // Trailing partial block: keystream kept in EKi for the next call.
  if (len) {
    (*block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    while (len--) {
      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Decrypt counterpart of |CRYPTO_gcm128_encrypt|: the *ciphertext* is
// folded into GHASH (before/independent of the CTR XOR), so the hash is
// taken over the data as received. Same length bound and residue handling.
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream; hash the ciphertext byte before decrypting.
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi[n];
      ctx->Xi[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Chunked processing: hash the incoming ciphertext first, then decrypt.
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;
    GHASH(ctx, in, GHASH_CHUNK);
    while (j) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      j -= 16;
    }
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, in, len_blocks);
    while (len >= 16) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      len -= 16;
    }
  }
  // Trailing partial block.
  if (len) {
    (*block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi[n] ^= c;
      out[n] = c ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Like |CRYPTO_gcm128_encrypt| but uses a |ctr128_f| bulk counter-mode
// kernel (|stream|) for whole blocks, with optional fused hardware AES-GCM
// fast paths (AVX-512, HW_GCM) taken first when available. Returns 0 only
// on exceeding GCM's message length bound.
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fused AVX-512 path handles everything, including residue, internally.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled() && len > 0) {
    aes_gcm_encrypt_avx512(key, ctx, &ctx->mres, in, len, out);
    return 1;
  }
#endif
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream from the previous call's partial block.
    while (n && len) {
      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(HW_GCM)
  // Check |len| to work around a C language bug. See https://crbug.com/1019588.
  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
    // |hw_gcm_encrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                 ctx->gcm_key.Htable);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Stream-encrypt in GHASH_CHUNK strides, hashing each stride afterwards.
  while (len >= GHASH_CHUNK) {
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
    ctr += GHASH_CHUNK / 16;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    GHASH(ctx, out, GHASH_CHUNK);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    size_t j = len_blocks / 16;
    (*stream)(in, out, j, key, ctx->Yi);
    ctr += (unsigned int)j;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    in += len_blocks;
    len -= len_blocks;
    GHASH(ctx, out, len_blocks);
    out += len_blocks;
  }
  // Trailing partial block handled with the single-block function.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // .../aws-lc/crypto/fipsmodule/modes/gcm.c:688:18: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
    // 688 |       ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
    //     |       ^~
    if ((n + len) > 16) {
      abort();
      return 0;
    }
    while (len--) {
      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Decrypt counterpart of |CRYPTO_gcm128_encrypt_ctr32|: ciphertext is
// GHASHed before it is decrypted by the |stream| kernel. Same fused
// hardware fast paths and length bound.
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fused AVX-512 path handles everything, including residue, internally.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled() && len > 0) {
    aes_gcm_decrypt_avx512(key, ctx, &ctx->mres, in, len, out);
    return 1;
  }
#endif
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream; hash the ciphertext byte before decrypting.
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi[n];
      ctx->Xi[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(HW_GCM)
  // Check |len| to work around a C language bug. See https://crbug.com/1019588.
  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
    // |hw_gcm_decrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                 ctx->gcm_key.Htable);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Hash each chunk of ciphertext first, then stream-decrypt it.
  while (len >= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
    ctr += GHASH_CHUNK / 16;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    size_t j = len_blocks / 16;
    GHASH(ctx, in, len_blocks);
    (*stream)(in, out, j, key, ctx->Yi);
    ctr += (unsigned int)j;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    out += len_blocks;
    in += len_blocks;
    len -= len_blocks;
  }
  // Trailing partial block handled with the single-block function.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // aws-lc/crypto/fipsmodule/modes/gcm.c:785:18: error: writing 1 byte into a
    // region of size 0 [-Werror=stringop-overflow=]
    // 785 |       ctx->Xi[n] ^= c;
    //     |       ~~~~~~~~~~~^~~~
    if ((n + len) > 16) {
      abort();
      return 0;
    }
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi[n] ^= c;
      out[n] = c ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Finalizes the GCM computation: flushes any partial-block GHASH residue,
// hashes the 128-bit length block (AAD bits || message bits), XORs in
// EK0 = E(K, Y0) to produce the tag in |ctx->Xi|, and compares it against
// |tag| in constant time. Returns 1 iff the first |len| tag bytes match
// (and |tag| is non-NULL with |len| <= 16).
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif
  if (ctx->mres || ctx->ares) {
    GCM_MUL(ctx, Xi);
  }
  uint8_t len_block[16];
  // Lengths are stored in bits, hence the << 3.
  CRYPTO_store_u64_be(len_block, ctx->len.aad << 3);
  CRYPTO_store_u64_be(len_block + 8, ctx->len.msg << 3);
  CRYPTO_xor16(ctx->Xi, ctx->Xi, len_block);
  GCM_MUL(ctx, Xi);
  CRYPTO_xor16(ctx->Xi, ctx->Xi, ctx->EK0);
  if (tag && len <= sizeof(ctx->Xi)) {
    // Constant-time comparison to avoid leaking tag bytes via timing.
    return CRYPTO_memcmp(ctx->Xi, tag, len) == 0;
  } else {
    return 0;
  }
}
// Finalizes the context (with no tag comparison) and copies the computed
// tag out of |ctx->Xi|, truncated to at most 16 bytes.
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
  CRYPTO_gcm128_finish(ctx, NULL, 0);
  size_t copy_len = len;
  if (copy_len > sizeof(ctx->Xi)) {
    copy_len = sizeof(ctx->Xi);
  }
  OPENSSL_memcpy(tag, ctx->Xi, copy_len);
}
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// Returns 1 when the carry-less multiply (PCLMULQDQ) GHASH paths may be
// used: requires FXSR and PCLMUL support, and a build with the x86/x86-64
// GHASH assembly compiled in.
int crypto_gcm_clmul_enabled(void) {
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
  return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable();
#else
  return 0;
#endif
}
// Returns 1 when the VAES/AVX-512/VPCLMULQDQ fused AES-GCM implementation
// may be used. Requires an x86-64 build with a sufficiently new assembler
// and is currently disabled on Windows (see TODO below).
int crypto_gcm_avx512_enabled(void) {
  // This must align with ImplDispatchTest.AEAD_AES_GCM
#if defined(GHASH_ASM_X86_64) && \
    !defined(OPENSSL_WINDOWS) && \
    !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // TODO(awslc): remove the Windows guard once CryptoAlg-1701 is resolved.
  return (CRYPTO_is_VAES_capable() &&
          CRYPTO_is_AVX512_capable() &&
          CRYPTO_is_VPCLMULQDQ_capable());
#else
  return 0;
#endif
}
#endif

View File

@@ -0,0 +1,291 @@
// Copyright (c) 2019, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/base.h>
#include "../../internal.h"
#include "internal.h"
#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif
// This file contains a constant-time implementation of GHASH based on the notes
// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
// algorithm described in
// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
//
// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
// the 32-bit implementation, but we can use its intrinsics if necessary.
#if defined(BORINGSSL_HAS_UINT128)
// Constant-time 64x64 -> 128-bit carry-less (GF(2)[x]) multiplication using
// native uint128_t arithmetic. The 128-bit product of |a| and |b| is
// returned in |*out_lo| (low 64 bits) and |*out_hi| (high 64 bits).
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  // One term every four bits means the largest term is 64/4 = 16, which barely
  // overflows into the next term. Using one term every five bits would cost 25
  // multiplications instead of 16. It is faster to mask off the bottom four
  // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits
  // separately.
  uint64_t a0 = a & UINT64_C(0x1111111111111110);
  uint64_t a1 = a & UINT64_C(0x2222222222222220);
  uint64_t a2 = a & UINT64_C(0x4444444444444440);
  uint64_t a3 = a & UINT64_C(0x8888888888888880);
  uint64_t b0 = b & UINT64_C(0x1111111111111111);
  uint64_t b1 = b & UINT64_C(0x2222222222222222);
  uint64_t b2 = b & UINT64_C(0x4444444444444444);
  uint64_t b3 = b & UINT64_C(0x8888888888888888);
  uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^
                 (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1);
  uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^
                 (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2);
  uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^
                 (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3);
  uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^
                 (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0);
  // Multiply the bottom four bits of |a| with |b|.
  // Each mask is all-ones iff the corresponding low bit of |a| is set.
  uint64_t a0_mask = UINT64_C(0) - (a & 1);
  uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1);
  uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1);
  uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1);
  uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^
                    ((uint128_t)(a2_mask & b) << 2) ^
                    ((uint128_t)(a3_mask & b) << 3);
  // Each c_i is only valid in its own 4-bit stride; mask and merge.
  *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra);
  *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^
            ((uint64_t)(extra >> 64));
}
#elif defined(OPENSSL_SSE2)
// SSE2 variant: computes two interleaved carry-less 32x32 partial products
// at once via _mm_mul_epu32 and returns the combined 64-bit carryless
// product of |a| and |b| in the low half of the result register.
static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does not
  // overflow into the next term.
  __m128i aa = _mm_setr_epi32(a, 0, a, 0);
  __m128i bb = _mm_setr_epi32(b, 0, b, 0);
  // Lanes hold (stride-0, stride-2) of |a| against paired strides of |b|,
  // producing two of the four stride accumulators per multiply.
  __m128i a0a0 =
      _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
  __m128i a2a2 =
      _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
  __m128i b0b1 =
      _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
  __m128i b2b3 =
      _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));
  __m128i c0c1 =
      _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
  __m128i c2c3 =
      _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));
  __m128i a1a1 =
      _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
  __m128i a3a3 =
      _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
  __m128i b3b0 =
      _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
  __m128i b1b2 =
      _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));
  // Keep each accumulator's own 4-bit stride, then fold all four together.
  c0c1 = _mm_and_si128(
      c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
  c2c3 = _mm_and_si128(
      c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));
  c0c1 = _mm_xor_si128(c0c1, c2c3);
  // c0 ^= c1
  c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
  return c0c1;
}
// SSE2 variant of the 64x64 carry-less multiply: Karatsuba over three
// 32x32 carryless products, with the middle term shifted into place by a
// 32-bit (4-byte) lane shift.
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
  __m128i lo = gcm_mul32_nohw(a0, b0);
  __m128i hi = gcm_mul32_nohw(a1, b1);
  __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
  mid = _mm_xor_si128(mid, lo);
  mid = _mm_xor_si128(mid, hi);
  __m128i ret = _mm_unpacklo_epi64(lo, hi);
  // Shift the middle term up by 32 bits and mask it to the middle 64 bits
  // of the 128-bit result before folding it in.
  mid = _mm_slli_si128(mid, 4);
  mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
  ret = _mm_xor_si128(ret, mid);
  memcpy(out_lo, &ret, 8);
  memcpy(out_hi, ((char*)&ret) + 8, 8);
}
#else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2
// Constant-time carry-less (GF(2)[x]) multiplication of two 32-bit values,
// returning the full 64-bit carryless product. Each operand is split into
// four strided bit groups so every partial product has at most 32/4 = 8
// contributing one-bit terms, which cannot carry across a 4-bit stride.
static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
  const uint32_t s0 = 0x11111111;
  const uint32_t s1 = 0x22222222;
  const uint32_t s2 = 0x44444444;
  const uint32_t s3 = 0x88888888;
  uint64_t acc0 = ((uint64_t)(a & s0) * (b & s0)) ^
                  ((uint64_t)(a & s1) * (b & s3)) ^
                  ((uint64_t)(a & s2) * (b & s2)) ^
                  ((uint64_t)(a & s3) * (b & s1));
  uint64_t acc1 = ((uint64_t)(a & s0) * (b & s1)) ^
                  ((uint64_t)(a & s1) * (b & s0)) ^
                  ((uint64_t)(a & s2) * (b & s3)) ^
                  ((uint64_t)(a & s3) * (b & s2));
  uint64_t acc2 = ((uint64_t)(a & s0) * (b & s2)) ^
                  ((uint64_t)(a & s1) * (b & s1)) ^
                  ((uint64_t)(a & s2) * (b & s0)) ^
                  ((uint64_t)(a & s3) * (b & s3));
  uint64_t acc3 = ((uint64_t)(a & s0) * (b & s3)) ^
                  ((uint64_t)(a & s1) * (b & s2)) ^
                  ((uint64_t)(a & s2) * (b & s1)) ^
                  ((uint64_t)(a & s3) * (b & s0));
  // Each accumulator is only valid in its own stride; mask and merge.
  return (acc0 & UINT64_C(0x1111111111111111)) |
         (acc1 & UINT64_C(0x2222222222222222)) |
         (acc2 & UINT64_C(0x4444444444444444)) |
         (acc3 & UINT64_C(0x8888888888888888));
}
// Portable 64x64 -> 128-bit carry-less multiply built from three 32x32
// carryless products via Karatsuba; in GF(2) the sums are XORs, so the
// middle term is mid ^ lo ^ hi, folded in at a 32-bit offset.
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  const uint32_t a_lo = (uint32_t)a;
  const uint32_t a_hi = (uint32_t)(a >> 32);
  const uint32_t b_lo = (uint32_t)b;
  const uint32_t b_hi = (uint32_t)(b >> 32);
  // Karatsuba multiplication.
  const uint64_t lo = gcm_mul32_nohw(a_lo, b_lo);
  const uint64_t hi = gcm_mul32_nohw(a_hi, b_hi);
  uint64_t mid = gcm_mul32_nohw(a_lo ^ a_hi, b_lo ^ b_hi);
  mid ^= lo ^ hi;
  *out_lo = lo ^ (mid << 32);
  *out_hi = hi ^ (mid >> 32);
}
#endif // BORINGSSL_HAS_UINT128
// Prepares the no-assembly GHASH key: stores H (already byte-swapped in
// |Xi|) multiplied by x in POLYVAL's representation into Htable[0]. Only
// the first table entry is used by this implementation.
void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
  // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
  // avoids a shift by 1 in the multiplication, needed to account for bit
  // reversal losing a bit after multiplication, that is,
  // rev128(X) * rev128(Y) = rev255(X*Y).
  //
  // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
  // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
  //
  // See also slide 16 of
  // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
  Htable[0].lo = Xi[1];
  Htable[0].hi = Xi[0];
  // |carry| is all-ones iff the top bit shifts out, selecting the reduction.
  uint64_t carry = Htable[0].hi >> 63;
  carry = 0u - carry;
  // Multiply by x: shift the 128-bit value left by one.
  Htable[0].hi <<= 1;
  Htable[0].hi |= Htable[0].lo >> 63;
  Htable[0].lo <<= 1;
  // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
  // conditionally add 0xc200...0001.
  Htable[0].lo ^= carry & 1;
  Htable[0].hi ^= carry & UINT64_C(0xc200000000000000);
  // This implementation does not use the rest of |Htable|.
}
// One POLYVAL step: sets Xi = (Xi * H) * x^-128 mod the POLYVAL polynomial,
// all in constant time. |H| is Htable[0] as prepared by |gcm_init_nohw|.
static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) {
  // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
  // through |r3|. Note there is no byte or bit reversal because we are
  // evaluating POLYVAL.
  uint64_t r0, r1;
  gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo);
  uint64_t r2, r3;
  gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi);
  uint64_t mid0, mid1;
  gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo);
  mid0 ^= r0 ^ r2;
  mid1 ^= r1 ^ r3;
  // Fold the 128-bit middle term into the 256-bit product at a 64-bit offset.
  r2 ^= mid1;
  r1 ^= mid0;
  // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and
  // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We
  // have:
  //
  //       1 = x^121 + x^126 + x^127 + x^128
  //  x^-128 = x^-7 + x^-2 + x^-1 + 1
  //
  // This is the GHASH reduction step, but with bits flowing in reverse.
  // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
  // another reduction steps. Instead, we gather the excess bits, incorporate
  // them into |r0| and |r1| and reduce once. See slides 17-19
  // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
  r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);
  // 1
  r2 ^= r0;
  r3 ^= r1;
  // x^-1
  r2 ^= r0 >> 1;
  r2 ^= r1 << 63;
  r3 ^= r1 >> 1;
  // x^-2
  r2 ^= r0 >> 2;
  r2 ^= r1 << 62;
  r3 ^= r1 >> 2;
  // x^-7
  r2 ^= r0 >> 7;
  r2 ^= r1 << 57;
  r3 ^= r1 >> 7;
  Xi[0] = r2;
  Xi[1] = r3;
}
void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) {
  // GHASH uses the opposite byte order from POLYVAL, so byte-swap |Xi| into
  // little-endian word order, multiply by H, and swap back.
  uint64_t state[2];
  state[1] = CRYPTO_load_u64_be(Xi);
  state[0] = CRYPTO_load_u64_be(Xi + 8);
  gcm_polyval_nohw(state, &Htable[0]);
  CRYPTO_store_u64_be(Xi + 8, state[0]);
  CRYPTO_store_u64_be(Xi, state[1]);
}
void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                    size_t len) {
  // Work on the byte-swapped state, folding in one 16-byte block per
  // iteration; |len| is a multiple of 16 per the |ghash_func| contract.
  uint64_t state[2];
  state[0] = CRYPTO_load_u64_be(Xi + 8);
  state[1] = CRYPTO_load_u64_be(Xi);
  for (; len >= 16; inp += 16, len -= 16) {
    state[0] ^= CRYPTO_load_u64_be(inp + 8);
    state[1] ^= CRYPTO_load_u64_be(inp);
    gcm_polyval_nohw(state, &Htable[0]);
  }
  CRYPTO_store_u64_be(Xi, state[1]);
  CRYPTO_store_u64_be(Xi + 8, state[0]);
}

View File

@@ -0,0 +1,221 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <stdio.h>
#include <string.h>
#include <vector>
#include <gtest/gtest.h>
#include <openssl/aes.h>
#include "../../internal.h"
#include "../../test/abi_test.h"
#include "../../test/file_test.h"
#include "../../test/test_util.h"
#include "../aes/internal.h"
#include "../cpucap/internal.h"
#include "internal.h"
// Runs the file-based GCM known-answer tests: each vector is encrypted and
// checked against the expected ciphertext and tag, then round-tripped through
// decryption and tag verification.
TEST(GCMTest, TestVectors) {
  FileTestGTest("crypto/fipsmodule/modes/gcm_tests.txt", [](FileTest *t) {
    std::vector<uint8_t> key, plaintext, additional_data, nonce, ciphertext,
        tag;
    ASSERT_TRUE(t->GetBytes(&key, "Key"));
    ASSERT_TRUE(t->GetBytes(&plaintext, "Plaintext"));
    ASSERT_TRUE(t->GetBytes(&additional_data, "AdditionalData"));
    ASSERT_TRUE(t->GetBytes(&nonce, "Nonce"));
    ASSERT_TRUE(t->GetBytes(&ciphertext, "Ciphertext"));
    ASSERT_TRUE(t->GetBytes(&tag, "Tag"));
    // GCM is length-preserving, only AES key sizes are accepted, and the tag
    // in these vectors is always a full 16 bytes.
    ASSERT_EQ(plaintext.size(), ciphertext.size());
    ASSERT_TRUE(key.size() == 16 || key.size() == 24 || key.size() == 32);
    ASSERT_EQ(16u, tag.size());
    std::vector<uint8_t> out(plaintext.size());
    AES_KEY aes_key;
    ASSERT_EQ(0, AES_set_encrypt_key(key.data(), key.size() * 8, &aes_key));
    // Encrypt and compare both the ciphertext and the computed tag.
    GCM128_CONTEXT ctx;
    OPENSSL_memset(&ctx, 0, sizeof(ctx));
    CRYPTO_gcm128_init_key(&ctx.gcm_key, &aes_key, AES_encrypt, 0);
    CRYPTO_gcm128_setiv(&ctx, &aes_key, nonce.data(), nonce.size());
    if (!additional_data.empty()) {
      CRYPTO_gcm128_aad(&ctx, additional_data.data(), additional_data.size());
    }
    if (!plaintext.empty()) {
      CRYPTO_gcm128_encrypt(&ctx, &aes_key, plaintext.data(), out.data(),
                            plaintext.size());
    }
    std::vector<uint8_t> got_tag(tag.size());
    CRYPTO_gcm128_tag(&ctx, got_tag.data(), got_tag.size());
    EXPECT_EQ(Bytes(tag), Bytes(got_tag));
    EXPECT_EQ(Bytes(ciphertext), Bytes(out));
    // Decrypt with the same context, re-running setiv to reset the state, and
    // verify the tag via |CRYPTO_gcm128_finish|. Note the same (encryption)
    // key schedule is used for both directions.
    CRYPTO_gcm128_setiv(&ctx, &aes_key, nonce.data(), nonce.size());
    OPENSSL_memset(out.data(), 0, out.size());
    if (!additional_data.empty()) {
      CRYPTO_gcm128_aad(&ctx, additional_data.data(), additional_data.size());
    }
    if (!ciphertext.empty()) {
      CRYPTO_gcm128_decrypt(&ctx, &aes_key, ciphertext.data(), out.data(),
                            ciphertext.size());
    }
    ASSERT_TRUE(CRYPTO_gcm128_finish(&ctx, tag.data(), tag.size()));
    EXPECT_EQ(Bytes(plaintext), Bytes(out));
  });
}
// Sanity-checks the 32- and 64-bit byte-swap helpers with a distinctive
// ascending-byte pattern.
TEST(GCMTest, ByteSwap) {
  const uint32_t swapped32 = CRYPTO_bswap4(0x01020304u);
  EXPECT_EQ(0x04030201u, swapped32);
  const uint64_t swapped64 = CRYPTO_bswap8(UINT64_C(0x0102030405060708));
  EXPECT_EQ(UINT64_C(0x0807060504030201), swapped64);
}
#if defined(SUPPORTS_ABI_TEST) && !defined(OPENSSL_NO_ASM) && \
    !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
// Exercises every enabled assembly GHASH/GCM entry point under the ABI
// checker, which validates calling-convention discipline (and SEH unwind data
// on Windows), not cryptographic correctness.
TEST(GCMTest, ABI) {
  static const uint64_t kH[2] = {
      UINT64_C(0x66e94bd4ef8a2c3b),
      UINT64_C(0x884cfa59ca342b2e),
  };
  // Assorted block counts, including values around likely unrolling
  // boundaries.
  static const size_t kBlockCounts[] = {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 31, 32};
  uint8_t buf[16 * 32];
  OPENSSL_memset(buf, 42, sizeof(buf));
  uint8_t X[16] = {0x92, 0xa3, 0xb3, 0x60, 0xce, 0xda, 0x88, 0x03,
                   0x78, 0xfe, 0xb2, 0x71, 0xb9, 0xc2, 0x28, 0xf3};
  // |gcm_*_ssse3| require a 16-byte-aligned |Htable|.
  alignas(16) u128 Htable[16];
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
  if (CRYPTO_is_SSSE3_capable()) {
    CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
    CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
    }
  }
  if (crypto_gcm_clmul_enabled()) {
    CHECK_ABI_SEH(gcm_init_clmul, Htable, kH);
    CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI_SEH(gcm_ghash_clmul, X, Htable, buf, 16 * blocks);
    }
#if defined(GHASH_ASM_X86_64)
    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
      CHECK_ABI_SEH(gcm_init_avx, Htable, kH);
      CHECK_ABI_SEH(gcm_gmult_avx, X, Htable);
      for (size_t blocks : kBlockCounts) {
        CHECK_ABI_SEH(gcm_ghash_avx, X, Htable, buf, 16 * blocks);
      }
      if (hwaes_capable()) {
        AES_KEY aes_key;
        static const uint8_t kKey[16] = {0};
        uint8_t iv[16] = {0};
        aes_hw_set_encrypt_key(kKey, 128, &aes_key);
        // Each length is checked both as a whole number of blocks and with a
        // 7-byte tail.
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16, &aes_key, iv,
                        Htable, X);
          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16 + 7, &aes_key,
                        iv, Htable, X);
        }
        aes_hw_set_decrypt_key(kKey, 128, &aes_key);
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16, &aes_key, iv,
                        Htable, X);
          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16 + 7, &aes_key,
                        iv, Htable, X);
        }
      }
    }
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
    if (crypto_gcm_avx512_enabled()) {
      CHECK_ABI_SEH(gcm_init_avx512, Htable, kH);
      CHECK_ABI_SEH(gcm_gmult_avx512, X, Htable);
      for (size_t blocks : kBlockCounts) {
        CHECK_ABI_SEH(gcm_ghash_avx512, X, Htable, buf, 16 * blocks);
      }
      if (hwaes_capable()) {
        AES_KEY aes_key;
        static const uint8_t kKey[16] = {0};
        // aes_gcm_*_avx512 makes assumptions about |GCM128_CONTEXT|'s layout.
        GCM128_CONTEXT gcm;
        memset(&gcm, 0, sizeof(gcm));
        memcpy(&gcm.gcm_key.Htable, Htable, sizeof(Htable));
        memcpy(&gcm.Xi, X, sizeof(X));
        uint8_t iv[16] = {0};
        aes_hw_set_encrypt_key(kKey, 128, &aes_key);
        CHECK_ABI_SEH(gcm_setiv_avx512, &aes_key, &gcm, iv, sizeof(iv));
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aes_gcm_encrypt_avx512, &aes_key, &gcm, &gcm.mres, buf,
                        blocks * 16, buf);
        }
        aes_hw_set_decrypt_key(kKey, 128, &aes_key);
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aes_gcm_decrypt_avx512, &aes_key, &gcm, &gcm.mres, buf,
                        blocks * 16, buf);
        }
      }
    }
#endif  // !MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
#endif  // GHASH_ASM_X86_64
  }
#endif  // GHASH_ASM_X86 || GHASH_ASM_X86_64
#if defined(GHASH_ASM_ARM)
  if (gcm_neon_capable()) {
    CHECK_ABI(gcm_init_neon, Htable, kH);
    CHECK_ABI(gcm_gmult_neon, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_neon, X, Htable, buf, 16 * blocks);
    }
  }
  if (gcm_pmull_capable()) {
    CHECK_ABI(gcm_init_v8, Htable, kH);
    CHECK_ABI(gcm_gmult_v8, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_v8, X, Htable, buf, 16 * blocks);
    }
  }
#endif  // GHASH_ASM_ARM
#if defined(OPENSSL_AARCH64) && defined(HW_GCM)
  if (hwaes_capable() && gcm_pmull_capable()) {
    static const uint8_t kKey[256/8] = {0};
    uint8_t iv[16] = {0};
    for (size_t key_bits = 128; key_bits <= 256; key_bits += 64) {
      AES_KEY aes_key;
      aes_hw_set_encrypt_key(kKey, key_bits, &aes_key);
      // These kernels take the input length in bits, not bytes.
      CHECK_ABI(aes_gcm_enc_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key,
                Htable);
      CHECK_ABI(aes_gcm_dec_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key,
                Htable);
    }
  }
#endif
#if defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    CHECK_ABI(gcm_init_p8, Htable, kH);
    CHECK_ABI(gcm_gmult_p8, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_p8, X, Htable, buf, 16 * blocks);
    }
  }
#endif  // GHASH_ASM_PPC64LE
}
#endif  // SUPPORTS_ABI_TEST && !OPENSSL_NO_ASM && !MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX

View File

@@ -0,0 +1,456 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_MODES_INTERNAL_H
#define OPENSSL_HEADER_MODES_INTERNAL_H
#include <openssl/base.h>
#include <openssl/aes.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// The maximum permitted number of cipher blocks per data unit in XTS mode.
// Reference IEEE Std 1619-2018.
#define XTS_MAX_BLOCKS_PER_DATA_UNIT (1<<20)
// block128_f is the type of an AES block cipher implementation.
//
// Unlike upstream OpenSSL, it and the other functions in this file hard-code
// |AES_KEY|. It is undefined in C to call a function pointer with anything
// other than the original type. Thus we either must match |block128_f| to the
// type signature of |AES_encrypt| and friends or pass in |void*| wrapper
// functions.
//
// These functions are called exclusively with AES, so we use the former.
typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
const AES_KEY *key);
// CRYPTO_xor16 writes the bytewise XOR of |a| and |b| to |out|. Per the note
// below, the buffers may alias exactly but must not partially overlap.
OPENSSL_INLINE void CRYPTO_xor16(uint8_t out[16], const uint8_t a[16],
                                 const uint8_t b[16]) {
  // TODO(davidben): Ideally we'd leave this to the compiler, which could use
  // vector registers, etc. But the compiler doesn't know that |in| and |out|
  // cannot partially alias. |restrict| is slightly too strict (we allow exact
  // aliasing), but perhaps in-place could be a separate function?
  OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
                        block_cannot_be_evenly_divided_into_crypto_word_t)
  // Process one machine word at a time; the assert above guarantees 16 bytes
  // is a whole number of words.
  for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
    CRYPTO_store_word_le(
        out + i, CRYPTO_load_word_le(a + i) ^ CRYPTO_load_word_le(b + i));
  }
}
// CTR.
// ctr128_f is the type of a function that performs CTR-mode encryption.
typedef void (*ctr128_f)(const uint8_t *in, uint8_t *out, size_t blocks,
const AES_KEY *key, const uint8_t ivec[16]);
// CRYPTO_ctr128_encrypt encrypts (or decrypts, it's the same in CTR mode)
// |len| bytes from |in| to |out| using |block| in counter mode. There's no
// requirement that |len| be a multiple of any value and any partial blocks are
// stored in |ecount_buf| and |*num|, which must be zeroed before the initial
// call. The counter is a 128-bit, big-endian value in |ivec| and is
// incremented by this function.
void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
uint8_t ecount_buf[16], unsigned *num,
block128_f block);
// CRYPTO_ctr128_encrypt_ctr32 acts like |CRYPTO_ctr128_encrypt| but takes
// |ctr|, a function that performs CTR mode but only deals with the lower 32
// bits of the counter. This is useful when |ctr| can be an optimised
// function.
void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
uint8_t ecount_buf[16], unsigned *num,
ctr128_f ctr);
// GCM.
//
// This API differs from the upstream API slightly. The |GCM128_CONTEXT| does
// not have a |key| pointer that points to the key as upstream's version does.
// Instead, every function takes a |key| parameter. This way |GCM128_CONTEXT|
// can be safely copied. Additionally, |gcm_key| is split into a separate
// struct.
typedef struct { uint64_t hi,lo; } u128;
// gmult_func multiplies |Xi| by the GCM key and writes the result back to
// |Xi|.
typedef void (*gmult_func)(uint8_t Xi[16], const u128 Htable[16]);
// ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from
// |inp|. The result is written back to |Xi| and the |len| argument must be a
// multiple of 16.
typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);
// GCM128_KEY holds the precomputed GHASH state and function pointers for one
// GCM key. It contains no per-operation state and may be shared.
typedef struct gcm128_key_st {
  // |gcm_*_ssse3| require a 16-byte-aligned |Htable| when hashing data, but not
  // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in
  // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16.
  // TODO(crbug.com/boringssl/604): Revisit this.
  u128 Htable[16];   // Precomputed table derived from the hash key H.
  gmult_func gmult;  // Single-block GHASH multiply.
  ghash_func ghash;  // Multi-block GHASH over whole 16-byte blocks.
  block128_f block;  // Underlying block cipher (AES) function.
  // use_hw_gcm_crypt is true if this context should use platform-specific
  // assembly to process GCM data.
  unsigned use_hw_gcm_crypt:1;
} GCM128_KEY;
// GCM128_CONTEXT contains state for a single GCM operation. The structure
// should be zero-initialized before use.
typedef struct {
  // The following 5 names follow names in GCM specification
  uint8_t Yi[16];
  uint8_t EKi[16];
  uint8_t EK0[16];
  struct {
    uint64_t aad;  // Length of additional data processed so far.
    uint64_t msg;  // Length of plaintext/ciphertext processed so far.
  } len;
  uint8_t Xi[16];  // GHASH accumulator.
  // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
  // TODO(crbug.com/boringssl/604): Revisit this.
  alignas(16) GCM128_KEY gcm_key;
  // mres and ares track partial-block progress for message and AAD input,
  // respectively.
  unsigned mres, ares;
} GCM128_CONTEXT;
// XTS128_CONTEXT bundles the two keys and block functions XTS uses: the first
// pair processes data blocks and the second encrypts the tweak (see
// |CRYPTO_xts128_encrypt|).
typedef struct xts128_context {
  AES_KEY *key1, *key2;
  block128_f block1, block2;
} XTS128_CONTEXT;
// EVP_AES_XTS_CTX carries the two AES key schedules XTS requires (data key
// and tweak key) together with the generic XTS context.
typedef struct {
  union {
    double align;  // NOTE(review): union with double presumably for alignment — confirm.
    AES_KEY ks;
  } ks1, ks2;  // AES key schedules to use
  XTS128_CONTEXT xts;
} EVP_AES_XTS_CTX;
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is
// used.
int crypto_gcm_clmul_enabled(void);
// crypto_gcm_avx512_enabled returns one if the AVX512 VAES + VPCLMULQDQ
// implementation of GCM is used.
int crypto_gcm_avx512_enabled(void);
#endif
// CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to
// |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware
// accelerated) functions for performing operations in the GHASH field. If the
// AVX implementation was used |*out_is_avx| will be true.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
u128 out_table[16], int *out_is_avx,
const uint8_t gcm_key[16]);
// CRYPTO_gcm128_init_key initialises |gcm_key| to use |block| (typically AES)
// with the given key. |block_is_hwaes| is one if |block| is |aes_hw_encrypt|.
OPENSSL_EXPORT void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key,
const AES_KEY *key, block128_f block,
int block_is_hwaes);
// CRYPTO_gcm128_setiv sets the IV (nonce) for |ctx|. The |key| must be the
// same key that was passed to |CRYPTO_gcm128_init|.
OPENSSL_EXPORT void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *iv, size_t iv_len);
// CRYPTO_gcm128_aad sets the authenticated data for an instance of GCM.
// This must be called before any data is encrypted. It returns one on success
// and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad,
size_t len);
// CRYPTO_gcm128_encrypt encrypts |len| bytes from |in| to |out|. The |key|
// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one
// on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
const AES_KEY *key, const uint8_t *in,
uint8_t *out, size_t len);
// CRYPTO_gcm128_decrypt decrypts |len| bytes from |in| to |out|. The |key|
// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one
// on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
const AES_KEY *key, const uint8_t *in,
uint8_t *out, size_t len);
// CRYPTO_gcm128_encrypt_ctr32 encrypts |len| bytes from |in| to |out| using
// a CTR function that only handles the bottom 32 bits of the nonce, like
// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was
// passed to |CRYPTO_gcm128_init|. It returns one on success and zero
// otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, ctr128_f stream);
// CRYPTO_gcm128_decrypt_ctr32 decrypts |len| bytes from |in| to |out| using
// a CTR function that only handles the bottom 32 bits of the nonce, like
// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was
// passed to |CRYPTO_gcm128_init|. It returns one on success and zero
// otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, ctr128_f stream);
// CRYPTO_gcm128_finish calculates the authenticator and compares it against
// |len| bytes of |tag|. It returns one on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag,
size_t len);
// CRYPTO_gcm128_tag calculates the authenticator and copies it into |tag|.
// The minimum of |len| and 16 bytes are copied into |tag|.
OPENSSL_EXPORT void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t *tag,
size_t len);
// GCM assembly.
void gcm_init_nohw(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
#if !defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GCM_FUNCREF
void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
// 16-byte-aligned, but |gcm_init_ssse3| does not.
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_ssse3(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_ssse3(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
size_t len);
#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
#define GHASH_ASM_X86_64
void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_avx(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
size_t len);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
void gcm_init_avx512(u128 Htable[16], const uint64_t Xi[2]);
// |Xi| is the 16-byte GHASH state, matching the other gcm_gmult_*/gcm_ghash_*
// prototypes in this header. (The previous [2] bound was misleading; array
// bounds in C parameters are documentation only, so this is not an ABI
// change.)
void gcm_gmult_avx512(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_avx512(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
                      size_t len);
#endif
#define HW_GCM
size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
void gcm_setiv_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
const uint8_t *iv, size_t ivlen);
void aes_gcm_encrypt_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
unsigned *pblocklen, const uint8_t *in, size_t len,
uint8_t *out);
void aes_gcm_decrypt_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
unsigned *pblocklen, const uint8_t *in, size_t len,
uint8_t *out);
#endif // OPENSSL_X86_64 && !MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX
#if defined(OPENSSL_X86)
#define GHASH_ASM_X86
#endif // OPENSSL_X86
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#define GHASH_ASM_ARM
#define GCM_FUNCREF
OPENSSL_INLINE int gcm_pmull_capable(void) {
return CRYPTO_is_ARMv8_PMULL_capable();
}
void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); }
void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
#if defined(OPENSSL_AARCH64)
#define HW_GCM
// Note that in the argument list of the following functions,
// - the length is provided in bits (not bytes)
// - the order of arguments is different from that of |aesni_gcm_encrypt|.
// These functions are defined in aesv8-gcm-armv8.pl.
void aes_gcm_enc_kernel(const uint8_t *in, uint64_t in_bits, void *out,
void *Xi, uint8_t *ivec, const AES_KEY *key,
const u128 Htable[16]);
void aes_gcm_dec_kernel(const uint8_t *in, uint64_t in_bits, void *out,
void *Xi, uint8_t *ivec, const AES_KEY *key,
const u128 Htable[16]);
// These functions are defined in aesv8-gcm-armv8-unroll8.pl.
// They take input length in BITS and return number of BYTES processed.
size_t aesv8_gcm_8x_enc_128(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_128(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_enc_192(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_192(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_enc_256(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_256(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
#endif
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE
#define GCM_FUNCREF
void gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_p8(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_p8(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);
#endif
#endif // OPENSSL_NO_ASM
// CBC.
// cbc128_f is the type of a function that performs CBC-mode encryption.
typedef void (*cbc128_f)(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], int enc);
// CRYPTO_cbc128_encrypt encrypts |len| bytes from |in| to |out| using the
// given IV and block cipher in CBC mode. The input need not be a multiple of
// 128 bits long, but the output will round up to the nearest 128 bit multiple,
// zero padding the input if needed. The IV will be updated on return.
void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// CRYPTO_cbc128_decrypt decrypts |len| bytes from |in| to |out| using the
// given IV and block cipher in CBC mode. If |len| is not a multiple of 128
// bits then only that many bytes will be written, but a multiple of 128 bits
// is always read from |in|. The IV will be updated on return.
void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// OFB.
// CRYPTO_ofb128_encrypt encrypts (or decrypts, it's the same with OFB mode)
// |len| bytes from |in| to |out| using |block| in OFB mode. There's no
// requirement that |len| be a multiple of any value and any partial blocks are
// stored in |ivec| and |*num|, the latter must be zero before the initial
// call.
void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
block128_f block);
// CFB.
// CRYPTO_cfb128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB mode. There's no requirement that
// |len| be a multiple of any value and any partial blocks are stored in |ivec|
// and |*num|, the latter must be zero before the initial call.
void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
int enc, block128_f block);
// CRYPTO_cfb128_8_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB-8 mode. Prior to the first call
// |num| should be set to zero.
void CRYPTO_cfb128_8_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
unsigned *num, int enc, block128_f block);
// CRYPTO_cfb128_1_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB-1 mode. Prior to the first call
// |num| should be set to zero.
void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits,
const AES_KEY *key, uint8_t ivec[16],
unsigned *num, int enc, block128_f block);
size_t CRYPTO_cts128_encrypt_block(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// XTS.
// CRYPTO_xts128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using the given IV in XTS mode. There's no requirement
// that |len| be a multiple of any value.
size_t CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
const uint8_t iv[16], const uint8_t *inp,
uint8_t *out, size_t len, int enc);
// POLYVAL.
//
// POLYVAL is a polynomial authenticator that operates over a field very
// similar to the one that GHASH uses. See
// https://www.rfc-editor.org/rfc/rfc8452.html#section-3.
struct polyval_ctx {
uint8_t S[16];
// |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
// TODO(crbug.com/boringssl/604): Revisit this.
alignas(16) u128 Htable[16];
gmult_func gmult;
ghash_func ghash;
};
// CRYPTO_POLYVAL_init initialises |ctx| using |key|.
void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]);
// CRYPTO_POLYVAL_update_blocks updates the accumulator in |ctx| given the
// blocks from |in|. Only a whole number of blocks can be processed so |in_len|
// must be a multiple of 16.
void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in,
size_t in_len);
// CRYPTO_POLYVAL_finish writes the accumulator from |ctx| to |out|.
void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_MODES_INTERNAL_H

View File

@@ -0,0 +1,45 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
ofb_block_cannot_be_divided_into_size_t)
void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16], unsigned *num,
                           block128_f block) {
  assert(key != NULL && ivec != NULL && num != NULL);
  assert(len == 0 || (in != NULL && out != NULL));
  unsigned pos = *num;
  // First consume any keystream bytes left over in |ivec| from the previous
  // call.
  while (pos != 0 && len != 0) {
    *out++ = *in++ ^ ivec[pos];
    len--;
    pos = (pos + 1) % 16;
  }
  // Whole blocks: generate the next keystream block in place in |ivec| and
  // XOR it against the input.
  while (len >= 16) {
    (*block)(ivec, ivec, key);
    CRYPTO_xor16(out, in, ivec);
    in += 16;
    out += 16;
    len -= 16;
    pos = 0;
  }
  // Trailing partial block: generate one more keystream block and use only
  // its first |len| bytes; |pos| records how much was consumed for next time.
  if (len > 0) {
    (*block)(ivec, ivec, key);
    do {
      out[pos] = in[pos] ^ ivec[pos];
      pos++;
    } while (--len > 0);
  }
  *num = pos;
}

View File

@@ -0,0 +1,79 @@
// Copyright (c) 2016, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/base.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
#include "../../internal.h"
// byte_reverse reverses the order of the 16 bytes at |b|.
static void byte_reverse(uint8_t b[16]) {
  // Loading big-endian and storing little-endian reverses eight bytes at a
  // time; writing the halves crosswise reverses the whole block.
  const uint64_t first_half = CRYPTO_load_u64_be(b);
  const uint64_t second_half = CRYPTO_load_u64_be(b + 8);
  CRYPTO_store_u64_le(b, second_half);
  CRYPTO_store_u64_le(b + 8, first_half);
}
// reverse_and_mulX_ghash interprets |b| as a reversed element of the GHASH
// field, multiplies that by 'x' and serialises the result back into |b|, but
// with GHASH's backwards bit ordering.
static void reverse_and_mulX_ghash(uint8_t b[16]) {
  uint64_t hi = CRYPTO_load_u64_le(b);
  uint64_t lo = CRYPTO_load_u64_le(b + 8);
  // If the bit about to be shifted out is set, the reduction term must be
  // added back in. The check and selection below are constant-time, so no
  // branch or mask depends on the (secret) key material.
  const crypto_word_t carry = constant_time_eq_w(hi & 1, 1);
  // Shift the 128-bit value right by one bit, which is multiplication by 'x'
  // in this reversed representation.
  hi >>= 1;
  hi |= lo << 63;
  lo >>= 1;
  // 0xe1 << 56 is the GHASH reduction constant in this representation.
  lo ^= ((uint64_t) constant_time_select_w(carry, 0xe1, 0)) << 56;
  CRYPTO_store_u64_le(b, CRYPTO_bswap8(lo));
  CRYPTO_store_u64_le(b + 8, CRYPTO_bswap8(hi));
}
// POLYVAL(H, X_1, ..., X_n) =
// ByteReverse(GHASH(mulX_GHASH(ByteReverse(H)), ByteReverse(X_1), ...,
// ByteReverse(X_n))).
//
// See https://www.rfc-editor.org/rfc/rfc8452.html#appendix-A.
// CRYPTO_POLYVAL_init initialises |ctx| to compute POLYVAL with |key| by
// converting the key into GHASH's representation and delegating to the
// platform GHASH implementation.
void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]) {
  alignas(8) uint8_t H[16];
  OPENSSL_memcpy(H, key, 16);
  reverse_and_mulX_ghash(H);
  // |is_avx| is required by |CRYPTO_ghash_init|'s signature but its value is
  // not needed here.
  int is_avx;
  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, &is_avx, H);
  // The accumulator starts at zero.
  OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S));
}
void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in,
size_t in_len) {
assert((in_len & 15) == 0);
alignas(8) uint8_t buf[32 * 16];
while (in_len > 0) {
size_t todo = in_len;
if (todo > sizeof(buf)) {
todo = sizeof(buf);
}
OPENSSL_memcpy(buf, in, todo);
in += todo;
in_len -= todo;
size_t blocks = todo / 16;
for (size_t i = 0; i < blocks; i++) {
byte_reverse(buf + 16 * i);
}
ctx->ghash(ctx->S, ctx->Htable, buf, todo);
}
}
// CRYPTO_POLYVAL_finish serialises the accumulator into |out|, converting it
// back from GHASH's byte order to POLYVAL's.
void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]) {
  OPENSSL_memcpy(out, &ctx->S, 16);
  byte_reverse(out);
}

View File

@@ -0,0 +1,122 @@
// Copyright (c) 2011 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/evp.h>
#include <string.h>
#include <openssl/aes.h>
#include <openssl/cipher.h>
#include <openssl/err.h>
#include "internal.h"
#include "../../internal.h"
// CRYPTO_xts128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |inp| to |out| in XTS mode, using ciphertext stealing for lengths that
// are not a multiple of 16. Returns 1 on success and 0 if |len| is shorter
// than one block.
size_t CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
                             const uint8_t iv[16], const uint8_t *inp,
                             uint8_t *out, size_t len, int enc) {
  union {
    uint64_t u[2];
    uint8_t c[16];
  } tweak, scratch;
  unsigned int i;

  // XTS requires at least one full block.
  if (len < 16) return 0;

  // The initial tweak is the IV encrypted under the second key.
  OPENSSL_memcpy(tweak.c, iv, 16);
  (*ctx->block2)(tweak.c, tweak.c, ctx->key2);

  // When decrypting a non-block-multiple length, hold back the last full
  // block; it is processed together with the trailing partial block in the
  // ciphertext-stealing branch below.
  if (!enc && (len % 16)) len -= 16;

  while (len >= 16) {
    // One XTS block: XOR with the tweak, apply the cipher, XOR again.
    OPENSSL_memcpy(scratch.c, inp, 16);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out, scratch.c, 16);
    inp += 16;
    out += 16;
    len -= 16;

    if (len == 0) return 1;

    // Advance the tweak: multiply by x in GF(2^128), folding the carry back
    // with 0x87. The arithmetic right shift of the top word broadcasts its
    // sign bit, selecting the reduction without a branch. On big-endian
    // targets the union words must be loaded/stored explicitly.
    unsigned int carry, res;
#if defined(OPENSSL_BIG_ENDIAN)
    uint64_t tweak_u0, tweak_u1;
    tweak_u0 = CRYPTO_load_u64_le(&tweak.u[0]);
    tweak_u1 = CRYPTO_load_u64_le(&tweak.u[1]);
    res = 0x87 & (((int64_t)tweak_u1) >> 63);
    carry = (unsigned int)(tweak_u0 >> 63);
    tweak_u0 = (tweak_u0 << 1) ^ res;
    tweak_u1 = (tweak_u1 << 1) | carry;
    CRYPTO_store_u64_le(&tweak.u[0], tweak_u0);
    CRYPTO_store_u64_le(&tweak.u[1], tweak_u1);
#else
    res = 0x87 & (((int64_t)tweak.u[1]) >> 63);
    carry = (unsigned int)(tweak.u[0] >> 63);
    tweak.u[0] = (tweak.u[0] << 1) ^ res;
    tweak.u[1] = (tweak.u[1] << 1) | carry;
#endif
  }
  if (enc) {
    // Ciphertext stealing (encrypt): |scratch| still holds the previous
    // block's ciphertext. Emit its first |len| bytes as the final partial
    // output while padding the partial plaintext with the remainder, then
    // re-encrypt the combined block into the second-to-last position.
    for (i = 0; i < len; ++i) {
      uint8_t c = inp[i];
      out[i] = scratch.c[i];
      scratch.c[i] = c;
    }
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out - 16, scratch.c, 16);
  } else {
    // Ciphertext stealing (decrypt): the last full block must be processed
    // with the *next* tweak value (|tweak1|), computed the same way as above.
    union {
      uint64_t u[2];
      uint8_t c[16];
    } tweak1;
    unsigned int carry, res;
#if defined(OPENSSL_BIG_ENDIAN)
    uint64_t tweak_u0, tweak_u1;
    tweak_u0 = CRYPTO_load_u64_le(&tweak.u[0]);
    tweak_u1 = CRYPTO_load_u64_le(&tweak.u[1]);
    res = 0x87 & (((int64_t)tweak_u1) >> 63);
    carry = (unsigned int)(tweak_u0 >> 63);
    tweak_u0 = (tweak_u0 << 1) ^ res;
    tweak_u1 = (tweak_u1 << 1) | carry;
    CRYPTO_store_u64_le(&tweak1.u[0], tweak_u0);
    CRYPTO_store_u64_le(&tweak1.u[1], tweak_u1);
#else
    res = 0x87 & (((int64_t)tweak.u[1]) >> 63);
    carry = (unsigned int)(tweak.u[0] >> 63);
    tweak1.u[0] = (tweak.u[0] << 1) ^ res;
    tweak1.u[1] = (tweak.u[1] << 1) | carry;
#endif
    // Decrypt the held-back full block under |tweak1|.
    OPENSSL_memcpy(scratch.c, inp, 16);
    scratch.u[0] ^= tweak1.u[0];
    scratch.u[1] ^= tweak1.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak1.u[0];
    scratch.u[1] ^= tweak1.u[1];
    // Emit the tail of the plaintext and splice in the stolen ciphertext
    // bytes, then decrypt the reassembled block under the original tweak.
    for (i = 0; i < len; ++i) {
      uint8_t c = inp[16 + i];
      out[16 + i] = scratch.c[i];
      scratch.c[i] = c;
    }
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out, scratch.c, 16);
  }
  return 1;
}

File diff suppressed because it is too large Load Diff