chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,358 @@
#!/usr/bin/env perl
# Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
# July 2025
#
# Removed SHA3 variant, restricted assembly to core Keccak permutation.
#
######################################################################
# Numbers are cycles per processed byte.
#
# r=1088(*)
#
# Cortex-A53 13
# Cortex-A57 12
# X-Gene 14
# Mongoose 10
# Kryo 12
# Denver 7.8
# Apple A7 7.2
# ThunderX2 9.7
#
# (*) Corresponds to SHA3-256. No improvement coefficients are listed
# because they vary too much from compiler to compiler. Newer
# compiler does much better and improvement varies from 5% on
# Cortex-A57 to 25% on Cortex-A53. While in comparison to older
# compiler this code is at least 2x faster...
# File keccak1600-armv8.pl is imported from OpenSSL.
# https://github.com/openssl/openssl/blob/479b9adb88b9050186c1e9fc94879906f378b14b/crypto/sha/asm/keccak1600-armv8.pl
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Two mandatory command-line arguments: the assembler "flavour"
# (target dialect understood by arm-xlate.pl) and the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Unless the flavour is "void", pipe everything we print through the
# arm-xlate.pl translator (looked up next to this script first, then in
# the shared perlasm directory); "void" writes the raw source directly.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Rotate amounts for the Rho step, indexed [y][x] and used with "ror"
# below. NOTE(review): these appear to be 64 minus the standard Keccak
# rho left-rotate offsets (hence "sub-rho-rotates") -- verify against
# FIPS 202 if modifying.
my @subrhotates = ([ 64, 63, 2, 36, 37 ],
[ 28, 20, 58, 9, 44 ],
[ 61, 54, 21, 39, 25 ],
[ 23, 19, 49, 43, 56 ],
[ 46, 62, 3, 8, 50 ]);
# Static data emitted ahead of the code: 64 bytes of zero padding for
# alignment (the code later relies on address bits for loop
# termination) followed by the 24 Keccak round constants ("iotas")
# consumed one per round by KeccakF1600_int.
$code.=<<___;
#include <openssl/arm_arch.h>
.text
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
.quad 0,0,0,0,0,0,0,0
.type iotas_hw,%object
iotas_hw:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas_hw,.-iotas_hw
___
# Scalar code path: the whole 5x5x64-bit Keccak state is kept in
# general-purpose registers for the entire permutation.
{{{
# A[y][x] register map: x0..x24 in row-major order, except A[3][3]
# which is relocated to x25 because x18 is the reserved platform
# register.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
(0, 5, 10, 15, 20));
$A[3][3] = "x25"; # x18 is reserved
# Scratch registers for column parities (Theta) and Chi temporaries.
my @C = map("x$_", (26,27,28,30));
# KeccakF1600_int: inner permutation. Expects the state in @A and 32
# bytes of usable stack above sp; [sp,#16] holds the iotas pointer and
# the saved link register.
$code.=<<___;
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr $C[2],iotas_hw
stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
.Loop:
////////////////////////////////////////// Theta
eor $C[0],$A[0][0],$A[1][0]
stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
eor $C[1],$A[0][1],$A[1][1]
eor $C[2],$A[0][2],$A[1][2]
eor $C[3],$A[0][3],$A[1][3]
___
# While A[0][4]/A[1][4] sit on the stack their registers double as two
# extra scratch slots, C[4] and C[5].
$C[4]=$A[0][4];
$C[5]=$A[1][4];
$code.=<<___;
eor $C[4],$A[0][4],$A[1][4]
eor $C[0],$C[0],$A[2][0]
eor $C[1],$C[1],$A[2][1]
eor $C[2],$C[2],$A[2][2]
eor $C[3],$C[3],$A[2][3]
eor $C[4],$C[4],$A[2][4]
eor $C[0],$C[0],$A[3][0]
eor $C[1],$C[1],$A[3][1]
eor $C[2],$C[2],$A[3][2]
eor $C[3],$C[3],$A[3][3]
eor $C[4],$C[4],$A[3][4]
eor $C[0],$C[0],$A[4][0]
eor $C[2],$C[2],$A[4][2]
eor $C[1],$C[1],$A[4][1]
eor $C[3],$C[3],$A[4][3]
eor $C[4],$C[4],$A[4][4]
eor $C[5],$C[0],$C[2],ror#63
eor $A[0][1],$A[0][1],$C[5]
eor $A[1][1],$A[1][1],$C[5]
eor $A[2][1],$A[2][1],$C[5]
eor $A[3][1],$A[3][1],$C[5]
eor $A[4][1],$A[4][1],$C[5]
eor $C[5],$C[1],$C[3],ror#63
eor $C[2],$C[2],$C[4],ror#63
eor $C[3],$C[3],$C[0],ror#63
eor $C[4],$C[4],$C[1],ror#63
eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
eor $A[1][2],$A[1][2],$C[5]
eor $A[2][2],$A[2][2],$C[5]
eor $A[3][2],$A[3][2],$C[5]
eor $A[4][2],$A[4][2],$C[5]
eor $A[0][0],$A[0][0],$C[4]
eor $A[1][0],$A[1][0],$C[4]
eor $A[2][0],$A[2][0],$C[4]
eor $A[3][0],$A[3][0],$C[4]
eor $A[4][0],$A[4][0],$C[4]
___
# The borrowed registers return to holding state: A[0][4]/A[1][4] are
# re-loaded from the stack first thing in the next heredoc.
$C[4]=undef;
$C[5]=undef;
$code.=<<___;
ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
eor $A[1][3],$A[1][3],$C[2]
eor $A[2][3],$A[2][3],$C[2]
eor $A[3][3],$A[3][3],$C[2]
eor $A[4][3],$A[4][3],$C[2]
eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
eor $A[1][4],$A[1][4],$C[3]
eor $A[2][4],$A[2][4],$C[3]
eor $A[3][4],$A[3][4],$C[3]
eor $A[4][4],$A[4][4],$C[3]
////////////////////////////////////////// Rho+Pi
mov $C[3],$A[0][1]
ror $A[0][1],$A[1][1],#$subrhotates[1][1]
//mov $C[1],$A[0][2]
ror $A[0][2],$A[2][2],#$subrhotates[2][2]
//mov $C[0],$A[0][3]
ror $A[0][3],$A[3][3],#$subrhotates[3][3]
//mov $C[2],$A[0][4]
ror $A[0][4],$A[4][4],#$subrhotates[4][4]
ror $A[1][1],$A[1][4],#$subrhotates[1][4]
ror $A[2][2],$A[2][3],#$subrhotates[2][3]
ror $A[3][3],$A[3][2],#$subrhotates[3][2]
ror $A[4][4],$A[4][1],#$subrhotates[4][1]
ror $A[1][4],$A[4][2],#$subrhotates[4][2]
ror $A[2][3],$A[3][4],#$subrhotates[3][4]
ror $A[3][2],$A[2][1],#$subrhotates[2][1]
ror $A[4][1],$A[1][3],#$subrhotates[1][3]
ror $A[4][2],$A[2][4],#$subrhotates[2][4]
ror $A[3][4],$A[4][3],#$subrhotates[4][3]
ror $A[2][1],$A[1][2],#$subrhotates[1][2]
ror $A[1][3],$A[3][1],#$subrhotates[3][1]
ror $A[2][4],$A[4][0],#$subrhotates[4][0]
ror $A[4][3],$A[3][0],#$subrhotates[3][0]
ror $A[1][2],$A[2][0],#$subrhotates[2][0]
ror $A[3][1],$A[1][0],#$subrhotates[1][0]
ror $A[1][0],$C[0],#$subrhotates[0][3]
ror $A[2][0],$C[3],#$subrhotates[0][1]
ror $A[3][0],$C[2],#$subrhotates[0][4]
ror $A[4][0],$C[1],#$subrhotates[0][2]
////////////////////////////////////////// Chi+Iota
bic $C[0],$A[0][2],$A[0][1]
bic $C[1],$A[0][3],$A[0][2]
bic $C[2],$A[0][0],$A[0][4]
bic $C[3],$A[0][1],$A[0][0]
eor $A[0][0],$A[0][0],$C[0]
bic $C[0],$A[0][4],$A[0][3]
eor $A[0][1],$A[0][1],$C[1]
ldr $C[1],[sp,#16]
eor $A[0][3],$A[0][3],$C[2]
eor $A[0][4],$A[0][4],$C[3]
eor $A[0][2],$A[0][2],$C[0]
ldr $C[3],[$C[1]],#8 // Iota[i++]
bic $C[0],$A[1][2],$A[1][1]
tst $C[1],#255 // are we done?
str $C[1],[sp,#16]
bic $C[1],$A[1][3],$A[1][2]
bic $C[2],$A[1][0],$A[1][4]
eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
bic $C[3],$A[1][1],$A[1][0]
eor $A[1][0],$A[1][0],$C[0]
bic $C[0],$A[1][4],$A[1][3]
eor $A[1][1],$A[1][1],$C[1]
eor $A[1][3],$A[1][3],$C[2]
eor $A[1][4],$A[1][4],$C[3]
eor $A[1][2],$A[1][2],$C[0]
bic $C[0],$A[2][2],$A[2][1]
bic $C[1],$A[2][3],$A[2][2]
bic $C[2],$A[2][0],$A[2][4]
bic $C[3],$A[2][1],$A[2][0]
eor $A[2][0],$A[2][0],$C[0]
bic $C[0],$A[2][4],$A[2][3]
eor $A[2][1],$A[2][1],$C[1]
eor $A[2][3],$A[2][3],$C[2]
eor $A[2][4],$A[2][4],$C[3]
eor $A[2][2],$A[2][2],$C[0]
bic $C[0],$A[3][2],$A[3][1]
bic $C[1],$A[3][3],$A[3][2]
bic $C[2],$A[3][0],$A[3][4]
bic $C[3],$A[3][1],$A[3][0]
eor $A[3][0],$A[3][0],$C[0]
bic $C[0],$A[3][4],$A[3][3]
eor $A[3][1],$A[3][1],$C[1]
eor $A[3][3],$A[3][3],$C[2]
eor $A[3][4],$A[3][4],$C[3]
eor $A[3][2],$A[3][2],$C[0]
bic $C[0],$A[4][2],$A[4][1]
bic $C[1],$A[4][3],$A[4][2]
bic $C[2],$A[4][0],$A[4][4]
bic $C[3],$A[4][1],$A[4][0]
eor $A[4][0],$A[4][0],$C[0]
bic $C[0],$A[4][4],$A[4][3]
eor $A[4][1],$A[4][1],$C[1]
eor $A[4][3],$A[4][3],$C[2]
eor $A[4][4],$A[4][4],$C[3]
eor $A[4][2],$A[4][2],$C[0]
bne .Loop
ldr x30,[sp,#24]
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_int,.-KeccakF1600_int
.globl KeccakF1600_hw
.type KeccakF1600_hw,%function
.align 5
KeccakF1600_hw:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48
str x0,[sp,#32] // offload argument
mov $C[0],x0
ldp $A[0][0],$A[0][1],[x0,#16*0]
ldp $A[0][2],$A[0][3],[$C[0],#16*1]
ldp $A[0][4],$A[1][0],[$C[0],#16*2]
ldp $A[1][1],$A[1][2],[$C[0],#16*3]
ldp $A[1][3],$A[1][4],[$C[0],#16*4]
ldp $A[2][0],$A[2][1],[$C[0],#16*5]
ldp $A[2][2],$A[2][3],[$C[0],#16*6]
ldp $A[2][4],$A[3][0],[$C[0],#16*7]
ldp $A[3][1],$A[3][2],[$C[0],#16*8]
ldp $A[3][3],$A[3][4],[$C[0],#16*9]
ldp $A[4][0],$A[4][1],[$C[0],#16*10]
ldp $A[4][2],$A[4][3],[$C[0],#16*11]
ldr $A[4][4],[$C[0],#16*12]
bl KeccakF1600_int
ldr $C[0],[sp,#32]
stp $A[0][0],$A[0][1],[$C[0],#16*0]
stp $A[0][2],$A[0][3],[$C[0],#16*1]
stp $A[0][4],$A[1][0],[$C[0],#16*2]
stp $A[1][1],$A[1][2],[$C[0],#16*3]
stp $A[1][3],$A[1][4],[$C[0],#16*4]
stp $A[2][0],$A[2][1],[$C[0],#16*5]
stp $A[2][2],$A[2][3],[$C[0],#16*6]
stp $A[2][4],$A[3][0],[$C[0],#16*7]
stp $A[3][1],$A[3][2],[$C[0],#16*8]
stp $A[3][3],$A[3][4],[$C[0],#16*9]
stp $A[4][0],$A[4][1],[$C[0],#16*10]
stp $A[4][2],$A[4][3],[$C[0],#16*11]
str $A[4][4],[$C[0],#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_hw,.-KeccakF1600_hw
___
}}}
# Append the identification string, then expand any `...` arithmetic
# expressions and print the finished assembly on STDOUT (which may be
# the arm-xlate.pl pipe).
$code.=<<___;
.asciz "Keccak-1600 permutation for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
for my $line (split "\n", $code) {
    $line =~ s/\`([^\`]*)\`/eval($1)/ge;
    print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,718 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl size in bytes comp cycles[*] measured performance
# ====================================================================
# thumb 304 3212 4420
# armv4-small 392/+29% 1958/+64% 2250/+96%
# armv4-compact 740/+89% 1552/+26% 1840/+22%
# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
# ====================================================================
# thumb = same as 'small' but in Thumb instructions[**] and
# with recurring code in two private functions;
# small = detached Xload/update, loops are folded;
# compact = detached Xload/update, 5x unroll;
# large = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*] Manually counted instructions in "grand" loop body. Measured
# performance is affected by prologue and epilogue overhead,
# i-cache availability, branch penalties, etc.
# [**] While each Thumb instruction is twice smaller, they are not as
# diverse as ARM ones: e.g., there are only two arithmetic
# instructions with 3 arguments, no [fixed] rotate, addressing
# modes are limited. As result it takes more instructions to do
# the same job in Thumb, therefore the code is never twice as
# small and always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
# issue Cortex A8 core was measured to process input block in
# ~990 cycles.
# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code. Cortex-A15
# is even faster spending 5.6 cycles per byte outperforming integer-
# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
# Two mandatory command-line arguments: the assembler "flavour" and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Unless the flavour is "void", pipe everything we print through the
# arm-xlate.pl translator; "void" writes the raw source directly.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Register map for the integer-only code path.
$ctx="r0"; # SHA-1 state pointer (1st argument)
$inp="r1"; # input pointer (2nd argument)
$len="r2"; # number of 64-byte blocks (3rd argument; turned into an end pointer below)
$a="r3"; # working variables a..e
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8"; # current round constant
$t0="r9"; # scratch
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14"; # descending pointer into the on-stack X[] schedule
@V=($a,$b,$c,$d,$e);
# Emit the message-schedule update shared by rounds 16..79, interleaved
# with the common part of the round (E += K + ROR(A,27) + X[i]).
# NOTE(review): the #15/#13/#7/#2 word offsets relative to the
# descending $Xi pointer together with the ror#31 (i.e. rol#1) realize
# X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1) -- verify against
# FIPS 180-4 if modifying.
# $opt1/$opt2 are round-family-specific F-function instructions spliced
# into the stream; only BODY_40_59 supplies $opt2 (an absent one
# interpolates as an empty operand-less line).
sub Xupdate {
# Locals deliberately shadow the global register names: callers pass a
# rotated view of @V.
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}
# One round for i in 0..15: fetch the next big-endian message word
# (byte-by-byte on pre-ARMv7, where unaligned ldr is unsafe; a single
# ldr plus rev byte-swap on little-endian ARMv7+), push it onto the
# on-stack schedule via $Xi, and apply the Ch-style F_00_19 function.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
# Rounds 16..19: schedule update plus the Ch-style selection function
# F_00_19(B,C,D) = (B & (C^D)) ^ D.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
# Rounds 20..39 and 60..79: schedule update plus the parity function
# F_20_39(B,C,D) = B ^ C ^ D (Xupdate already computed C^D into $t1).
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}
# Rounds 40..59: schedule update plus the majority function, computed
# as (B & (C^D)) + (C & D) and accumulated into E in two adds.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}
# Integer-only entry point. The 80 rounds are folded: each 5x-unrolled
# loop body runs until $Xi wraps back to sp, and the 20_39/60_79 parity
# rounds share one loop distinguished by the carry flag (cleared by
# "cmn sp,#0", set by "cmp sp,#0").
$code=<<___;
#include <openssl/arm_arch.h>
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 5
sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
# Rounds 0..14 in three passes of this 5x-unrolled loop body.
for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
# Round 15 plus rounds 16..19, emitted straight-line.
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# Shared parity-round loop body (rounds 20..39 and 60..79).
for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp @ preserve carry
#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
# Majority-round loop body (rounds 40..59).
for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
# Scalar registers for the NEON path: K-table pointer, current X[i]+K
# word, two scratch regs, schedule transfer pointer, and the saved sp.
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4; # index of the schedule quad being produced (mod tracking)
# Sliding window over the NEON message schedule; @X[-4..−1] hold the
# four most recent quads.
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13"); # vector temporaries
my ($K,$zero)=("q14","q15"); # broadcast round constant and all-zero
my $j=0; # scalar round counter used by the body_* closures
# Any call to an undefined sub (e.g. &vadd_i32(...)) lands here: the
# missing sub's name becomes the mnemonic (underscores turned into
# dots) and the arguments become the operand list appended to $code.
# A purely numeric final argument is prefixed with '#' to form an
# immediate operand.
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg); # numeric => immediate
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Returns a list of perl snippets -- eval'd one at a time by the
# Xupdate_*/Xloop scaffolding so scalar rounds interleave with vector
# schedule work -- implementing one integer SHA-1 round for rounds
# 0..19 (Ch function via bic/and/eor); rotates @V and bumps $j after
# each round.
sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&bic ($t0,$d,$b)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t1,$c,$b)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$t1,$t0)', # F_00_19
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_00_19
'$j++; unshift(@V,pop(@V));'
)
}
# Same contract as body_00_19, but for the parity rounds 20..39 and
# 60..79 (F = b^c^d); skips the X[i]+K prefetch on the final round.
sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&eor ($t0,$b,$d)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
'&eor ($t1,$t0,$c)', # F_20_39
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_20_39
'$j++; unshift(@V,pop(@V));'
)
}
# Same contract as body_00_19, but for the majority rounds 40..59,
# computed as (c&d) + ((c^d)&b) in two separate adds into e.
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t0,$c,$d)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$c,$d)',
'&add ($e,$e,$t0)',
'&and ($t1,$t1,$b)',
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_40_59
'$j++; unshift(@V,pop(@V));'
)
}
# Emit the NEON schedule update producing one quad of X[16..31] while
# eval'ing sixteen interleaved scalar-round snippets drawn from $body
# (four invocations of it). The X[]+K result is streamed to the
# on-stack transfer area at $Xfer for the scalar rounds to consume.
sub Xupdate_16_31 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
&vshl_u32 (@Tx[1],@Tx[1],2);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
# Like Xupdate_16_31, but for rounds 32..79 where the whole-quad
# rotate-by-2 trick (shr 30 + sli 2) applies; interleaves sixteen
# scalar-round snippets from $body and streams X[]+K to $Xfer.
sub Xupdate_32_79 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@X[0],@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
# Emit the final X[]+K transfer of the block, then the preamble for the
# next block: rewind the K table, test for end of input (re-reading the
# last block rather than running past the buffer), and start loading
# and byte-swapping the next 64 bytes. Resets $Xi for the next block.
sub Xuplast_80 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
&sub ($Xfer,$Xfer,64);
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
&it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[-4&7],@X[-4&7]);
foreach (@insns) { eval; } # remaining instructions
$Xi=0;
}
# Rounds with no schedule update (start of the next block): byte-swap
# one freshly loaded quad, add K, and hand X[]+K to the scalar rounds
# through the transfer area, interleaved with four $body snippets.
sub Xloop()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
foreach (@insns) { eval; }
$Xi++;
}
# NEON entry point: allocate an aligned scratch area below sp for the
# X[]+K transfer, prime the first three quads, then run the 80 rounds
# via the Xupdate_*/Xuplast/Xloop generators above.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
# Rounds 0..79: schedule updates for 16..31 then 32..79, with the
# round-family bodies rotated through at the appropriate offsets; the
# tail (Xuplast/Xloop) overlaps the start of the next block's load.
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_32_79(\&body_00_19);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_20_39);
&Xuplast_80(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
it eq
moveq sp,$saved_sp
add $e,$e,$Ki
it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
# AArch32 view of the ARMv8 crypto extensions: state and temporaries
# live in q registers; sha1* mnemonics are later converted to raw
# encodings by unsha1() via the INST macro for old assemblers.
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7)); # four message-schedule quads
my @Kxx=map("q$_",(8..11)); # broadcast round constants
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xf,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.global sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 5
sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Quad-rounds 1..16: the sha1 variant cycles c/p/m/p per five
# iterations, the E and W registers ping-pong, and the @MSG window
# rotates; $j advances to the next K constant every five quad-rounds.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
# Final three quad-rounds and the accumulation back into the context.
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
# Base (ARM-mode, little-endian) encodings of the crypto-extension
# instructions; register numbers are OR'd in below.
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
# Convert a "sha1* qD[,qN],qM" line into a raw 32-bit encoding emitted
# through the INST() byte macro, so the code assembles even where the
# assembler lacks the crypto-extension mnemonics. NOTE(review): the
# shift/mask pairs place the split 4-bit register numbers into the
# NEON D/N/M fields -- verify against the Arm ARM if changing.
sub unsha1 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
# this fix-up provides Thumb encoding in conjunction with INST
$word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
# Post-process and emit: widen {qN[...]} addressing to d-register
# pairs, convert sha1* mnemonics to raw encodings, and translate
# ret/bx so the result still assembles as ARMv4.
for my $line (split $/, $code) {
    unless ($line =~ s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/e) {
        $line =~ s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/e;
    }
    $line =~ s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/ge;
    unless ($line =~ s/\bret\b/bx lr/) {
        # make it possible to compile with -march=armv4
        $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/;
    }
    print $line, $/;
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,349 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# hardware-assisted software(*)
# Apple A7 2.31 4.13 (+14%)
# Cortex-A53 2.24 8.03 (+97%)
# Cortex-A57 2.35 7.88 (+74%)
# Denver 2.13 3.97 (+0%)(**)
# X-Gene 8.80 (+200%)
# Mongoose 2.05 6.50 (+160%)
# Kryo 1.88 8.00 (+90%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
# optimizes compiler output at run-time.
# Two mandatory command-line arguments: the assembler "flavour" and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;

# Match the sibling perlasm scripts in this tree: a flavour of "void"
# writes the untranslated source straight to $output instead of
# unconditionally requiring arm-xlate.pl to be present. Any other
# flavour pipes our output through the translator as before.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}

# Argument registers and working set for the AArch64 code paths.
($ctx,$inp,$num)=("x0","x1","x2"); # state pointer, input pointer, block count
@Xw=map("w$_",(3..17,19)); # 32-bit views of the message schedule (x18 reserved, skipped)
@Xx=map("x$_",(3..17,19)); # 64-bit views of the same registers
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); # working variables
($t0,$t1,$t2,$K)=map("w$_",(25..28)); # scratch and current round constant
# One scalar round for i in 0..19 (Ch function via bic/and/orr).
# Message words arrive as 64-bit loads: even rounds expose the upper
# 32 bits of the previous load as the next word, odd rounds byte-swap
# it on little-endian (rotate on big-endian). From i==14 onwards the
# schedule update for X[(i+2)&15] is interleaved; at i==19 the next
# round constant K_20_39 (0x6ed9eba1) is materialized.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
# even i<15: split out the high half of the 64-bit load
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
# even i<14: start the next 64-bit load (offset relative to the
# post-incremented $inp)
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
# odd i<14: normalize the freshly split word to host order
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __AARCH64EB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
# i>=14: same round body with the schedule update interleaved
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
# One scalar round for i in 40..59: majority function computed as
# ((b|c)&d)|(b&c), with the schedule update interleaved. At i==59 the
# final round constant K_60_79 (0xca62c1d6) is preloaded.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
# One scalar round for i in 20..39 and 60..79: parity function b^c^d.
# At i==39 the round constant K_40_59 (0x8f1bbcdc) is preloaded; the
# last two rounds (78/79) drop the schedule update and instead begin
# loading the saved hash state for the final accumulation.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
# Integer-only AArch64 entry point: save callee-saved registers, then
# emit all 80 rounds fully unrolled via the BODY_* generators (the
# saved state loaded by rounds 78/79 lands in @Xw[1..5] for the final
# accumulation).
$code.=<<___;
#include <openssl/arm_arch.h>
.text
.globl sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 6
sha1_block_data_order_nohw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __AARCH64EB__
ror $Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
# Emit the 80 fully-unrolled rounds, rotating the working-variable
# view after each one.
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
# Hardware-assisted path using the ARMv8 SHA-1 crypto instructions
# (sha1c/sha1p/sha1m/sha1h/sha1su0/sha1su1).  Processes four message
# words per iteration; the round constants are loaded as vectors from
# .Lconst.
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
# Prologue, state/constant load, block-loop head and round 0.  Note that
# $j is still undefined here, so "@Kxx[$j]" interpolates as @Kxx[0].
$code.=<<___;
.globl sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 6
sha1_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adrp x4,:pg_hi21:.Lconst
add x4,x4,:lo12:.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Rounds 1..16: alternate the round flavour (c/p/m/p per group of five)
# and rotate the E0/E1, W0/W1 and @MSG register roles each iteration so
# no extra register copies are needed.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
# Tail rounds 17..19, state accumulation, loop close and the round
# constant table.
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
.section .rodata
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
}}}
{
# Base encodings of the ARMv8 SHA-1 instructions; the register fields
# (Rd, Rn, Rm) are OR-ed in by unsha1() below.
my %opcode = (
    "sha1c"   => 0x5e000000, "sha1p"   => 0x5e001000,
    "sha1m"   => 0x5e002000, "sha1su0" => 0x5e003000,
    "sha1h"   => 0x5e280800, "sha1su1" => 0x5e281800,
);

# Hand-assemble a "sha1*" mnemonic with q/v-register operands into a raw
# ".inst" directive, so the output assembles even when the assembler does
# not know the SHA extension.  Returns "" when the operands do not parse.
sub unsha1 {
    my ($mnemonic, $arg) = @_;

    return "" unless
        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/;

    # $rm is undef (hence 0 when shifted) for two-operand forms like sha1h.
    my ($rd, $rn, $rm) = ($1, $2, $3);
    my $word = $opcode{$mnemonic} | $rd | ($rn << 5) | ($rm << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
# Post-process the generated code line by line: evaluate `...` Perl
# expressions, hand-assemble sha1* mnemonics via unsha1(), strip the
# ".32"/".16b" type suffixes down to what plain assemblers accept, and
# print the result.
for my $line (split /\n/, $code) {
    $line =~ s/`([^`]*)`/eval($1)/ge;
    $line =~ s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/ge;
    if ($line =~ s/\.\w?32\b//) {
        $line =~ s/\.16b/.4s/g;
    }
    if ($line =~ m/(ld|st)1[^\[]+\[0\]/) {
        $line =~ s/\.4s/.s/g;
    }
    print $line, "\n";
}
# Flush buffered output and surface any write error.
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,744 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Command-line handling: the two mandatory arguments are the flavour and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# For a real flavour, pipe our output through arm-xlate.pl; otherwise
# write directly to the output file.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# NOTE: this relies on shell quoting of the interpolated paths; args are
# build-controlled, but the open is now checked for failure.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't pipe to $xlate: $!";
*STDOUT=*OUT;
} else {
# Three-arg open avoids mode injection via the file name and is checked.
open OUT,'>',$output or die "can't open $output: $!";
*STDOUT=*OUT;
}
# Register allocation.  $t0/$t1/$t3/$t4 alias the argument registers once
# those have been spilled; $Ktbl walks the K256 constant table.
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
# SHA-256 rotation/shift amounts for the Sigma/sigma functions.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
# One SHA-256 round for rounds 0..15 (also the round tail for 16..63 via
# BODY_16_XX).  For $i<16 it additionally emits the message-word load:
# a single rev'd ldr on ARMv7+, or four ldrb's assembled by hand on older
# cores.  Maj(a,b,c) from the previous round is folded in here
# ("from the past") to improve scheduling.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Message-word load + start of Sigma1(e); at $i==15 the input pointer is
# spilled so $t4 (r1) can be reused as a temporary.
$code.=<<___ if ($i<16);
#if __ARM_ARCH>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
# Round proper: h += X[i] + K[i] + Sigma1(e) + Ch(e,f,g); d += h;
# h += Sigma0(a); Maj is deferred to the next round.  At $i==31 the low
# byte of K (0xf2 is the last K256 byte) is tested as loop terminator.
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
# Swap the roles of $t2/$t3 so the deferred Maj value is picked up by the
# following round's "h+=Maj(a,b,c) from the past".
($t2,$t3)=($t3,$t2);
}
# SHA-256 rounds 16..63: compute the message-schedule word
# X[i] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i], interleaved
# with the start of the round, then fall through to BODY_00_15 for the
# round computation itself.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
# Shared round tail (h += X[i]+K[i]+Sigma1(e)+Ch; d += h; etc.).
&BODY_00_15(@_);
}
# File preamble, the K256 constant table (with a zero terminator used by
# the round-32 "done?" test), and the integer-only entry point.
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
.align 5
.global sha256_block_data_order_nohw
.type sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
# Rounds 0..15 emitted straight-line; rounds 16..31 form the body of
# .Lrounds_16_xx, which loops four times (once per 16 schedule words).
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# After the last iteration: reload ctx/inp, accumulate the eight state
# words, store them, and loop to the next block (or unwind the frame).
$code.=<<___;
#if __ARM_ARCH>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
#
{{{
# NEON register map: @X holds the current 16 message words, $T0..$T5 are
# schedule temporaries ($T4/$T5 are D-register halves).
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# $Xfer (aliasing $t4/r1) walks the on-stack X[]+K[] transfer area;
# $j counts rounds for body_00_15 below.
my $Xfer=$t4;
my $j=0;
# Map a NEON quad register name "qN" to the D register holding its low
# (Dlo) or high (Dhi) 64 bits; yields "" when the argument is not a q-reg.
sub Dlo() { my $q = shift; return $q =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : ""; }
sub Dhi() { my $q = shift; return $q =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : ""; }
# Any call to an undefined sub like &vext_8(...) lands here: the sub name
# becomes the mnemonic ("_" -> "."), a bare numeric last argument gets a
# "#" immediate prefix, and the instruction text is appended to $code.
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# NEON message-schedule update for four words (X[0..3] += sigma0/sigma1
# terms), interleaved two-instructions-at-a-time with four scalar round
# bodies produced by $body (body_00_15).  sigma rotates are synthesized
# from vshr+vsli pairs since NEON has no vector rotate.
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
# "Rotate" the X[] register assignment for the next group of four words.
push(@X,shift(@X));
}
# Final 16 rounds: no schedule update needed, just byte-swap the freshly
# loaded message words, add the next K constants and stage X+K on the
# stack, again interleaved with four scalar round bodies.
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
# "Rotate" the X[] register assignment for the next group.
push(@X,shift(@X));
}
# Return the scalar SHA-256 round as a list of stringified Perl snippets,
# one instruction each, so Xupdate/Xpreload can interleave them with NEON
# code via eval.  X[i]+K[i] comes pre-summed from the stack ($t1); Maj is
# deferred one round, as in the integer path.  $j selects the source of
# the next $t1 load (stack word, K terminator at 15, saved ctx at 31).
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
# NEON entry point: set up an aligned on-stack X[]+K[] transfer area,
# load and byte-swap the first block, prime the first four X+K groups,
# then run the 48 schedule-updating rounds and 16 preload rounds.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.LK256_shortcut_neon:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add_neon+4)
#else
.word K256-(.LK256_add_neon+8)
#endif

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
@ K256 is just at the boundary of being easily referenced by an ADR from
@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
@ not fit. By moving code around, we could make it fit, but this is too
@ fragile. For simplicity, just load the offset from
@ .LK256_shortcut_neon.
@
@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
@ support it. We might be able to emulate it with a macro, but Android's
@ did not work when I tried it.
@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
ldr $Ktbl,.LK256_shortcut_neon
.LK256_add_neon:
add $Ktbl,pc,$Ktbl
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48

.align 4
.L_00_48:
___
# Each Xupdate emits 16 rounds' worth of schedule updates interleaved
# with scalar round code; looped until the K256 zero terminator is seen.
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48

ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
# Final 16 rounds while the next block's words are byte-swapped/staged.
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
# Accumulate state, store it, and either loop to the next block or
# restore the original stack pointer and return.
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}

ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48

ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
# Hardware path using the ARMv8 crypto-extension sha256h/sha256h2/
# sha256su0/sha256su1 instructions, hand-encoded via unsha256() and the
# INST() macro so pre-v8 assemblers can still build this file.
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

# Entry point: locate K256, load the state, then per block load/byte-swap
# the 16 message words and offload the state for later accumulation.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.LK256_shortcut_hw:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add_hw+4)
#else
.word K256-(.LK256_add_hw+8)
#endif

.global sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 5
sha256_block_data_order_hw:
@ K256 is too far to reference from one ADR command in Thumb mode. In
@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
@ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
ldr $Ktbl,.LK256_shortcut_hw
.LK256_add_hw:
add $Ktbl,pc,$Ktbl

vld1.32 {$ABCD,$EFGH},[$ctx]
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8

.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
# Twelve schedule-updating quad-rounds; W0/W1 and the @MSG registers are
# rotated each iteration.
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
# Final four quad-rounds (no more schedule updates), K pointer rewind,
# state accumulation and loop close.
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8

vst1.32 {$ABCD,$EFGH},[$ctx]

ret @ bx lr
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
# Trailing identification string appended to the generated assembly.
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Echo this script's own leading comment block into the output, turning
# "#" comments into "@" assembly comments; stop at the first line that is
# neither a comment nor blank.  Was a bareword 2-arg open with no error
# check; use a checked 3-arg open on a lexical handle instead.
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close $self;
{
# Base encodings of the NEON SHA-256 crypto instructions; the q-register
# fields are OR-ed in by unsha256() below.
my %opcode = (
    "sha256h"   => 0xf3000c40, "sha256h2"  => 0xf3100c40,
    "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40,
);

# Hand-assemble a "sha256*" mnemonic with q-register operands into an
# INST(...) byte list.  ARMv7 instructions are always encoded
# little-endian; the correct solution would be the .inst directive, but
# older assemblers do not implement it, hence raw bytes.  Returns "" when
# the operands do not parse.
sub unsha256 {
    my ($mnemonic, $arg) = @_;

    return "" unless $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/;

    # $n is undef (contributing 0) for the two-operand sha256su0 form.
    my ($d, $n, $m) = ($1, $2, $3);
    my $word = $opcode{$mnemonic}
             | (($d & 7) << 13) | (($d & 8) << 19)
             | (($n & 7) << 17) | (($n & 8) << 4)
             | (($m & 7) << 1)  | (($m & 8) << 2);
    my @bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);
    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t\@ %s %s",
                   @bytes, $mnemonic, $arg;
}
}
# Post-process the generated code line by line: evaluate `...` Perl
# expressions, hand-assemble sha256* mnemonics via unsha256(), and map
# "ret"/"bx lr" so the result also compiles with -march=armv4.
for my $line (split $/, $code) {
    $line =~ s/`([^`]*)`/eval $1/ge;
    $line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;
    unless ($line =~ s/\bret\b/bx lr/g) {
        $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;
    }
    print $line, "\n";
}
# Enforce flush and surface any write error.
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,922 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# May 2013.
#
# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm SIMD(*) x86_64(**)
# Pentium 100 97 61 - -
# PIII 75 77 56 - -
# P4 116 95 82 34.6 30.8
# AMD K8 54 55 36 20.7 9.57
# Core2 66 57 40 15.9 9.97
# Westmere 70 - 38 12.2 9.58
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
# Skylake 40 - 26 13.3 7.25
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
# Goldmont 80 - 48 19.5 12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
# (***) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that new code optimizes amount of writes, but at the
# cost of increased data cache "footprint" by 1/2KB.
# The first two arguments should always be the flavour and output file path.
# Command-line handling and perlasm bootstrap for the x86 SHA-512 module.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=$ARGV[1];
# Three-arg open avoids mode injection via the file name and is checked
# (was an unchecked 2-arg open).
open STDOUT,'>',$output or die "can't open $output: $!";
&asm_init($ARGV[0]);
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
# x86 (IALU) path keeps the 64-bit state as lo/hi 32-bit stack halves;
# T is the scratch slot at the top of the frame.
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
$K512="ebp";
# SSE2 path keeps the state as full quadwords on the stack.
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on on-demand basis...
$BxC="mm2"; # ... except for B^C
# One SSE2/MMX SHA-512 round, modulo-scheduled over three phases:
# phase 0 is the first round of a block (f and g still to be loaded),
# phase 1 the steady state, phase 2 the tail variant that rotates the
# $A/$BxC register roles instead of moving h=T1.  a and e live in $A/$E;
# $BxC caches b^c for Maj(); the rest of the state sits on the stack in
# $Asse2..$Hsse2, and X[i] arrives in mm7.
sub BODY_00_15_sse2 {
my $phase=shift;
#&movq ("mm5",$Fsse2); # load f
#&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
&pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&pand ("mm5",$E); # f&=e
&psllq ($E,23); # $E is sliding left
&movq ($A,"mm3") if ($phase<2);
# This statement previously lacked its trailing semicolon and only parsed
# because the next line's leading "&" was consumed as the bitwise-AND
# operator; terminate it explicitly.
&movq (&QWP(8*9,"esp"),"mm7"); # save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm5","mm6"); # Ch(e,f,g)
&pxor ("mm3",$E);
&psllq ($E,23);
&pxor ("mm3","mm1");
&movq ($Asse2,$A); # modulo-scheduled save a
&paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
&pxor ("mm3",$E);
&psrlq ("mm1",23);
&paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
&psllq ($E,4);
&paddq ("mm7",&QWP(0,$K512)); # X[i]+=K512[i]
&pxor ("mm3",$E); # T1=Sigma1_512(e)
&movq ($E,$Dsse2); # e = load d, e in next round
&paddq ("mm3","mm7"); # T1+=X[i]
&movq ("mm5",$A); # %mm5 is sliding right
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # d += T1
&movq ("mm6",$A); # %mm6 is sliding left
&movq ("mm7","mm5");
&psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
&psrlq ("mm5",6);
&pxor ("mm7","mm6");
&sub ("esp",8);
&psllq ("mm6",5);
&pxor ("mm7","mm5");
&pxor ($A,"mm1"); # a^b, b^c in next round
&psrlq ("mm5",5);
&pxor ("mm7","mm6");
&pand ($BxC,$A); # (b^c)&(a^b)
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&pxor ($BxC,"mm1"); # [h=]Maj(a,b,c)
&pxor ("mm6","mm7"); # Sigma0_512(a)
&movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch
&movq ("mm5",$Fsse2) if ($phase==0); # load f
if ($phase>1) {
&paddq ($BxC,"mm6"); # h+=Sigma0(a)
&add ($K512,8);
#&paddq ($BxC,"mm3"); # h+=T1
($A,$BxC) = ($BxC,$A); # rotate registers
} else {
&paddq ("mm3",$BxC); # T1+=Maj(a,b,c)
&movq ($BxC,$A);
&add ($K512,8);
&paddq ("mm3","mm6"); # T1+=Sigma0(a)
&movq ("mm6",$Gsse2) if ($phase==0); # load g
#&movq ($A,"mm3"); # h=T1
}
}
# One SHA-512 round in pure 32-bit x86: each 64-bit quantity is handled
# as a lo/hi register pair (eax:ebx as the running value, esi/edi and
# ecx/edx as shift sources).  Computes T1 = Sigma1(e)+h+X[0]+K[i]+Ch,
# d += T1, then T1 += Sigma0(a)+Maj(a,b,c), leaving T1 in $Tlo/$Thi.
sub BODY_00_15_x86 {
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
# HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
&mov ("ecx",$Elo);
&mov ("edx",$Ehi);
&mov ("esi","ecx");
&shr ("ecx",9); # lo>>9
&mov ("edi","edx");
&shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
&shl ("edi",14); # hi<<14
&xor ("ebx","esi");
&shr ("ecx",14-9); # lo>>14
&xor ("eax","edi");
&shr ("edx",14-9); # hi>>14
&xor ("eax","ecx");
&shl ("esi",18-14); # lo<<18
&xor ("ebx","edx");
&shl ("edi",18-14); # hi<<18
&xor ("ebx","esi");
&shr ("ecx",18-14); # lo>>18
&xor ("eax","edi");
&shr ("edx",18-14); # hi>>18
&xor ("eax","ecx");
&shl ("esi",23-18); # lo<<23
&xor ("ebx","edx");
&shl ("edi",23-18); # hi<<23
&xor ("eax","esi");
&xor ("ebx","edi"); # T1 = Sigma1(e)
# T1 += h + X[0] + K[i] + Ch(e,f,g); Ch is computed as ((f^g)&e)^g.
&mov ("ecx",$Flo);
&mov ("edx",$Fhi);
&mov ("esi",$Glo);
&mov ("edi",$Ghi);
&add ("eax",$Hlo);
&adc ("ebx",$Hhi); # T1 += h
&xor ("ecx","esi");
&xor ("edx","edi");
&and ("ecx",$Elo);
&and ("edx",$Ehi);
&add ("eax",&DWP(8*(9+15)+0,"esp"));
&adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
&xor ("ecx","esi");
&xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
&mov ("esi",&DWP(0,$K512));
&mov ("edi",&DWP(4,$K512)); # K[i]
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Ch(e,f,g)
&mov ("ecx",$Dlo);
&mov ("edx",$Dhi);
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += K[i]
&mov ($Tlo,"eax");
&mov ($Thi,"ebx"); # put T1 away
&add ("eax","ecx");
&adc ("ebx","edx"); # d += T1
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
# HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ($Dlo,"eax");
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
&shr ("ecx",2); # lo>>2
&mov ("edi","edx");
&shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
&shl ("edi",4); # hi<<4
&xor ("ebx","esi");
&shr ("ecx",7-2); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-2); # hi>>7
&xor ("ebx","ecx");
&shl ("esi",25-4); # lo<<25
&xor ("eax","edx");
&shl ("edi",25-4); # hi<<25
&xor ("eax","esi");
&shr ("ecx",28-7); # lo>>28
&xor ("ebx","edi");
&shr ("edx",28-7); # hi>>28
&xor ("eax","ecx");
&shl ("esi",30-25); # lo<<30
&xor ("ebx","edx");
&shl ("edi",30-25); # hi<<30
&xor ("eax","esi");
&xor ("ebx","edi"); # Sigma0(a)
# T1 += Sigma0(a) + Maj(a,b,c); Maj computed as ((a|b)&c)|(a&b).
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ("esi",$Blo);
&mov ("edi",$Bhi);
&add ("eax",$Tlo);
&adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
&or ("ecx","esi");
&or ("edx","edi");
&and ("ecx",$Clo);
&and ("edx",$Chi);
&and ("esi",$Alo);
&and ("edi",$Ahi);
&or ("ecx","esi");
&or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Maj(a,b,c)
&mov ($Tlo,"eax");
&mov ($Thi,"ebx");
# Pre-fetch the low byte of *K (terminator test elsewhere), grow the
# sliding X[] frame by one quadword and advance the K pointer.
&mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
&sub ("esp",8);
&lea ($K512,&DWP(8,$K512)); # K++
}
# sha512_block_data_order(SHA512_CTX *ctx, const void *inp, size_t num):
# prologue and run-time dispatch.  Probes OPENSSL_ia32cap_P: no SSE2 ->
# .loop_x86; SSE2 but not (XMM && SSSE3) -> .loop_sse2; otherwise -> .SSSE3.
# The frame stores ctx, inp, inp+num*128 (end pointer) and the original esp
# before esp is aligned to 64 bytes.
&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K512);
&lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",7);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&mov ("ecx",&DWP(0,"edx"));
&test ("ecx",1<<26);
&jz (&label("loop_x86"));
&mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
&and ("ecx",1<<24); # XMM registers availability
&movq ("mm1",&QWP(8,"esi"));
&and ("edx",1<<9); # SSSE3 bit
&movq ($BxC,&QWP(16,"esi"));
&or ("ecx","edx");
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
&cmp ("ecx",1<<24|1<<9);
&je (&label("SSSE3"));
&sub ("esp",8*10);
&jmp (&label("loop_sse2"));
# SSE2/MMX path: one 128-byte block per iteration.  B^C ("magic") is kept
# pre-xored in $BxC so Maj() needs fewer operations per round.
&set_label("loop_sse2",16);
#&movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
&movq ($Csse2,$BxC);
&movq ($Dsse2,"mm3");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&movq ("mm3",$A); # magic
&mov ("eax",&DWP(0,"edi"));
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&mov ("edx",15); # counter
&bswap ("eax");
&bswap ("ebx");
&jmp (&label("00_14_sse2"));
# rounds 0-14: input word is byte-swapped in integer regs, the next one is
# loaded a round ahead to hide latency
&set_label("00_14_sse2",16);
&movd ("mm1","eax");
&mov ("eax",&DWP(0,"edi"));
&movd ("mm7","ebx");
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2();
&dec ("edx");
&jnz (&label("00_14_sse2"));
# round 15 (phase 1: no more input pre-fetch)
&movd ("mm1","eax");
&movd ("mm7","ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2(1);
&pxor ($A,$A); # A is in %mm3
&mov ("edx",32); # counter
&jmp (&label("16_79_sse2"));
# SSE2 rounds 16-79: message schedule (sigma0/sigma1) computed in MMX regs,
# interleaved with the round body.  2x unrolled, loop counter counts 32
# iterations of the unrolled pair (64 rounds).  Epilogue adds the working
# state back into ctx->h[0-7] and loops while edi < end pointer.
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
&movq ("mm6","mm5");
&psrlq ("mm5",6);
&psllq ("mm1",56);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm3","mm7");
&psrlq ("mm7",7-1);
&pxor ("mm3","mm1");
&psllq ("mm1",63-56);
&pxor ("mm3","mm7");
&psrlq ("mm7",8-7);
&pxor ("mm3","mm1");
&movq ("mm1","mm5");
&psrlq ("mm5",19-6);
&pxor ("mm7","mm3"); # sigma0
&psllq ("mm6",3);
&pxor ("mm1","mm5");
&paddq ("mm7",&QWP(8*(9+16),"esp"));
&pxor ("mm1","mm6");
&psrlq ("mm5",61-19);
&paddq ("mm7",&QWP(8*(9+16-9),"esp"));
&pxor ("mm1","mm5");
&psllq ("mm6",45-3);
&movq ("mm5",$Fsse2); # load f
&pxor ("mm1","mm6"); # sigma1
&movq ("mm6",$Gsse2); # load g
&paddq ("mm7","mm1"); # X[i]
#&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15
&BODY_00_15_sse2(2);
}
&dec ("edx");
&jnz (&label("16_79_sse2"));
#&movq ($A,$Asse2);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm1",$Bsse2);
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
&movq ("mm5",$Fsse2);
&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&mov ("eax",8*80);
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&lea ("esp",&DWP(0,"esp","eax")); # destroy frame
&sub ($K512,"eax"); # rewind K
&cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
&jb (&label("loop_sse2"));
&mov ("esp",&DWP(8*10+12,"esp")); # restore sp
&emms ();
&function_end_A();
# SSSE3 path prologue: builds the fixed 256-byte frame, byte-swaps the first
# block with pshufb, and pre-computes X[i]+K[i] for rounds 0-15 into the
# XMM->MM transfer area before entering the main loop.
&set_label("SSSE3",32);
{ my ($cnt,$frame)=("ecx","edx");
my @X=map("xmm$_",(0..7));
my $j;
my $i=0;
&lea ($frame,&DWP(-64,"esp"));
&sub ("esp",256);
# fixed stack frame layout
#
# +0 A B C D E F G H # backing store
# +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area
# +192 # XMM off-load ring buffer
# +256 # saved parameters
&movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask
&movdqu (@X[0],&QWP(0,"edi"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) {
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&jmp (&label("loop_ssse3"));
&nop ();
# per-block loop head: off-load working state to the frame backing store,
# advance the input pointer (ebx keeps the pointer for the *next* block,
# clamped to edi so the last block is re-loaded rather than over-read)
&set_label("loop_ssse3",32);
&movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1]
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3]
&lea ($K512,&DWP(16*8,$K512));
#&movq ($Asse2,$A); # off-load A-H
&movq ($Bsse2,"mm1");
&mov ("ebx","edi");
&movq ($Csse2,$BxC);
&lea ("edi",&DWP(128,"edi")); # advance input
&movq ($Dsse2,"mm3");
&cmp ("edi","eax");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&cmovb ("ebx","edi");
&movq ($Gsse2,"mm6");
&mov ("ecx",4); # loop counter
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&pxor ("mm3","mm3"); # magic
&jmp (&label("00_47_ssse3"));
# Returns a list of instruction-emitting Perl snippets (as strings) for one
# MMX round of the SSSE3 path.  The caller eval()s them one at a time so they
# can be interleaved with the XMM message-schedule code (software pipelining).
# Each snippet closes over $i, which the last snippet decrements; it also
# swaps $A/$BxC there, mirroring the register rotation of BODY_00_15_sse2.
sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
(
'&movq ("mm1",$E)', # %mm1 is sliding right
'&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
'&pxor ("mm5","mm6")', # f^=g
'&psrlq ("mm1",14)',
'&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e
'&pand ("mm5",$E)', # f&=e
'&psllq ($E,23)', # $E is sliding left
'&paddq ($A,"mm3")', # [h+=Maj(a,b,c)]
'&movq ("mm3","mm1")', # %mm3 is T1
'&psrlq("mm1",4)',
'&pxor ("mm5","mm6")', # Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psllq($E,23)',
'&pxor ("mm3","mm1")',
'&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a
'&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psrlq("mm1",23)',
'&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h
'&pxor ("mm3","mm1")',
'&psllq($E,4)',
'&pxor ("mm3",$E)', # T1=Sigma1_512(e)
'&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round
'&paddq ("mm3","mm7")', # T1+=X[i]
'&movq ("mm5",$A)', # %mm5 is sliding right
'&psrlq("mm5",28)',
'&paddq ($E,"mm3")', # d += T1
'&movq ("mm6",$A)', # %mm6 is sliding left
'&movq ("mm7","mm5")',
'&psllq("mm6",25)',
'&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b
'&psrlq("mm5",6)',
'&pxor ("mm7","mm6")',
'&psllq("mm6",5)',
'&pxor ("mm7","mm5")',
'&pxor ($A,"mm1")', # a^b, b^c in next round
'&psrlq("mm5",5)',
'&pxor ("mm7","mm6")',
'&pand ($BxC,$A)', # (b^c)&(a^b)
'&psllq("mm6",6)',
'&pxor ("mm7","mm5")',
'&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c)
'&pxor ("mm6","mm7")', # Sigma0_512(a)
'&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f
'&paddq ($BxC,"mm6")', # h+=Sigma0(a)
'&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g
'($A,$BxC) = ($BxC,$A); $i--;'
);
}
# SSSE3 main loop body.  Rounds 0-47: XMM units compute the message schedule
# (sigma0/sigma1 on two lanes at a time) while MMX snippets from
# BODY_00_15_ssse3 are interleaved via eval(); rounds 48-63 overlap with
# byte-swapping and pre-adding K to the *next* block (pointer in ebx, which
# equals edi for the final block).  Epilogue folds the working state into
# ctx->h[0-7] and loops while edi < end pointer (eax).
#
# Fix: the original `&cmp ("edi","eax")` lacked a terminating semicolon, so
# Perl parsed it together with the next line as `&cmp(...) & jb(...)` — a
# bitwise AND of two emitter calls that only worked by accident of
# left-to-right evaluation and AUTOLOAD.  The semicolon makes the two
# statements explicit; the emitted assembly is unchanged.
&set_label("00_47_ssse3",32);
for(;$j<16;$j++) {
my ($t0,$t2,$t1)=@X[2..4];
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa ($t2,@X[5]);
&movdqa (@X[1],$t0); # restore @X[1]
&palignr ($t0,@X[0],8); # X[1..2]
&movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4]
&palignr ($t2,@X[4],8); # X[9..10]
&movdqa ($t1,$t0);
&psrlq ($t0,7);
&paddq (@X[0],$t2); # X[0..1] += X[9..10]
&movdqa ($t2,$t1);
&psrlq ($t1,1);
&psllq ($t2,64-8);
&pxor ($t0,$t1);
&psrlq ($t1,8-1);
&pxor ($t0,$t2);
&psllq ($t2,8-1);
&pxor ($t0,$t1);
&movdqa ($t1,@X[7]);
&pxor ($t0,$t2); # sigma0(X[1..2])
&movdqa ($t2,@X[7]);
&psrlq ($t1,6);
&paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2])
&movdqa ($t0,@X[7]);
&psrlq ($t2,19);
&psllq ($t0,64-61);
&pxor ($t1,$t2);
&psrlq ($t2,61-19);
&pxor ($t1,$t0);
&psllq ($t0,61-19);
&pxor ($t1,$t2);
&movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
&pxor ($t1,$t0); # sigma0(X[1..2])
&movdqa ($t0,&QWP(16*($j%8),$K512));
eval(shift(@insns));
&paddq (@X[0],$t1); # X[0..1] += sigma0(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddq ($t0,@X[0]);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
&lea ($K512,&DWP(16*8,$K512));
&dec ("ecx");
&jnz (&label("00_47_ssse3"));
# rounds 48-63 overlap with byte-swap + K pre-add of the next block (ebx)
&movdqa (@X[1],&QWP(0,$K512)); # byte swap mask
&lea ($K512,&DWP(-80*8,$K512)); # rewind
&movdqu (@X[0],&QWP(0,"ebx"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) { # load next or same block
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&movq ($A,$Asse2); # load A-H
&movq ("mm1",$Bsse2);
&paddq ($A,"mm3"); # from BODY_00_15
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
#&movq ("mm5",$Fsse2);
#&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&cmp ("edi","eax"); # are we done yet?
&jb (&label("loop_ssse3"));
&mov ("esp",&DWP(64+12,$frame)); # restore sp
&emms ();
}
&function_end_A();
}
# Pure-x86 fallback loop (no SSE2).  Pushes the byte/qword-reversed input
# block onto the stack, copies ctx->h[0-7] down with `rep movsd`, runs 80
# rounds via BODY_00_15_x86 (round-16/80 boundaries detected by the low byte
# of K[i]: 0x94 ends round 15, 0x17 ends round 79), then adds the stack state
# back into ctx->h[0-7].
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",128);
&sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
&mov (&DWP(8*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&lea ("edi",&DWP(8,"esp"));
&mov ("ecx",16);
&data_word(0xA5F3F689); # rep movsd
&set_label("00_15_x86",16);
&BODY_00_15_x86();
&cmp (&LB("edx"),0x94);
&jne (&label("00_15_x86"));
&set_label("16_79_x86",16);
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
# HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
&mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",1); # lo>>1
&mov ("edi","edx");
&shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
&shl ("edi",24); # hi<<24
&xor ("ebx","esi");
&shr ("ecx",7-1); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-1); # hi>>7
&xor ("eax","ecx");
&shl ("esi",31-24); # lo<<31
&xor ("ebx","edx");
&shl ("edi",25-24); # hi<<25
&xor ("ebx","esi");
&shr ("ecx",8-7); # lo>>8
&xor ("eax","edi");
&shr ("edx",8-7); # hi>>8
&xor ("eax","ecx");
&shl ("edi",31-25); # hi<<31
&xor ("ebx","edx");
&xor ("eax","edi"); # T1 = sigma0(X[-15])
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ebx"); # put T1 away
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
# HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
&mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",6); # lo>>6
&mov ("edi","edx");
&shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
&shl ("edi",3); # hi<<3
&xor ("eax","esi");
&shr ("ecx",19-6); # lo>>19
&xor ("ebx","edi");
&shr ("edx",19-6); # hi>>19
&xor ("eax","ecx");
&shl ("esi",13-3); # lo<<13
&xor ("ebx","edx");
&shl ("edi",13-3); # hi<<13
&xor ("ebx","esi");
&shr ("ecx",29-19); # lo>>29
&xor ("eax","edi");
&shr ("edx",29-19); # hi>>29
&xor ("ebx","ecx");
&shl ("edi",26-13); # hi<<26
&xor ("eax","edx");
&xor ("eax","edi"); # sigma1(X[-2])
&mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
&add ("eax",&DWP(0,"esp"));
&adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
&mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
&mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += X[-16]
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += X[-7]
&mov (&DWP(8*(9+15)+0,"esp"),"eax");
&mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
&BODY_00_15_x86();
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_x86"));
&mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
&mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
# fold the on-stack working variables back into ctx->h[0-7]
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"esi"));
&mov ("ebx",&DWP($i*16+4,"esi"));
&mov ("ecx",&DWP($i*16+8,"esi"));
&mov ("edx",&DWP($i*16+12,"esi"));
&add ("eax",&DWP(8+($i*16)+0,"esp"));
&adc ("ebx",&DWP(8+($i*16)+4,"esp"));
&mov (&DWP($i*16+0,"esi"),"eax");
&mov (&DWP($i*16+4,"esi"),"ebx");
&add ("ecx",&DWP(8+($i*16)+8,"esp"));
&adc ("edx",&DWP(8+($i*16)+12,"esp"));
&mov (&DWP($i*16+8,"esi"),"ecx");
&mov (&DWP($i*16+12,"esi"),"edx");
}
&add ("esp",8*(9+16+80)); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop_x86"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
# SHA-512 round constants K[0..79] (FIPS 180-4), stored little-endian as
# (lo32,hi32) pairs, followed by the 16-byte pshufb byte-swap mask used by
# the SSSE3 path.  Kept in .text so the PIC-relative lea above can reach it.
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64
&data_word(0xec4d3b2f,0xb5c0fbcf); # u64
&data_word(0x8189dbbc,0xe9b5dba5); # u64
&data_word(0xf348b538,0x3956c25b); # u64
&data_word(0xb605d019,0x59f111f1); # u64
&data_word(0xaf194f9b,0x923f82a4); # u64
&data_word(0xda6d8118,0xab1c5ed5); # u64
&data_word(0xa3030242,0xd807aa98); # u64
&data_word(0x45706fbe,0x12835b01); # u64
&data_word(0x4ee4b28c,0x243185be); # u64
&data_word(0xd5ffb4e2,0x550c7dc3); # u64
&data_word(0xf27b896f,0x72be5d74); # u64
&data_word(0x3b1696b1,0x80deb1fe); # u64
&data_word(0x25c71235,0x9bdc06a7); # u64
&data_word(0xcf692694,0xc19bf174); # u64
&data_word(0x9ef14ad2,0xe49b69c1); # u64
&data_word(0x384f25e3,0xefbe4786); # u64
&data_word(0x8b8cd5b5,0x0fc19dc6); # u64
&data_word(0x77ac9c65,0x240ca1cc); # u64
&data_word(0x592b0275,0x2de92c6f); # u64
&data_word(0x6ea6e483,0x4a7484aa); # u64
&data_word(0xbd41fbd4,0x5cb0a9dc); # u64
&data_word(0x831153b5,0x76f988da); # u64
&data_word(0xee66dfab,0x983e5152); # u64
&data_word(0x2db43210,0xa831c66d); # u64
&data_word(0x98fb213f,0xb00327c8); # u64
&data_word(0xbeef0ee4,0xbf597fc7); # u64
&data_word(0x3da88fc2,0xc6e00bf3); # u64
&data_word(0x930aa725,0xd5a79147); # u64
&data_word(0xe003826f,0x06ca6351); # u64
&data_word(0x0a0e6e70,0x14292967); # u64
&data_word(0x46d22ffc,0x27b70a85); # u64
&data_word(0x5c26c926,0x2e1b2138); # u64
&data_word(0x5ac42aed,0x4d2c6dfc); # u64
&data_word(0x9d95b3df,0x53380d13); # u64
&data_word(0x8baf63de,0x650a7354); # u64
&data_word(0x3c77b2a8,0x766a0abb); # u64
&data_word(0x47edaee6,0x81c2c92e); # u64
&data_word(0x1482353b,0x92722c85); # u64
&data_word(0x4cf10364,0xa2bfe8a1); # u64
&data_word(0xbc423001,0xa81a664b); # u64
&data_word(0xd0f89791,0xc24b8b70); # u64
&data_word(0x0654be30,0xc76c51a3); # u64
&data_word(0xd6ef5218,0xd192e819); # u64
&data_word(0x5565a910,0xd6990624); # u64
&data_word(0x5771202a,0xf40e3585); # u64
&data_word(0x32bbd1b8,0x106aa070); # u64
&data_word(0xb8d2d0c8,0x19a4c116); # u64
&data_word(0x5141ab53,0x1e376c08); # u64
&data_word(0xdf8eeb99,0x2748774c); # u64
&data_word(0xe19b48a8,0x34b0bcb5); # u64
&data_word(0xc5c95a63,0x391c0cb3); # u64
&data_word(0xe3418acb,0x4ed8aa4a); # u64
&data_word(0x7763e373,0x5b9cca4f); # u64
&data_word(0xd6b2b8a3,0x682e6ff3); # u64
&data_word(0x5defb2fc,0x748f82ee); # u64
&data_word(0x43172f60,0x78a5636f); # u64
&data_word(0xa1f0ab72,0x84c87814); # u64
&data_word(0x1a6439ec,0x8cc70208); # u64
&data_word(0x23631e28,0x90befffa); # u64
&data_word(0xde82bde9,0xa4506ceb); # u64
&data_word(0xb2c67915,0xbef9a3f7); # u64
&data_word(0xe372532b,0xc67178f2); # u64
&data_word(0xea26619c,0xca273ece); # u64
&data_word(0x21c0c207,0xd186b8c7); # u64
&data_word(0xcde0eb1e,0xeada7dd6); # u64
&data_word(0xee6ed178,0xf57d4f7f); # u64
&data_word(0x72176fba,0x06f067aa); # u64
&data_word(0xa2c898a6,0x0a637dc5); # u64
&data_word(0xbef90dae,0x113f9804); # u64
&data_word(0x131c471b,0x1b710b35); # u64
&data_word(0x23047d84,0x28db77f5); # u64
&data_word(0x40c72493,0x32caab7b); # u64
&data_word(0x15c9bebc,0x3c9ebe0a); # u64
&data_word(0x9c100d4c,0x431d67c4); # u64
&data_word(0xcb3e42b6,0x4cc5d4be); # u64
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
&data_word(0x04050607,0x00010203); # byte swap
&data_word(0x0c0d0e0f,0x08090a0b); # mask
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,641 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
# LO/HI resolve to assembler macros defined below per endianness.
$hi="HI";
$lo="LO";
# ====================================================================
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Route output through arm-xlate.pl unless the flavour is "void".
# Fix: both opens are now error-checked, and the plain-file case uses the
# three-argument form of open (the original `open OUT,">$output"` was
# unchecked two-argument open).
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
} else {
open OUT, '>', $output or die "can't open $output: $!";
*STDOUT=*OUT;
}
# Register allocation for the integer-only path: T/A/E are the live lo/hi
# pairs, the rest of the state lives on the stack at the offsets below.
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
# Byte offsets of the eight state words (and the X[] area) in the frame.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
# Emit one SHA-512 round for the integer-only ARMv4 path.  State words are
# 32-bit lo/hi register pairs; T ($Tlo/$Thi) carries the incoming X[i] and
# accumulates T1.  $magic is compared against the low byte of K[i].lo
# (0x94 = last K of round 15, 0x17 = last K of round 79); on match bit 0 of
# $Ktbl is set, which the caller tests to leave its round loop.
# Note: the heredoc below is emitted assembly and must stay byte-identical.
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
# Emitted assembly for the integer-only implementation: file preamble,
# the K512 constant table (stored as endianness-aware lo/hi word pairs via
# the WORD64 macro), and sha512_block_data_order_nohw itself, which runs
# rounds 0-15 (.L00_15) and 16-79 with message schedule (.L16_79) around
# BODY_00_15, then folds the stack state back into the context.
# The heredoc bodies are emitted assembly and must stay byte-identical.
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code 32
#endif
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.global sha512_block_data_order_nohw
.type sha512_block_data_order_nohw,%function
sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
# rounds 0-15; 0x94 is the low byte of K[15].lo (0xcf692694)
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
# rounds 16-79; 0x17 is the low byte of K[79].lo (0x4a475817)
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
# Emit one SHA-512 round for the NEON path; each state word occupies one
# 64-bit d-register lane.  For rounds >=16 the leading Sigma1 shifts of even
# rounds are emitted by NEON_16_79 instead (interleaved with the message
# schedule), hence the ($i<16 || $i&1) guard on the first fragment.  The
# h+=Maj(a,b,c) of the *previous* round is deferred into this one ("h+=Maj
# from the past") to shorten the critical path.
# Note: the heredoc bodies are emitted assembly and must stay byte-identical.
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
# NEON_16_79: emit a message-schedule update plus one round for rounds
# 16..79. Odd rounds are delegated straight to NEON_00_15; even rounds
# compute two schedule words at once (the 2x-vectorized sigma0/sigma1
# over q registers) interleaved with the start of the round, then fall
# through to NEON_00_15 for the remainder of the round.
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
# X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]); the Sigma1(e) shifts
# of the upcoming round (d0..d2) are interleaved to hide latency, and the
# deferred h+=Maj (in d30) from the previous round is folded in.
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
# Finish the round proper (schedule word for this round is now ready).
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process the generated code: expand `...` arithmetic, then rewrite
# instructions so the output also assembles for pre-ARMv5 targets
# (bx lr -> raw opcode) while staying Thumb-interoperable (ret -> bx lr).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

# Re-emit this script's leading comment block (converting '#' to the
# ARM assembler's '@' comment marker) so the generated file carries the
# same copyright header; the loop stops at the first code line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT or die "error closing STDOUT: $!";	# enforce flush

View File

@@ -0,0 +1,583 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
# Command-line handling: the first argument selects the perlasm
# "flavour" (target/ABI variant), the second is the output file path.
# For a real flavour the generated source is piped through arm-xlate.pl;
# the "void"/empty flavour writes raw output directly to the file.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl relative to this script's own path.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly if the translator pipe cannot be started; previously a
    # failed open was ignored and output silently went nowhere.
    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    # Three-arg open prevents mode injection via the output path and the
    # error check catches unwritable destinations up front.
    open OUT, '>', $output or die "can't open $output: $!";
    *STDOUT=*OUT;
}
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="sha${BITS}_block_data_order_nohw";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
# BODY_00_xx: emit one round of the SHA-256/512 compression function for
# the scalar AArch64 path. Rounds 0..15 byte-swap and consume input words
# directly; rounds >=15 also run the message-schedule update for word
# X[j]. $t2/$t3 are swapped at the end so the Ch/Maj scratch registers
# alternate between consecutive rounds. Arguments: the round index $i
# followed by the eight working variables (rotated by the caller).
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
   $T0=@X[$i+3] if ($i<11);

# Input words arrive big-endian per FIPS 180-4; swap on little-endian.
$code.=<<___	if ($i<16);
#ifndef	__AARCH64EB__
	rev	@X[$i],@X[$i]			// $i
#endif
___
# Input loads are paired (ldp) and spread across the early rounds; four
# words are spilled to the stack and reloaded later to free registers.
$code.=<<___	if ($i<13 && ($i&1));
	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___	if ($i==13);
	ldp	@X[14],@X[15],[$inp]
___
$code.=<<___	if ($i>=14);
	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___	if ($i>0 && $i<16);
	add	$a,$a,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=11);
	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___	if ($i<15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
	and	$t1,$f,$e
	bic	$t2,$g,$e
	add	$h,$h,@X[$i&15]			// h+=X[i]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
	add	$h,$h,$t0			// h+=Sigma1(e)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	add	$d,$d,$h			// d+=h
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	//add	$h,$h,$t1			// h+=Sigma0(a)
___
# Steady-state round: same dataflow as above, interleaved with the
# schedule update X[j] += sigma0(X[j+1]) + X[j+9] + sigma1(X[j+14]).
$code.=<<___	if ($i>=15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	ror	$T1,@X[($j+1)&15],#$sigma0[0]
	and	$t1,$f,$e
	ror	$T2,@X[($j+14)&15],#$sigma1[0]
	bic	$t2,$g,$e
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,@X[$i&15]			// h+=X[i]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
	eor	$T0,$T0,$a,ror#$Sigma0[1]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
	add	$h,$h,$t0			// h+=Sigma1(e)
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
	add	@X[$j],@X[$j],@X[($j+9)&15]
	add	$d,$d,$h			// d+=h
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	add	@X[$j],@X[$j],$T1
	add	$h,$h,$t1			// h+=Sigma0(a)
	add	@X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);	# alternate Ch/Maj scratch registers each round
}
$code.=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#endif
.text
.globl $func
.type $func,%function
.align 6
$func:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adrp $Ktbl,:pg_hi21:.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $func,.-$func
.section .rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 6
sha256_block_data_order_hw:
#ifdef BORINGSSL_DISPATCH_TEST
.extern BORINGSSL_function_hit
adrp x9,:pg_hi21:BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#6] // kFlag_sha256_hw
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adrp $Ktbl,:pg_hi21:.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}
if ($SZ==8) {
my $Ktbl="x3";
my @H = map("v$_.16b",(0..4));
my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
my @MSG=map("v$_.16b",(16..23));
my ($W0,$W1)=("v24.2d","v25.2d");
my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha512_block_data_order_hw
.type sha512_block_data_order_hw,%function
.align 6
sha512_block_data_order_hw:
#ifdef BORINGSSL_DISPATCH_TEST
.extern BORINGSSL_function_hit
adrp x9,:pg_hi21:BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#8] // kFlag_sha512_hw
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
ld1 {@MSG[4]-@MSG[7]},[$inp],#64
ld1.64 {@H[0]-@H[3]},[$ctx] // load context
adrp $Ktbl,:pg_hi21:.LK512
add $Ktbl,$Ktbl,:lo12:.LK512
rev64 @MSG[0],@MSG[0]
rev64 @MSG[1],@MSG[1]
rev64 @MSG[2],@MSG[2]
rev64 @MSG[3],@MSG[3]
rev64 @MSG[4],@MSG[4]
rev64 @MSG[5],@MSG[5]
rev64 @MSG[6],@MSG[6]
rev64 @MSG[7],@MSG[7]
b .Loop_hw
.align 4
.Loop_hw:
ld1.64 {$W0},[$Ktbl],#16
subs $num,$num,#1
sub x4,$inp,#128
orr $AB,@H[0],@H[0] // offload
orr $CD,@H[1],@H[1]
orr $EF,@H[2],@H[2]
orr $GH,@H[3],@H[3]
csel $inp,$inp,x4,ne // conditional rewind
___
for($i=0;$i<32;$i++) {
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1.64 {$W1},[$Ktbl],#16
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512su0 @MSG[0],@MSG[1]
ext $m9_10,@MSG[4],@MSG[5],#8
sha512h @H[3],$fg,$de
sha512su1 @MSG[0],@MSG[7],$m9_10
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1.64 {$W1},[$Ktbl],#16
___
$code.=<<___ if ($i==39);
sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind
___
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1 {@MSG[0]},[$inp],#16 // load next input
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512h @H[3],$fg,$de
rev64 @MSG[0],@MSG[0]
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
$code.=<<___;
add.i64 @H[0],@H[0],$AB // accumulate
add.i64 @H[1],@H[1],$CD
add.i64 @H[2],@H[2],$EF
add.i64 @H[3],@H[3],$GH
cbnz $num,.Loop_hw
st1.64 {@H[0]-@H[3]},[$ctx] // store context
ldr x29,[sp],#16
ret
.size sha512_block_data_order_hw,.-sha512_block_data_order_hw
#endif
___
}
{
# Base AArch64 encodings for the SHA-256 crypto-extension mnemonics;
# register numbers are OR-ed into the Rd/Rn/Rm fields below.
my %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

# unsha256 encodes a SHA-256 crypto instruction as a raw .inst word so
# the output assembles even with toolchains lacking the sha2 extension.
# Returns false (empty string) when the operand string does not parse.
sub unsha256 {
    my ($mnemonic, $arg) = @_;

    return '' unless
	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o;

    # Third operand is optional (e.g. sha256su0 takes only two registers).
    my $word = $opcode{$mnemonic} | $1 | ($2 << 5) | ($3 << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
{
# Base AArch64 encodings for the SHA-512 crypto-extension mnemonics;
# register numbers are OR-ed into the Rd/Rn/Rm fields below.
my %opcode = (
	"sha512h"	=> 0xce608000,	"sha512h2"	=> 0xce608400,
	"sha512su0"	=> 0xcec08000,	"sha512su1"	=> 0xce608800	);

# unsha512 encodes a SHA-512 crypto instruction as a raw .inst word so
# the output assembles even with toolchains lacking the sha3 extension.
# Returns false (empty string) when the operand string does not parse.
sub unsha512 {
    my ($mnemonic, $arg) = @_;

    return '' unless
	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o;

    # Third operand is optional (e.g. sha512su0 takes only two registers).
    my $word = $opcode{$mnemonic} | $1 | ($2 << 5) | ($3 << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
# Re-emit this script's leading comment block (converting '#' to C-style
# '//') so the generated assembly carries the same copyright header; the
# loop stops at the first code line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

# Post-process the generated code line by line: expand `...` arithmetic,
# replace SHA-2 crypto mnemonics with raw .inst words for old assemblers,
# and normalize NEON register syntax.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	# Translate crypto-extension mnemonics into .inst encodings.
	s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge	or
	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
	s/\.[ui]?8(\s)/$1/;
	# Move the element-size suffix from the mnemonic onto the register
	# names (.64 -> .2d, .32 -> .4s), with ext/lane-load exceptions.
	s/\.\w?64\b//		and s/\.16b/\.2d/g	or
	s/\.\w?32\b//		and s/\.16b/\.4s/g;
	m/\bext\b/		and s/\.2d/\.16b/g	or
	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,634 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#ifndef OPENSSL_HEADER_SHA_INTERNAL_H
#define OPENSSL_HEADER_SHA_INTERNAL_H
#include <openssl/base.h>
#include <openssl/hmac.h>
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// Internal SHA2 constants
// SHA*_CHAINING_LENGTH is the chaining length in bytes of SHA-*
// It corresponds to the length in bytes of the h part of the state
#define SHA1_CHAINING_LENGTH 20
#define SHA224_CHAINING_LENGTH 32
#define SHA256_CHAINING_LENGTH 32
#define SHA384_CHAINING_LENGTH 64
#define SHA512_CHAINING_LENGTH 64
#define SHA512_224_CHAINING_LENGTH 64
#define SHA512_256_CHAINING_LENGTH 64
// SHA3 constants, from NIST FIPS202.
// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
#define KECCAK1600_ROWS 5
#define KECCAK1600_WIDTH 1600

#define SHA3_224_CAPACITY_BYTES 56
#define SHA3_224_CBLOCK SHA3_BLOCKSIZE(SHA3_224_DIGEST_BITLENGTH)
#define SHA3_224_DIGEST_BITLENGTH 224
#define SHA3_224_DIGEST_LENGTH 28

#define SHA3_256_CAPACITY_BYTES 64
#define SHA3_256_CBLOCK SHA3_BLOCKSIZE(SHA3_256_DIGEST_BITLENGTH)
#define SHA3_256_DIGEST_BITLENGTH 256
#define SHA3_256_DIGEST_LENGTH 32

#define SHA3_384_CAPACITY_BYTES 96
#define SHA3_384_CBLOCK SHA3_BLOCKSIZE(SHA3_384_DIGEST_BITLENGTH)
#define SHA3_384_DIGEST_BITLENGTH 384
#define SHA3_384_DIGEST_LENGTH 48

#define SHA3_512_CAPACITY_BYTES 128
#define SHA3_512_CBLOCK SHA3_BLOCKSIZE(SHA3_512_DIGEST_BITLENGTH)
#define SHA3_512_DIGEST_BITLENGTH 512
#define SHA3_512_DIGEST_LENGTH 64

// SHA3 rate in bytes: (1600 - 2*bitlen)/8. Fully parenthesized (both the
// whole expansion and the |bitlen| argument) so the macro groups correctly
// when used next to other operators, matching SHAKE*_BLOCKSIZE below; the
// previous form "(KECCAK1600_WIDTH - bitlen * 2) / 8" left the division
// exposed to precedence at the call site.
#define SHA3_BLOCKSIZE(bitlen) (((KECCAK1600_WIDTH) - ((bitlen) * 2)) / 8)
#define SHA3_PAD_CHAR 0x06

// SHAKE constants, from NIST FIPS202.
// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
#define SHAKE_PAD_CHAR 0x1F
#define SHAKE128_BLOCKSIZE ((KECCAK1600_WIDTH - 128 * 2) / 8)
#define SHAKE256_BLOCKSIZE ((KECCAK1600_WIDTH - 256 * 2) / 8)
#define XOF_BLOCKBYTES SHAKE128_BLOCKSIZE

// SHAKE128 has the maximum block size among the SHA3/SHAKE algorithms.
#define SHA3_MAX_BLOCKSIZE SHAKE128_BLOCKSIZE
// Define state flag values for Keccak-based functions
#define KECCAK1600_STATE_ABSORB 0
// KECCAK1600_STATE_SQUEEZE is set when |SHAKE_Squeeze| is called.
// It remains set while |SHAKE_Squeeze| is called repeatedly to output
// chunks of the XOF output.
#define KECCAK1600_STATE_SQUEEZE 1
// KECCAK1600_STATE_FINAL is set once |SHAKE_Final| is called
// so that |SHAKE_Squeeze| cannot be called anymore.
#define KECCAK1600_STATE_FINAL 2
typedef struct keccak_ctx_st KECCAK1600_CTX;

// Context for incremental SHA3/SHAKE computation (absorb/squeeze phases).
// The data buffer should have at least the maximum number of
// block size bytes to fit any SHA3/SHAKE block length.
struct keccak_ctx_st {
  uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS];  // 5x5 lanes of Keccak-1600 state
  size_t block_size;                       // cached ctx->digest->block_size
  size_t md_size;          // output length, variable in XOF (SHAKE)
  size_t buf_load;         // used bytes in below buffer
  uint8_t buf[SHA3_MAX_BLOCKSIZE];  // should have at least the max data block size bytes
  uint8_t pad;             // padding character (SHA3_PAD_CHAR or SHAKE_PAD_CHAR)
  uint8_t state;           // denotes the keccak phase (absorb, squeeze, final)
};
// To avoid externalizing KECCAK1600_CTX, we hard-code the context size in
// hmac.h's |md_ctx_union| and use a compile time check here to make sure
// |KECCAK1600_CTX|'s size never exceeds that of |md_ctx_union|. This means
// that whenever a new field is added to |keccak_ctx_st| we must also update
// the hard-coded size of |sha3| in hmac.h's |md_ctx_union| with the new
// value given by |sizeof(keccak_ctx_st)|.
OPENSSL_STATIC_ASSERT(sizeof(KECCAK1600_CTX) <= sizeof(union md_ctx_union),
hmac_md_ctx_union_sha3_size_needs_update)
// KECCAK1600 x4 batched context structure
typedef struct keccak_ctx_st_x4 KECCAK1600_CTX_x4;

struct keccak_ctx_st_x4 {
  // Four independent 5x5-lane Keccak-1600 states, permuted as a batch.
  uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS];
};
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
#if defined(OPENSSL_PPC64LE)
#define SHA1_ALTIVEC
void sha1_block_data_order(uint32_t *state, const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA1_ASM_NEON
void sha1_block_data_order_neon(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA256_ASM_NEON
void sha256_block_data_order_neon(uint32_t state[8], const uint8_t *data,
size_t num);
// Armv8.2 SHA-512 instructions are not available in 32-bit.
#define SHA512_ASM_NEON
void sha512_block_data_order_neon(uint64_t state[8], const uint8_t *data,
size_t num);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA512_ASM_HW
OPENSSL_INLINE int sha512_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA512_capable();
}
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA1_ASM_SSSE3
OPENSSL_INLINE int sha1_ssse3_capable(void) {
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
}
void sha1_block_data_order_ssse3(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_AVX
OPENSSL_INLINE int sha1_avx_capable(void) {
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
//
// TODO(davidben): Should we enable SHAEXT on 32-bit x86?
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
CRYPTO_is_FXSR_capable();
}
void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_SSSE3
OPENSSL_INLINE int sha256_ssse3_capable(void) {
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
}
void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA256_ASM_AVX
OPENSSL_INLINE int sha256_avx_capable(void) {
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
//
// TODO(davidben): Should we enable SHAEXT on 32-bit x86?
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
CRYPTO_is_FXSR_capable();
}
void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
size_t num);
// TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C.
#define SHA512_ASM
void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_SHAEXT_capable() && CRYPTO_is_SSSE3_capable();
}
#define SHA1_ASM_AVX2
OPENSSL_INLINE int sha1_avx2_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX2 imply SSSE3?
// * sha1_block_data_order_avx2 does not seem to use SSSE3 instructions.
return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_BMI1_capable() && CRYPTO_is_SSSE3_capable();
}
void sha1_block_data_order_avx2(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_AVX
OPENSSL_INLINE int sha1_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha1_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_SSSE3
OPENSSL_INLINE int sha1_ssse3_capable(void) {
return CRYPTO_is_SSSE3_capable();
}
void sha1_block_data_order_ssse3(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_SHAEXT_capable();
}
#define SHA256_ASM_AVX
OPENSSL_INLINE int sha256_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha256_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA256_ASM_SSSE3
OPENSSL_INLINE int sha256_ssse3_capable(void) {
return CRYPTO_is_SSSE3_capable();
}
void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA512_ASM_AVX
OPENSSL_INLINE int sha512_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha512_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha512_block_data_order_avx(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA1_ASM_HW)
void sha1_block_data_order_hw(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
#if defined(SHA1_ASM_NOHW)
void sha1_block_data_order_nohw(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
#if defined(SHA256_ASM_HW)
void sha256_block_data_order_hw(uint32_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA256_ASM_NOHW)
void sha256_block_data_order_nohw(uint32_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_HW)
void sha512_block_data_order_hw(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_AARCH64)
#define KECCAK1600_ASM
#if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
#define KECCAK1600_S2N_BIGNUM_ASM
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#endif
#endif
#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
#define KECCAK1600_ASM
#define KECCAK1600_S2N_BIGNUM_ASM
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#endif
#endif
#endif
// SHAx_Init_from_state is a low-level function that initializes |sha| with a
// custom state. |h| is the hash state in big endian. |n| is the number of bits
// processed at this point. It must be a multiple of |SHAy_CBLOCK*8|,
// where SHAy=SHA1 if SHAx=SHA1, SHAy=SHA256 if SHAx=SHA224 or SHA256, and
// SHAy=SHA512 otherwise.
// This function returns one on success and zero on error.
// This function is for internal use only and should never be directly called.
OPENSSL_EXPORT int SHA1_Init_from_state(
SHA_CTX *sha, const uint8_t h[SHA1_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA224_Init_from_state(
SHA256_CTX *sha, const uint8_t h[SHA224_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA256_Init_from_state(
SHA256_CTX *sha, const uint8_t h[SHA256_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA384_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA384_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_224_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_224_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_256_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_256_CHAINING_LENGTH], uint64_t n);
// SHAx_get_state is a low-level function that exports the hash state in big
// endian into |out_h| and the number of bits processed at this point in
// |out_n|. |SHAx_Final| must not have been called before (otherwise results
// are not guaranteed). Furthermore, the number of bytes processed by
// |SHAx_Update| must be a multiple of the block length |SHAy_CBLOCK| and
// must be less than 2^61 (otherwise it fails). See comment above about
// |SHAx_Init_from_state| for the definition of SHAy.
// This function returns one on success and zero on error.
// This function is for internal use only and should never be directly called.
OPENSSL_EXPORT int SHA1_get_state(
SHA_CTX *ctx, uint8_t out_h[SHA1_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA224_get_state(
SHA256_CTX *ctx, uint8_t out_h[SHA224_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA256_get_state(
SHA256_CTX *ctx, uint8_t out_h[SHA256_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA384_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA384_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA512_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA512_224_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_224_CHAINING_LENGTH],
uint64_t *out_n);
OPENSSL_EXPORT int SHA512_256_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENGTH],
uint64_t *out_n);
/*
* SHA3/SHAKE single-shot APIs implement SHA3 functionalities on top
* of SHA3/SHAKE API layer
*
* SHA3/SHAKE single-shot functions never fail when the later call-discipline is
* adhered to: (a) the pointers passed to the functions are valid.
*/
// SHA3_224 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_224_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_224| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_224(const uint8_t *data, size_t len,
uint8_t out[SHA3_224_DIGEST_LENGTH]);
// SHA3_256 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_256_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_256| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_256(const uint8_t *data, size_t len,
uint8_t out[SHA3_256_DIGEST_LENGTH]);
// SHA3_384 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_384_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_384| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_384(const uint8_t *data, size_t len,
uint8_t out[SHA3_384_DIGEST_LENGTH]);
// SHA3_512 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_512_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_512| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_512(const uint8_t *data, size_t len,
uint8_t out[SHA3_512_DIGEST_LENGTH]);
// SHAKE128 writes the |out_len| bytes output from |in_len| bytes |data|
// to |out| and returns |out| on success and NULL on failure.
OPENSSL_EXPORT uint8_t *SHAKE128(const uint8_t *data, const size_t in_len,
uint8_t *out, size_t out_len);
// SHAKE256 writes |out_len| bytes output from |in_len| bytes |data|
// to |out| and returns |out| on success and NULL on failure.
OPENSSL_EXPORT uint8_t *SHAKE256(const uint8_t *data, const size_t in_len,
uint8_t *out, size_t out_len);
/*
* SHA3 APIs implement SHA3 functionalities on top of FIPS202 API layer
*
* SHA3 context must go through the flow: (a) Init, (b) Update [multiple times],
* (c) Final [one time].
*
 * SHA3 functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHA3_ function signature, is satisfied.
*/
// SHA3_Init initialises |ctx| field through |FIPS202_Init| and
// returns 1 on success and 0 on failure. When call-discipline is
// maintained and |bitlen| value corresponds to a SHA3 digest length
// in bits, this function never fails.
OPENSSL_EXPORT int SHA3_Init(KECCAK1600_CTX *ctx, size_t bitlen);
// SHA3_Update checks |ctx| pointer and |len| value, calls |FIPS202_Update|
// and returns 1 on success and 0 on failure. When call-discipline is
// maintained and |len| value corresponds to the input message length
// (including zero), this function never fails.
int SHA3_Update(KECCAK1600_CTX *ctx, const void *data, size_t len);
// SHA3_Final pads the last data block and absorbs it through |FIPS202_Finalize|.
// It then calls |Keccak1600_Squeeze| and returns 1 on success and 0 on failure.
// When call-discipline is maintained, this function never fails.
int SHA3_Final(uint8_t *md, KECCAK1600_CTX *ctx);
// SHA3_224_Init initialises |sha| and returns 1.
int SHA3_224_Init(KECCAK1600_CTX *sha);
// SHA3_224_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_224_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_224_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_224_Final(uint8_t out[SHA3_224_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_256_Init initialises |sha| and returns 1.
int SHA3_256_Init(KECCAK1600_CTX *sha);
// SHA3_256_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_256_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_256_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_256_Final(uint8_t out[SHA3_256_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_384_Init initialises |sha| and returns 1.
int SHA3_384_Init(KECCAK1600_CTX *sha);
// SHA3_384_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_384_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_384_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_384_Final(uint8_t out[SHA3_384_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_512_Init initialises |sha| and returns 1.
int SHA3_512_Init(KECCAK1600_CTX *sha);
// SHA3_512_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_512_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_512_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_512_Final(uint8_t out[SHA3_512_DIGEST_LENGTH], KECCAK1600_CTX *sha);
/*
* SHAKE APIs implement SHAKE functionalities on top of FIPS202 API layer
*
* SHAKE context must go through the flow: (a) Init, (b) Absorb [multiple times],
* (c) Final [one time] or Squeeze [multiple times]
*
 * SHAKE functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHAKE_ function signature, is satisfied.
*/
// SHAKE_Init initialises |ctx| fields through |FIPS202_Init| and
// returns 1 on success and 0 on failure. When call-discipline is
// maintained and |block_size| value corresponds to a SHAKE block size length
// in bytes, this function never fails.
int SHAKE_Init(KECCAK1600_CTX *ctx, size_t block_size);
// SHAKE_Absorb checks |ctx| pointer and |len| values. It updates and absorbs
// input blocks via |FIPS202_Update|. When call-discipline is
// maintained and |len| value corresponds to the input message length
// (including zero), this function never fails.
int SHAKE_Absorb(KECCAK1600_CTX *ctx, const void *data,
size_t len);
// SHAKE_Squeeze pads the last data block and absorbs it through
// |FIPS202_Finalize| on first call. It writes |len| bytes of incremental
// XOF output to |md| and returns 1 on success and 0 on failure. It can be
// called multiple times. When call-discipline is maintained, this function
// never fails.
int SHAKE_Squeeze(uint8_t *md, KECCAK1600_CTX *ctx, size_t len);
// SHAKE_Final writes |len| bytes of finalized extendible output to |md|, returns 1 on
// success and 0 on failure. It should be called once to finalize absorb and
// squeeze phases. Incremental XOF output should be generated via |SHAKE_Squeeze|.
// When call-discipline is maintained, this function never fails.
int SHAKE_Final(uint8_t *md, KECCAK1600_CTX *ctx, size_t len);
/*
* SHAKE128_x4_ batched APIs implement x4 SHAKE functionalities on top of FIPS202 API layer
*
* SHAKE128_x4_ context must go through the flow: (a) Init_x4, (b) Absorb_once_x4 [one time;
* maximum input length of |SHAKE128_BLOCKSIZE - 1|] (c) Squeezeblocks_x4 [multiple times]
*
 * SHAKE128_x4_ functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHAKE128_x4_ function signature, is satisfied.
*/
// SHAKE128_Init_x4 is a batched API that operates on four independent
// Keccak bitstates. It initialises all four |ctx| fields and returns
// 1 on success and 0 on failure. When call-discipline is maintained,
// this function never fails.
OPENSSL_EXPORT int SHAKE128_Init_x4(KECCAK1600_CTX_x4 *ctx);
// SHAKE128_Absorb_once_x4 is a batched API that operates on four independent
// Keccak bitstates. It absorbs all four inputs |data0|, |data1|, |data2|, |data3|
// of equal length of |len| bytes and returns 1 on success and 0 on failure.
// When call-discipline is maintained and |len| value corresponds to the input
// messages' length (including zero), this function never fails.
OPENSSL_EXPORT int SHAKE128_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
const void *data2, const void *data3, size_t len);
// SHAKE128_Squeezeblocks_x4 is a batched API that operates on four independent Keccak
// bitstates. It squeezes |blks| number of blocks for all four XOF digests and returns
// 1 on success and 0 on failure. When call-discipline is maintained, this function
// never fails.
OPENSSL_EXPORT int SHAKE128_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
KECCAK1600_CTX_x4 *ctx, size_t blks);
/*
 * SHAKE256_x4_ single-shot batched API implements x4 SHAKE256 functionalities on top
 * of FIPS202 API layer
 *
 * SHAKE256_x4_ function never fails when the following call-discipline is adhered to:
 * (a) the pointers passed to the functions are valid.
*/
// SHAKE256_x4 is a batched API that operates on four independent
// Keccak bitstates. It writes all four |out_len|-byte outputs from
// |in_len|-byte inputs to |out0|, |out1|, |out2|, |out3| and returns
// 1 on success and 0 on failure.
// When call-discipline is maintained, this function never fails.
OPENSSL_EXPORT int SHAKE256_x4(const uint8_t *data0, const uint8_t *data1,
const uint8_t *data2, const uint8_t *data3,
const size_t in_len, uint8_t *out0, uint8_t *out1,
uint8_t *out2, uint8_t *out3, size_t out_len);
/*
* Keccak1600_ APIs implement Keccak absorb and squeeze phases
*/
// Keccak1600_Absorb processes the largest multiple of |r| (block size) out of
// |len| bytes and returns the remaining number of bytes.
size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS],
const uint8_t *data, size_t len, size_t r);
// Keccak1600_Absorb_once_x4 absorbs exactly |len| bytes from four inputs into four
// Keccak states, applying padding character |p|. Unlike Keccak1600_Absorb, this
// processes a single block and takes the padding character as an additional argument.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
const uint8_t *inp0, const uint8_t *inp1,
const uint8_t *inp2, const uint8_t *inp3,
size_t len, size_t r, uint8_t p);
// Keccak1600_Squeezeblocks_x4 squeezes |num_blocks| blocks from four Keccak states
// into four output buffers, with each block being |r| bytes.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3,
size_t num_blocks, size_t r);
// Keccak1600_Squeeze generates |out| value of |len| bytes (per call). It can be called
// multiple times when used as eXtendable Output Function. |padded| indicates
// whether it is the first call to Keccak1600_Squeeze; i.e., if the current block has
// been already processed and padded right after the last call to Keccak1600_Absorb.
// Squeezes full blocks of |r| bytes each. When performing multiple squeezes, any
// left over bytes from previous squeezes are not consumed, and |len| must be a
// multiple of the block size (except on the final squeeze).
OPENSSL_EXPORT void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS],
uint8_t *out, size_t len, size_t r, int padded);
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // OPENSSL_HEADER_SHA_INTERNAL_H

View File

@@ -0,0 +1,515 @@
// Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
#include <assert.h>
#include "internal.h"
#include "../../internal.h"
#include "../cpucap/internal.h"
// iotas[] holds the 24 round constants of Keccak-f[1600], one per round,
// XORed into lane A[0][0] by the iota step (FIPS 202, Section 3.2.5).
static const uint64_t iotas[] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL
};
#if !defined(KECCAK1600_ASM)
// rhotates[y][x] gives the rho-step left-rotation amount, in bits, for the
// lane at row y, column x of the 5x5 Keccak state (FIPS 202, Section 3.2.2).
static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
(defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__loongarch__) || \
defined(__EMSCRIPTEN__)
// These platforms don't support "logical and with complement" instruction.
# define KECCAK_COMPLEMENTING_TRANSFORM
#endif
// ROL64 returns |val| rotated left by |offset| bits. The zero case is
// handled separately because shifting a 64-bit value by 64 is undefined
// behavior in C.
static uint64_t ROL64(uint64_t val, int offset) {
  if (offset != 0) {
    val = (val << offset) | (val >> (64 - offset));
  }
  return val;
}
// KECCAK_2X:
// This is the default implementation used in OpenSSL and the most efficient;
// the other implementations were removed from this file.
// This implementation is a variant of KECCAK_1X (see OpenSSL)
// This implementation allows to take temporary storage
// out of round procedure and simplify references to it by alternating
// it with actual data (see round loop below).
// It ensures best compiler interpretation to assembly and provides best
// instruction per processed byte ratio at minimal round unroll factor.
// Round computes one round of the Keccak-f[1600] permutation — the theta,
// rho, pi, chi and iota steps — reading the 5x5 lane state from |A| and
// writing the permuted state to |R|. |i| is the round index (0..23)
// selecting the iota round constant. Source and destination are distinct so
// the caller can ping-pong between two state buffers (KECCAK_2X scheme)
// instead of copying through temporaries.
static void Round(uint64_t R[KECCAK1600_ROWS][KECCAK1600_ROWS], uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], size_t i) {
  uint64_t C[KECCAK1600_ROWS], D[KECCAK1600_ROWS];
  assert(i < (sizeof(iotas) / sizeof(iotas[0])));
  // Theta, first half: C[x] is the parity (XOR) of column x.
  C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
  C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
  C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
  C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
  C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
  // Theta, second half: D[x] = ROL64(C[x+1], 1) ^ C[x-1] (indices mod 5).
  D[0] = ROL64(C[1], 1) ^ C[4];
  D[1] = ROL64(C[2], 1) ^ C[0];
  D[2] = ROL64(C[3], 1) ^ C[1];
  D[3] = ROL64(C[4], 1) ^ C[2];
  D[4] = ROL64(C[0], 1) ^ C[3];
  // Row 0: theta is applied lane-wise (A ^ D), then rho rotates, with the
  // pi permutation folded into the choice of source lanes. A[0][0] rotates
  // by 0, so no ROL64 is needed there.
  C[0] = A[0][0] ^ D[0];
  C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
  C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
  C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
  C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Chi in the complemented-lane domain: selected lanes are kept
  // bit-inverted (see KeccakF1600_c) so that the ~x & y of chi can be
  // computed with plain AND/OR on targets lacking an and-with-complement
  // instruction. Iota is folded into lane [0][0].
  R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
  R[0][1] = C[1] ^ (~C[2] | C[3]);
  R[0][2] = C[2] ^ ( C[3] & C[4]);
  R[0][3] = C[3] ^ ( C[4] | C[0]);
  R[0][4] = C[4] ^ ( C[0] & C[1]);
#else
  // Plain chi: R[x] = C[x] ^ (~C[x+1] & C[x+2]); iota on lane [0][0].
  R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
  R[0][1] = C[1] ^ (~C[2] & C[3]);
  R[0][2] = C[2] ^ (~C[3] & C[4]);
  R[0][3] = C[3] ^ (~C[4] & C[0]);
  R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 1: same theta/rho/pi pattern with the next diagonal of source lanes.
  C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
  C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
  C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
  C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
  C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[1][0] = C[0] ^ (C[1] | C[2]);
  R[1][1] = C[1] ^ (C[2] & C[3]);
  R[1][2] = C[2] ^ (C[3] | ~C[4]);
  R[1][3] = C[3] ^ (C[4] | C[0]);
  R[1][4] = C[4] ^ (C[0] & C[1]);
#else
  R[1][0] = C[0] ^ (~C[1] & C[2]);
  R[1][1] = C[1] ^ (~C[2] & C[3]);
  R[1][2] = C[2] ^ (~C[3] & C[4]);
  R[1][3] = C[3] ^ (~C[4] & C[0]);
  R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 2.
  C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
  C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
  C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
  C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
  C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[2][0] = C[0] ^ ( C[1] | C[2]);
  R[2][1] = C[1] ^ ( C[2] & C[3]);
  R[2][2] = C[2] ^ (~C[3] & C[4]);
  R[2][3] = ~C[3] ^ ( C[4] | C[0]);
  R[2][4] = C[4] ^ ( C[0] & C[1]);
#else
  R[2][0] = C[0] ^ (~C[1] & C[2]);
  R[2][1] = C[1] ^ (~C[2] & C[3]);
  R[2][2] = C[2] ^ (~C[3] & C[4]);
  R[2][3] = C[3] ^ (~C[4] & C[0]);
  R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 3.
  C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
  C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
  C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
  C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
  C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[3][0] = C[0] ^ ( C[1] & C[2]);
  R[3][1] = C[1] ^ ( C[2] | C[3]);
  R[3][2] = C[2] ^ (~C[3] | C[4]);
  R[3][3] = ~C[3] ^ ( C[4] & C[0]);
  R[3][4] = C[4] ^ ( C[0] | C[1]);
#else
  R[3][0] = C[0] ^ (~C[1] & C[2]);
  R[3][1] = C[1] ^ (~C[2] & C[3]);
  R[3][2] = C[2] ^ (~C[3] & C[4]);
  R[3][3] = C[3] ^ (~C[4] & C[0]);
  R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 4.
  C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
  C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
  C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
  C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
  C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[4][0] = C[0] ^ (~C[1] & C[2]);
  R[4][1] = ~C[1] ^ ( C[2] | C[3]);
  R[4][2] = C[2] ^ ( C[3] & C[4]);
  R[4][3] = C[3] ^ ( C[4] | C[0]);
  R[4][4] = C[4] ^ ( C[0] & C[1]);
#else
  R[4][0] = C[0] ^ (~C[1] & C[2]);
  R[4][1] = C[1] ^ (~C[2] & C[3]);
  R[4][2] = C[2] ^ (~C[3] & C[4]);
  R[4][3] = C[3] ^ (~C[4] & C[0]);
  R[4][4] = C[4] ^ (~C[0] & C[1]);
#endif
}
// KeccakF1600_c applies the full 24-round Keccak-f[1600] permutation to |A|
// using the portable C round function.
static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  uint64_t scratch[KECCAK1600_ROWS][KECCAK1600_ROWS];

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Enter the complemented-lane representation expected by Round(): these
  // six lanes are kept bit-inverted across all rounds.
  A[0][1] = ~A[0][1];
  A[0][2] = ~A[0][2];
  A[1][3] = ~A[1][3];
  A[2][2] = ~A[2][2];
  A[3][2] = ~A[3][2];
  A[4][0] = ~A[4][0];
#endif

  // Process rounds in pairs, ping-ponging between |A| and |scratch| so no
  // extra copies are needed.
  for (size_t round = 0; round < 24; round += 2) {
    Round(scratch, A, round);
    Round(A, scratch, round + 1);
  }

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Leave the complemented-lane representation.
  A[0][1] = ~A[0][1];
  A[0][2] = ~A[0][2];
  A[1][3] = ~A[1][3];
  A[2][2] = ~A[2][2];
  A[3][2] = ~A[3][2];
  A[4][0] = ~A[4][0];
#endif
}
#endif // !KECCAK1600_ASM
// Forward declaration for KeccakF1600 function
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]);
// Keccak1600_Absorb can be called multiple times; at each invocation the
// largest multiple of |r| out of |len| bytes are processed. The
// remaining amount of bytes is returned. This is done to spare caller
// trouble of calculating the largest multiple of |r|. |r| can be viewed
// as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
// 72, but can also be (1600 - 448)/8 = 144. All this means that message
// padding and intermediate sub-block buffering, byte- or bitwise, is
// caller's responsibility.
// KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state |A|.
// |len| must be a multiple of 8.
// KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state
// |A|, assembling each group of eight input bytes into a little-endian
// 64-bit lane. |len| must be a multiple of 8 and at most
// SHA3_MAX_BLOCKSIZE.
static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len) {
  assert(len <= SHA3_MAX_BLOCKSIZE);
  assert((len % 8) == 0);

  uint64_t *lanes = (uint64_t *)A;
  const size_t num_lanes = len / 8;
  for (size_t i = 0; i < num_lanes; i++, inp += 8) {
    uint64_t lane = 0;
    for (size_t b = 0; b < 8; b++) {
      lane |= (uint64_t)inp[b] << (8 * b);
    }
    lanes[i] ^= lane;
  }
}
// Keccak1600_Absorb XORs and permutes as many whole |r|-byte blocks of
// |inp| into |A| as fit in |len|, returning the number of leftover bytes.
// Buffering and padding of the tail are the caller's responsibility.
size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len,
                         size_t r) {
  assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
  for (; len >= r; len -= r, inp += r) {
    KeccakF1600_XORBytes(A, inp, r);
    KeccakF1600(A);
  }
  return len;
}
// KeccakF1600_ExtractBytes extracts |len| bytes from the Keccak state |A| into |out|.
// This function operates on up to block_size bytes (a single block). For extracting
// more data, the state must be processed again through KeccakF1600 (see Keccak1600_Squeeze).
// KeccakF1600_ExtractBytes serializes |len| bytes of the Keccak state |A|
// into |out|, emitting each 64-bit lane in little-endian byte order. It
// handles at most one block (|len| <= SHA3_MAX_BLOCKSIZE); for more output
// the state must be permuted again (see Keccak1600_Squeeze).
static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len) {
  const uint64_t *lanes = (const uint64_t *)A;
  assert(len <= SHA3_MAX_BLOCKSIZE);

  size_t i = 0;
  // Full 8-byte lanes first.
  while (len >= 8) {
    const uint64_t lane = lanes[i++];
    for (size_t b = 0; b < 8; b++) {
      out[b] = (uint8_t)(lane >> (8 * b));
    }
    out += 8;
    len -= 8;
  }
  // Then the partial tail lane, low bytes first.
  if (len != 0) {
    const uint64_t lane = lanes[i];
    for (size_t b = 0; b < len; b++) {
      out[b] = (uint8_t)(lane >> (8 * b));
    }
  }
}
// Keccak1600_Squeeze writes |len| output bytes from state |A| in blocks of
// at most |r| bytes. |padded| tells whether the state already holds an
// unconsumed permutation result from absorb time; every subsequent block
// requires a fresh permutation before extraction.
void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len, size_t r, int padded) {
  assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
  int permute = padded;
  while (len != 0) {
    if (permute) {
      KeccakF1600(A);
    }
    permute = 1;
    const size_t todo = (len < r) ? len : r;
    KeccakF1600_ExtractBytes(A, out, todo);
    out += todo;
    len -= todo;
  }
}
#if defined(KECCAK1600_ASM)
// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl
extern void KeccakF1600_hw(uint64_t state[25]);
#if defined(OPENSSL_AARCH64)
// keccak_log_dispatch records, in dispatch-test builds, that the assembly
// path identified by |id| was taken; it is a no-op otherwise.
static void keccak_log_dispatch(size_t id) {
#if BORINGSSL_DISPATCH_TEST
  BORINGSSL_function_hit[id] = 1;
#endif
}
#endif
// KeccakF1600 applies the Keccak-f[1600] permutation to |A|, dispatching to
// the fastest available assembly implementation for the running CPU.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  // Dispatch logic for Keccak-x1 on AArch64:
  //
  // 1. If ASM is disabled, we use the C implementation.
  // 2. If ASM is enabled:
  //    - For Neoverse N1, V1, V2, we use scalar Keccak assembly from s2n-bignum
  //      (`sha3_keccak_f1600()`)
  //      leveraging lazy rotations from https://eprint.iacr.org/2022/1243.
  //    - Otherwise, if the Neon SHA3 extension is supported, we use the Neon
  //      Keccak assembly from s2n-bignum (`sha3_keccak_f1600_alt()`),
  //      leveraging that extension.
  //    - Otherwise, fall back to scalar Keccak implementation from OpenSSL,
  //      (`KeccakF1600_hw()`), not using lazy rotations.
  //
  // Lazy rotations improve performance by up to 10% on CPUs with free
  // Barrel shifting, which includes Neoverse N1, V1, and V2. Not all
  // CPUs have free Barrel shifting (e.g. Apple M1 or Cortex-A72), so we
  // don't use it by default.
  //
  // Neoverse V1 and V2 do support SHA3 instructions, but they are only
  // implemented on 1/4 of Neon units, and are thus slower than a scalar
  // implementation.
#if defined(OPENSSL_AARCH64)
#if defined(KECCAK1600_S2N_BIGNUM_ASM)
  if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
    keccak_log_dispatch(10); // kFlag_sha3_keccak_f1600
    sha3_keccak_f1600((uint64_t *)A, iotas);
    return;
  }
#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
  if (CRYPTO_is_ARMv8_SHA3_capable()) {
    keccak_log_dispatch(11); // kFlag_sha3_keccak_f1600_alt
    sha3_keccak_f1600_alt((uint64_t *)A, iotas);
    return;
  }
#endif
#endif
  // Default AArch64 path: OpenSSL's scalar assembly.
  keccak_log_dispatch(9); // kFlag_KeccakF1600_hw
  KeccakF1600_hw((uint64_t *) A);
#elif defined(OPENSSL_X86_64)
  sha3_keccak_f1600((uint64_t *)A, iotas);
#endif
}
#else // KECCAK1600_ASM
// KeccakF1600 applies the Keccak-f[1600] permutation to |A|. In builds
// without assembly support this simply forwards to the portable C code.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  KeccakF1600_c(A);
}
#endif // !KECCAK1600_ASM
// KeccakF1600_XORBytes_x4 XORs |len| bytes from |inp0|, |inp1|, |inp2|, |inp3|
// into the four Keccak states in |A|. |len| must be a multiple of 8.
// KeccakF1600_XORBytes_x4 XORs |len| bytes from each of |inp0|..|inp3| into
// the corresponding one of the four Keccak states in |A|. |len| must be a
// multiple of 8.
static void KeccakF1600_XORBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                    const uint8_t *inp0, const uint8_t *inp1,
                                    const uint8_t *inp2, const uint8_t *inp3,
                                    size_t len) {
  const uint8_t *inputs[4] = {inp0, inp1, inp2, inp3};
  for (size_t j = 0; j < 4; j++) {
    KeccakF1600_XORBytes(A[j], inputs[j], len);
  }
}
// KeccakF1600_ExtractBytes_x4 extracts |len| bytes from the four Keccak states in |A|
// into |out0|, |out1|, |out2|, |out3|.
// KeccakF1600_ExtractBytes_x4 serializes |len| bytes from each of the four
// Keccak states in |A| into the corresponding one of |out0|..|out3|.
static void KeccakF1600_ExtractBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                        uint8_t *out0, uint8_t *out1,
                                        uint8_t *out2, uint8_t *out3,
                                        size_t len) {
  uint8_t *outputs[4] = {out0, out1, out2, out3};
  for (size_t j = 0; j < 4; j++) {
    KeccakF1600_ExtractBytes(A[j], outputs[j], len);
  }
}
// Keccak1600_x4 applies Keccak-f[1600] to all four states in |A|,
// dispatching to the fastest available batched assembly for the running CPU.
static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  // Dispatch logic for Keccak-x4 on AArch64:
  //
  // 1. If ASM is disabled, we use 4x the C implementation.
  // 2. If ASM is enabled:
  //    - For Neoverse N1, we use scalar batched hybrid Keccak assembly from s2n-bignum
  //      (`sha3_keccak4_f1600_alt()`) leveraging Neon and scalar assembly with
  //      lazy rotations.
  //    - For Neoverse V1, V2, we use SIMD batched hybrid Keccak assembly from s2n-bignum
  //      (`sha3_keccak4_f1600_alt2()`) leveraging Neon, Neon SHA3 extension,
  //      and scalar assembly with lazy rotations.
  //    - Otherwise, if the Neon SHA3 extension is supported, we use the 2-fold
  //      Keccak assembly from s2n-bignum (`sha3_keccak2_f1600()`) twice,
  //      which is a straightforward implementation using the SHA3 extension.
  //    - Otherwise, fall back to four times the 1-fold Keccak implementation
  //      (which has its own dispatch logic).
#if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64)
  if (CRYPTO_is_Neoverse_N1()) {
    keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt
    sha3_keccak4_f1600_alt((uint64_t *)A, iotas);
    return;
  }
#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
  if (CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
    keccak_log_dispatch(14); // kFlag_sha3_keccak4_f1600_alt2
    sha3_keccak4_f1600_alt2((uint64_t *)A, iotas);
    return;
  }
  if (CRYPTO_is_ARMv8_SHA3_capable()) {
    keccak_log_dispatch(12); // kFlag_sha3_keccak2_f1600
    // Use 2-fold function twice: A[0:1] and A[2:3]
    sha3_keccak2_f1600((uint64_t *)&A[0], iotas);
    sha3_keccak2_f1600((uint64_t *)&A[2], iotas);
    return;
  }
#endif
#endif
  // Fallback: 4x individual KeccakF1600 calls (each with their own dispatch)
  KeccakF1600(A[0]);
  KeccakF1600(A[1]);
  KeccakF1600(A[2]);
  KeccakF1600(A[3]);
}
// One-shot absorb + finalize. Note that in contrast to non-batched Keccak,
// this does _not_ run a Keccak permutation at the end, allowing for a uniform
// implementation of Keccak1600_Squeezeblocks_x4() without the `padded`
// parameter used in the non-batched implementation.
// Keccak1600_Absorb_once_x4 absorbs exactly |len| bytes from each of the
// four inputs into the four Keccak states in |A|, then XORs in the padded
// final block using domain-separation byte |p|. No trailing permutation is
// run; the first squeeze performs it.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                               const uint8_t *inp0, const uint8_t *inp1,
                               const uint8_t *inp2, const uint8_t *inp3,
                               size_t len, size_t r, uint8_t p) {
  assert(r <= SHA3_MAX_BLOCKSIZE);
  const uint8_t *inp[4] = {inp0, inp1, inp2, inp3};

  // Absorb all complete rate-sized blocks.
  while (len >= r) {
    KeccakF1600_XORBytes_x4(A, inp[0], inp[1], inp[2], inp[3], r);
    Keccak1600_x4(A);
    for (size_t j = 0; j < 4; j++) {
      inp[j] += r;
    }
    len -= r;
  }

  // Assemble each lane's padded final block: remainder bytes, the
  // domain-separation byte |p| at offset |len|, and 0x80 in the last byte
  // of the block. When len == r - 1 both pad bytes share one position, so
  // they are merged into |p|.
  alignas(16) uint8_t block[4][SHA3_MAX_BLOCKSIZE] = {{0}};
  for (size_t j = 0; j < 4; j++) {
    OPENSSL_memcpy(block[j], inp[j], len);
  }
  if (len == r - 1) {
    p |= 128;
  } else {
    for (size_t j = 0; j < 4; j++) {
      block[j][r - 1] |= 128;
    }
  }
  for (size_t j = 0; j < 4; j++) {
    block[j][len] |= p;
  }
  KeccakF1600_XORBytes_x4(A, block[0], block[1], block[2], block[3], r);

  // Wipe the stack copies of caller data.
  OPENSSL_cleanse(block, sizeof(block));
}
// Keccak1600_Squeezeblocks_x4 squeezes |num_blocks| blocks of |r| bytes
// from each of the four Keccak states in |A| into the corresponding output
// buffer, permuting all four states before each block is extracted.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out0, uint8_t *out1,
                                 uint8_t *out2, uint8_t *out3,
                                 size_t num_blocks, size_t r) {
  uint8_t *outs[4] = {out0, out1, out2, out3};
  for (size_t b = 0; b < num_blocks; b++) {
    Keccak1600_x4(A);
    KeccakF1600_ExtractBytes_x4(A, outs[0], outs[1], outs[2], outs[3], r);
    for (size_t j = 0; j < 4; j++) {
      outs[j] += r;
    }
  }
}

View File

@@ -0,0 +1,306 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Altivec-optimized SHA1 in C. This is tested on ppc64le only.
//
// References:
// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
// http://arctic.org/~dean/crypto/sha1.html
//
// This code used the generic SHA-1 from OpenSSL as a basis and AltiVec
// optimisations were added on top.
#include <openssl/sha.h>
#if defined(OPENSSL_PPC64LE)
#include <altivec.h>
// rotate returns |a| rotated left by |n| bits; |n| must be in (0, 32).
static uint32_t rotate(uint32_t a, int n) {
  const uint32_t hi = a << n;
  const uint32_t lo = a >> (32 - n);
  return hi | lo;
}
typedef vector unsigned int vec_uint32_t;
typedef vector unsigned char vec_uint8_t;
// Vector constants
// Permute pattern reversing the byte order within each 32-bit word.
static const vec_uint8_t k_swap_endianness = {3, 2, 1, 0, 7, 6, 5, 4,
                                              11, 10, 9, 8, 15, 14, 13, 12};
// Shift amounts for byte and bit shifts and rotations
// (vec_slo/vec_sro take the shift amount in bits: 32 bits = 4 bytes,
// 96 bits = 12 bytes).
static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
                                      32, 32, 32, 32, 32, 32, 32, 32};
static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
                                       96, 96, 96, 96, 96, 96, 96, 96};
// SHA-1 round constants (FIPS 180-4) for rounds 0-19, 20-39, 40-59, 60-79.
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
// Vector versions of the above.
static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};
// vector message scheduling: compute message schedule for round i..i+3 where i
// is divisible by 4. We return the schedule w[i..i+3] as a vector. In
// addition, we also precompute sum w[i..+3] and an additive constant K. This
// is done to offload some computation of f() in the integer execution units.
//
// Byte shifting code below may not be correct for big-endian systems.
// sched_00_15 loads 16 unaligned message bytes at |data|, byte-swaps each
// 32-bit word into SHA-1's big-endian word order, stores the schedule words
// pre-added with round constant |k| to |pre_added|, and returns the raw
// schedule vector w[i..i+3].
static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
                                vec_uint32_t k) {
  const vector unsigned char unaligned_data =
      vec_vsx_ld(0, (const unsigned char*) data);
  const vec_uint32_t v = (vec_uint32_t) unaligned_data;
  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
  vec_st(w + k, 0, pre_added);
  return w;
}
// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]
//
// w'[i ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
// w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
// w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
// w'[i+3] = ( 0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
//
// w[ i] = w'[ i]
// w[i+1] = w'[i+1]
// w[i+2] = w'[i+2]
// w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
// sched_16_31 computes schedule words w[i..i+3] for i in [16, 31] from the
// previous four schedule vectors, following the two-stage formula in the
// comment above. Stores w + |k| to |pre_added| and returns w.
// NOTE(review): the byte shifts assume little-endian lane order — see the
// big-endian caveat above.
static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_12,
                                vec_uint32_t minus_16, vec_uint32_t k) {
  // Shift by 4 bytes to realign w[i-4..i-1] as w[i-3..i-1] (last lane 0).
  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
  // Combine halves of the two oldest vectors to obtain w[i-14..i-11].
  const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8);
  const vec_uint32_t k_1_bit = vec_splat_u32(1);
  const vec_uint32_t w_prime =
      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
  // Fix up the last element: w[i+3] ^= w'[i] <<< 1, with w'[i] moved into
  // the final lane by a 12-byte shift.
  const vec_uint32_t w =
      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
  vec_st(w + k, 0, pre_added);
  return w;
}
// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76]
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2
// sched_32_79 computes schedule words w[i..i+3] for i in [32, 76] using the
// distance-doubled recurrence above, which avoids the cross-lane fix-up
// needed by sched_16_31. Stores w + |k| to |pre_added| and returns w.
static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_16,
                                vec_uint32_t minus_28, vec_uint32_t minus_32,
                                vec_uint32_t k) {
  // Combine halves of two previous vectors to obtain w[i-6..i-3].
  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
  const vec_uint32_t k_2_bits = vec_splat_u32(2);
  const vec_uint32_t w =
      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
  vec_st(w + k, 0, pre_added);
  return w;
}
// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define
// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another
// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)
// We pre-added the K constants during message scheduling.
#define BODY_00_19(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_20_39(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_40_59(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_60_79(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
// sha1_block_data_order hashes |num| 64-byte blocks from |data| into the
// five-word SHA-1 chaining value |state|. The message schedule (with the
// round constant pre-added) is computed four words at a time with AltiVec
// while the scalar round bodies consume earlier schedule words.
void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
  uint32_t A, B, C, D, E, T;
  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];
  for (;;) {
    // 20 vectors = 80 schedule words; the rounds read them back as scalars
    // through |w|.
    vec_uint32_t vw[20];
    const uint32_t *w = (const uint32_t *)&vw;

    // Rounds 0-19 (F = Ch), schedule words 0-19 computed with K_00_19.
    vec_uint32_t k = K_00_19_x_4;
    const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
    BODY_00_19(0, A, B, C, D, E, T);
    BODY_00_19(1, T, A, B, C, D, E);
    BODY_00_19(2, E, T, A, B, C, D);
    BODY_00_19(3, D, E, T, A, B, C);
    const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
    BODY_00_19(4, C, D, E, T, A, B);
    BODY_00_19(5, B, C, D, E, T, A);
    BODY_00_19(6, A, B, C, D, E, T);
    BODY_00_19(7, T, A, B, C, D, E);
    const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
    BODY_00_19(8, E, T, A, B, C, D);
    BODY_00_19(9, D, E, T, A, B, C);
    BODY_00_19(10, C, D, E, T, A, B);
    BODY_00_19(11, B, C, D, E, T, A);
    const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
    BODY_00_19(12, A, B, C, D, E, T);
    BODY_00_19(13, T, A, B, C, D, E);
    BODY_00_19(14, E, T, A, B, C, D);
    BODY_00_19(15, D, E, T, A, B, C);
    const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
    BODY_00_19(16, C, D, E, T, A, B);
    BODY_00_19(17, B, C, D, E, T, A);
    BODY_00_19(18, A, B, C, D, E, T);
    BODY_00_19(19, T, A, B, C, D, E);
    // Rounds 20-39 (F = Parity) switch to K_20_39.
    k = K_20_39_x_4;
    const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
    BODY_20_39(20, E, T, A, B, C, D);
    BODY_20_39(21, D, E, T, A, B, C);
    BODY_20_39(22, C, D, E, T, A, B);
    BODY_20_39(23, B, C, D, E, T, A);
    const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
    BODY_20_39(24, A, B, C, D, E, T);
    BODY_20_39(25, T, A, B, C, D, E);
    BODY_20_39(26, E, T, A, B, C, D);
    BODY_20_39(27, D, E, T, A, B, C);
    const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
    BODY_20_39(28, C, D, E, T, A, B);
    BODY_20_39(29, B, C, D, E, T, A);
    BODY_20_39(30, A, B, C, D, E, T);
    BODY_20_39(31, T, A, B, C, D, E);
    // From word 32 on, the four-lane recurrence of sched_32_79 applies.
    const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
    BODY_20_39(32, E, T, A, B, C, D);
    BODY_20_39(33, D, E, T, A, B, C);
    BODY_20_39(34, C, D, E, T, A, B);
    BODY_20_39(35, B, C, D, E, T, A);
    const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
    BODY_20_39(36, A, B, C, D, E, T);
    BODY_20_39(37, T, A, B, C, D, E);
    BODY_20_39(38, E, T, A, B, C, D);
    BODY_20_39(39, D, E, T, A, B, C);
    // Rounds 40-59 (F = Maj) use K_40_59.
    k = K_40_59_x_4;
    const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
    BODY_40_59(40, C, D, E, T, A, B);
    BODY_40_59(41, B, C, D, E, T, A);
    BODY_40_59(42, A, B, C, D, E, T);
    BODY_40_59(43, T, A, B, C, D, E);
    const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
    BODY_40_59(44, E, T, A, B, C, D);
    BODY_40_59(45, D, E, T, A, B, C);
    BODY_40_59(46, C, D, E, T, A, B);
    BODY_40_59(47, B, C, D, E, T, A);
    const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
    BODY_40_59(48, A, B, C, D, E, T);
    BODY_40_59(49, T, A, B, C, D, E);
    BODY_40_59(50, E, T, A, B, C, D);
    BODY_40_59(51, D, E, T, A, B, C);
    const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
    BODY_40_59(52, C, D, E, T, A, B);
    BODY_40_59(53, B, C, D, E, T, A);
    BODY_40_59(54, A, B, C, D, E, T);
    BODY_40_59(55, T, A, B, C, D, E);
    const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
    BODY_40_59(56, E, T, A, B, C, D);
    BODY_40_59(57, D, E, T, A, B, C);
    BODY_40_59(58, C, D, E, T, A, B);
    BODY_40_59(59, B, C, D, E, T, A);
    // Rounds 60-79 (F = Parity) use K_60_79.
    k = K_60_79_x_4;
    const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
    BODY_60_79(60, A, B, C, D, E, T);
    BODY_60_79(61, T, A, B, C, D, E);
    BODY_60_79(62, E, T, A, B, C, D);
    BODY_60_79(63, D, E, T, A, B, C);
    const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
    BODY_60_79(64, C, D, E, T, A, B);
    BODY_60_79(65, B, C, D, E, T, A);
    BODY_60_79(66, A, B, C, D, E, T);
    BODY_60_79(67, T, A, B, C, D, E);
    const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
    BODY_60_79(68, E, T, A, B, C, D);
    BODY_60_79(69, D, E, T, A, B, C);
    BODY_60_79(70, C, D, E, T, A, B);
    BODY_60_79(71, B, C, D, E, T, A);
    const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
    BODY_60_79(72, A, B, C, D, E, T);
    BODY_60_79(73, T, A, B, C, D, E);
    BODY_60_79(74, E, T, A, B, C, D);
    BODY_60_79(75, D, E, T, A, B, C);
    // We don't use the last value
    (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
    BODY_60_79(76, C, D, E, T, A, B);
    BODY_60_79(77, B, C, D, E, T, A);
    BODY_60_79(78, A, B, C, D, E, T);
    BODY_60_79(79, T, A, B, C, D, E);
    // Fold the working variables back into the chaining value. The
    // 80-round variable rotation leaves the logical (a,b,c,d,e) in
    // (E,T,A,B,C). The mask is a no-op for 32-bit state words.
    const uint32_t mask = 0xffffffffUL;
    state[0] = (state[0] + E) & mask;
    state[1] = (state[1] + T) & mask;
    state[2] = (state[2] + A) & mask;
    state[3] = (state[3] + B) & mask;
    state[4] = (state[4] + C) & mask;
    data += 64;
    if (--num == 0) {
      break;
    }
    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}
#endif // OPENSSL_PPC64LE
#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79

View File

@@ -0,0 +1,390 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "../../internal.h"
#include "../digest/md32_common.h"
#include "internal.h"
int SHA1_Init(SHA_CTX *sha) {
  // FIPS 180-4 section 5.3.1 initial hash value for SHA-1.
  static const uint32_t kIV[5] = {0x67452301UL, 0xefcdab89UL, 0x98badcfeUL,
                                  0x10325476UL, 0xc3d2e1f0UL};
  // Zero the entire context (length counters, buffer) before seeding h.
  OPENSSL_memset(sha, 0, sizeof(SHA_CTX));
  for (size_t i = 0; i < 5; i++) {
    sha->h[i] = kIV[i];
  }
  return 1;
}
int SHA1_Init_from_state(SHA_CTX *sha, const uint8_t h[SHA1_CHAINING_LENGTH],
                         uint64_t n) {
  // |n| is the number of bits hashed so far; restoring mid-block is not
  // supported, so it must sit exactly on a block boundary.
  if (n % ((uint64_t)SHA_CBLOCK * 8) != 0) {
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA_CTX));
  // Deserialize the big-endian chaining value into h[0..4].
  for (size_t word = 0; word < SHA1_CHAINING_LENGTH / 4; word++) {
    sha->h[word] = CRYPTO_load_u32_be(h + 4 * word);
  }
  // Split the 64-bit bit count across the two 32-bit counters.
  sha->Nh = (uint32_t)(n >> 32);
  sha->Nl = (uint32_t)(n & 0xffffffff);
  return 1;
}
uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA_CTX ctx;
  int ok = SHA1_Init(&ctx);
  ok = ok && SHA1_Update(&ctx, data, len);
  ok = ok && SHA1_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC)
static void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
// SHA1_Transform runs the compression function over exactly one 64-byte
// block, updating only the chaining value in |c| (not the length counters).
void SHA1_Transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) {
  sha1_block_data_order(c->h, data, 1);
}
// SHA1_Update absorbs |len| bytes: whole blocks go straight through the
// compression function, partial blocks are buffered in |c->data|, and the
// running bit count is tracked in Nh/Nl by crypto_md32_update.
int SHA1_Update(SHA_CTX *c, const void *data, size_t len) {
  crypto_md32_update(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num,
                     &c->Nh, &c->Nl, data, len);
  return 1;
}
int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) {
  // Append the Merkle-Damgard padding and length, flushing final blocks.
  crypto_md32_final(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num,
                    c->Nh, c->Nl, /*is_big_endian=*/1);
  // Serialize the five chaining words big-endian into the 20-byte digest.
  for (size_t word = 0; word < 5; word++) {
    CRYPTO_store_u32_be(out + 4 * word, c->h[word]);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
int SHA1_get_state(SHA_CTX *ctx, uint8_t out_h[SHA1_CHAINING_LENGTH],
                   uint64_t *out_n) {
  // The chaining value is only meaningful on a block boundary. Nl holds the
  // low 32 bits of the bit count and the block bit size divides 2^32, so
  // checking Nl alone is sufficient.
  if (ctx->Nl % ((uint64_t)SHA_CBLOCK * 8) != 0) {
    return 0;
  }
  // Serialize h[0..4] big-endian.
  for (size_t word = 0; word < SHA1_CHAINING_LENGTH / 4; word++) {
    CRYPTO_store_u32_be(out_h + 4 * word, ctx->h[word]);
  }
  // Recombine the split 32-bit counters into the 64-bit bit count.
  *out_n = (((uint64_t)ctx->Nh) << 32) + ctx->Nl;
  return 1;
}
// Xupdate computes the next message-schedule word: the XOR of four earlier
// words rotated left by one (FIPS 180-4, section 6.1.2 step 1), storing it
// into both |a| and the schedule slot |ix|.
#define Xupdate(a, ix, ia, ib, ic, id)    \
  do {                                    \
    (a) = ((ia) ^ (ib) ^ (ic) ^ (id));    \
    (ix) = (a) = CRYPTO_rotl_u32((a), 1); \
  } while (0)
// SHA-1 round constants for the four 20-round phases (FIPS 180-4, 4.2.1).
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define
// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another
// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)
// BODY_00_15 is one round using a freshly loaded message word |xi| (no
// schedule update yet). |f| receives the new working value; |b| is rotated.
#define BODY_00_15(i, a, b, c, d, e, f, xi)               \
  do {                                                    \
    (f) = (xi) + (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + \
          F_00_19((b), (c), (d));                         \
    (b) = CRYPTO_rotl_u32((b), 30);                       \
  } while (0)
// BODY_16_19 combines the schedule update (Xupdate) with a phase-1 round.
#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
  do {                                                                       \
    Xupdate(f, xi, xa, xb, xc, xd);                                          \
    (f) += (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + F_00_19((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
// BODY_20_31 stores the updated schedule word into a distinct slot |xi|.
#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
  do {                                                                       \
    Xupdate(f, xi, xa, xb, xc, xd);                                          \
    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
// From round 32 on, the updated word overwrites |xa| itself, so the macros
// below take one fewer schedule argument.
#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
  do {                                                                       \
    Xupdate(f, xa, xa, xb, xc, xd);                                          \
    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
  do {                                                                       \
    Xupdate(f, xa, xa, xb, xc, xd);                                          \
    (f) += (e) + K_40_59 + CRYPTO_rotl_u32((a), 5) + F_40_59((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd)    \
  do {                                                     \
    Xupdate(f, xa, xa, xb, xc, xd);                        \
    (f) = (xa) + (e) + K_60_79 + CRYPTO_rotl_u32((a), 5) + \
          F_60_79((b), (c), (d));                          \
    (b) = CRYPTO_rotl_u32((b), 30);                        \
  } while (0)
#ifdef X
#undef X
#endif
/* Originally X was an array. As it's automatic it's natural
 * to expect RISC compiler to accomodate at least part of it in
 * the register bank, isn't it? Unfortunately not all compilers
 * "find" this expectation reasonable:-( On order to make such
 * compilers generate better code I replace X[] with a bunch of
 * X0, X1, etc. See the function body below...
 * <appro@fy.chalmers.se> */
#define X(i) XX##i
#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC)
#if !defined(SHA1_ASM_NOHW)
// sha1_block_data_order_nohw is the portable C SHA-1 compression function:
// it hashes |num| 64-byte blocks from |data| into the five-word chaining
// value |state|. All 80 rounds are fully unrolled; instead of rotating the
// five working variables, each round's argument list is rotated so that |T|
// always receives the new value.
static void sha1_block_data_order_nohw(uint32_t state[5], const uint8_t *data,
                                       size_t num) {
  register uint32_t A, B, C, D, E, T;
  // 16-word sliding message-schedule window, held in scalars XX0..XX15 and
  // accessed via the X(i) macro (see the comment above its definition).
  uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
      XX11, XX12, XX13, XX14, XX15;

  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];

  for (;;) {
    // Rounds 0-15: big-endian loads interleaved with the rounds.
    X(0) = CRYPTO_load_u32_be(data);
    data += 4;
    X(1) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(0, A, B, C, D, E, T, X(0));
    X(2) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(1, T, A, B, C, D, E, X(1));
    X(3) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(2, E, T, A, B, C, D, X(2));
    X(4) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(3, D, E, T, A, B, C, X(3));
    X(5) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(4, C, D, E, T, A, B, X(4));
    X(6) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(5, B, C, D, E, T, A, X(5));
    X(7) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(6, A, B, C, D, E, T, X(6));
    X(8) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(7, T, A, B, C, D, E, X(7));
    X(9) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(8, E, T, A, B, C, D, X(8));
    X(10) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(9, D, E, T, A, B, C, X(9));
    X(11) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(10, C, D, E, T, A, B, X(10));
    X(12) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(11, B, C, D, E, T, A, X(11));
    X(13) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(12, A, B, C, D, E, T, X(12));
    X(14) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(13, T, A, B, C, D, E, X(13));
    X(15) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(14, E, T, A, B, C, D, X(14));
    BODY_00_15(15, D, E, T, A, B, C, X(15));
    // Rounds 16-19: schedule expansion begins, still with F_00_19/K_00_19.
    BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13));
    BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14));
    BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15));
    BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0));
    // Rounds 20-39: parity function with K_20_39.
    BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1));
    BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2));
    BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3));
    BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4));
    BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5));
    BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6));
    BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7));
    BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8));
    BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9));
    BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10));
    BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11));
    BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12));
    BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13));
    BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14));
    BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15));
    BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0));
    BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1));
    BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2));
    BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3));
    BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4));
    // Rounds 40-59: majority function with K_40_59.
    BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5));
    BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6));
    BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7));
    BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8));
    BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9));
    BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X(5), X(10));
    BODY_40_59(46, C, D, E, T, A, B, X(14), X(0), X(6), X(11));
    BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12));
    BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13));
    BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14));
    BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15));
    BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0));
    BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1));
    BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2));
    BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3));
    BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4));
    BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5));
    BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6));
    BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7));
    BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8));
    // Rounds 60-79: parity function with K_60_79.
    BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9));
    BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10));
    BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11));
    BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12));
    BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13));
    BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14));
    BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15));
    BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0));
    BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1));
    BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2));
    BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3));
    BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4));
    BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5));
    BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6));
    BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7));
    BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8));
    BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9));
    BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10));
    BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11));
    BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12));
    // Fold the working variables back into the chaining value; the round
    // rotation leaves the logical values in (E, T, A, B, C). Masking is a
    // no-op for 32-bit state words.
    state[0] = (state[0] + E) & 0xffffffffL;
    state[1] = (state[1] + T) & 0xffffffffL;
    state[2] = (state[2] + A) & 0xffffffffL;
    state[3] = (state[3] + B) & 0xffffffffL;
    state[4] = (state[4] + C) & 0xffffffffL;
    if (--num == 0) {
      break;
    }
    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}
#endif // !SHA1_ASM_NOHW
// sha1_block_data_order dispatches to the fastest compiled-in SHA-1
// implementation the running CPU supports, in priority order: dedicated SHA
// instructions, AVX2, AVX, SSSE3, NEON, then the portable C fallback.
static void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
                                  size_t num) {
#if defined(SHA1_ASM_HW)
  if (sha1_hw_capable()) {
    sha1_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha1_avx2_capable()) {
    sha1_block_data_order_avx2(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha1_avx_capable()) {
    sha1_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_SSSE3)
  if (sha1_ssse3_capable()) {
    sha1_block_data_order_ssse3(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha1_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha1_block_data_order_nohw(state, data, num);
}
#endif // !SHA1_ASM && !SHA1_ALTIVEC
#undef Xupdate
#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_15
#undef BODY_16_19
#undef BODY_20_31
#undef BODY_32_39
#undef BODY_40_59
#undef BODY_60_79
#undef X

View File

@@ -0,0 +1,378 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "../../internal.h"
#include "../digest/md32_common.h"
#include "internal.h"
int SHA224_Init(SHA256_CTX *sha) {
  // FIPS 180-4 section 5.3.2 initial hash value for SHA-224.
  static const uint32_t kIV224[8] = {
      0xc1059ed8UL, 0x367cd507UL, 0x3070dd17UL, 0xf70e5939UL,
      0xffc00b31UL, 0x68581511UL, 0x64f98fa7UL, 0xbefa4fa4UL};
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV224[i];
  }
  // md_len distinguishes a SHA-224 context from SHA-256 at finalization.
  sha->md_len = SHA224_DIGEST_LENGTH;
  return 1;
}
int SHA256_Init(SHA256_CTX *sha) {
  // FIPS 180-4 section 5.3.3 initial hash value for SHA-256.
  static const uint32_t kIV256[8] = {
      0x6a09e667UL, 0xbb67ae85UL, 0x3c6ef372UL, 0xa54ff53aUL,
      0x510e527fUL, 0x9b05688cUL, 0x1f83d9abUL, 0x5be0cd19UL};
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV256[i];
  }
  // md_len distinguishes a SHA-256 context from SHA-224 at finalization.
  sha->md_len = SHA256_DIGEST_LENGTH;
  return 1;
}
// SHA-224 and SHA-256 share SHA256_CTX, so their serialized chaining values
// must be the same size for the shared *_Init_from_state/_get_state helpers
// below to be correct.
OPENSSL_STATIC_ASSERT(SHA256_CHAINING_LENGTH==SHA224_CHAINING_LENGTH,
                      sha256_and_sha224_have_same_chaining_length)
// sha256_init_from_state_impl is the implementation of
// SHA256_Init_from_state and SHA224_Init_from_state
// Note that the state h is always SHA256_CHAINING_LENGTH-byte long
static int sha256_init_from_state_impl(SHA256_CTX *sha, int md_len,
                                       const uint8_t h[SHA256_CHAINING_LENGTH],
                                       uint64_t n) {
  // |n| counts bits hashed so far; restoring mid-block is unsupported.
  if (n % ((uint64_t)SHA256_CBLOCK * 8) != 0) {
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  sha->md_len = md_len;
  // Deserialize the big-endian chaining value into h[0..7].
  for (size_t word = 0; word < SHA256_CHAINING_LENGTH / 4; word++) {
    sha->h[word] = CRYPTO_load_u32_be(h + 4 * word);
  }
  // Split the 64-bit bit count across the two 32-bit counters.
  sha->Nh = (uint32_t)(n >> 32);
  sha->Nl = (uint32_t)(n & 0xffffffff);
  return 1;
}
// SHA224_Init_from_state restores a SHA-224 context from a serialized
// chaining value |h| and a previously hashed bit count |n| (which must be a
// multiple of the block bit size; see sha256_init_from_state_impl).
int SHA224_Init_from_state(SHA256_CTX *sha,
                           const uint8_t h[SHA224_CHAINING_LENGTH],
                           uint64_t n) {
  return sha256_init_from_state_impl(sha, SHA224_DIGEST_LENGTH, h, n);
}
// SHA256_Init_from_state is the SHA-256 counterpart of the function above.
int SHA256_Init_from_state(SHA256_CTX *sha,
                           const uint8_t h[SHA256_CHAINING_LENGTH],
                           uint64_t n) {
  return sha256_init_from_state_impl(sha, SHA256_DIGEST_LENGTH, h, n);
}
uint8_t *SHA224(const uint8_t *data, size_t len,
                uint8_t out[SHA224_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA256_CTX ctx;
  int ok = SHA224_Init(&ctx);
  ok = ok && SHA224_Update(&ctx, data, len);
  ok = ok && SHA224_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
uint8_t *SHA256(const uint8_t *data, size_t len,
                uint8_t out[SHA256_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA256_CTX ctx;
  int ok = SHA256_Init(&ctx);
  ok = ok && SHA256_Update(&ctx, data, len);
  ok = ok && SHA256_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA256_ASM)
static void sha256_block_data_order(uint32_t state[8], const uint8_t *in,
size_t num);
#endif
// SHA256_Transform runs the compression function over exactly one 64-byte
// block, updating only the chaining value in |c| (not the length counters).
void SHA256_Transform(SHA256_CTX *c, const uint8_t data[SHA256_CBLOCK]) {
  sha256_block_data_order(c->h, data, 1);
}
// SHA256_Update absorbs |len| bytes, buffering partial blocks in |c->data|
// and tracking the running bit count in Nh/Nl via crypto_md32_update.
int SHA256_Update(SHA256_CTX *c, const void *data, size_t len) {
  crypto_md32_update(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK,
                     &c->num, &c->Nh, &c->Nl, data, len);
  return 1;
}
// SHA-224 shares the SHA-256 context and compression function, so updating
// is identical.
int SHA224_Update(SHA256_CTX *ctx, const void *data, size_t len) {
  return SHA256_Update(ctx, data, len);
}
// sha256_final_impl finalizes |c| and writes the first |md_len| digest
// bytes to |out|, failing if the context was initialized for a different
// digest length.
static int sha256_final_impl(uint8_t *out, size_t md_len, SHA256_CTX *c) {
  // Flush the padding/length block first; the context is finalized even if
  // the length check below rejects the call.
  crypto_md32_final(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK,
                    &c->num, c->Nh, c->Nl, /*is_big_endian=*/1);
  // Reject a caller that mixes SHA-224 and SHA-256 entry points.
  if (c->md_len != md_len) {
    return 0;
  }
  assert(md_len % 4 == 0);
  // Serialize the leading chaining words big-endian into the digest.
  for (size_t word = 0; word < md_len / 4; word++) {
    CRYPTO_store_u32_be(out + 4 * word, c->h[word]);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
// SHA256_Final writes the 32-byte digest; fails (returns 0) if |c| was not
// initialized as a SHA-256 context.
int SHA256_Final(uint8_t out[SHA256_DIGEST_LENGTH], SHA256_CTX *c) {
  return sha256_final_impl(out, SHA256_DIGEST_LENGTH, c);
}
// SHA224_Final writes the 28-byte digest; fails (returns 0) if |ctx| was not
// initialized as a SHA-224 context.
int SHA224_Final(uint8_t out[SHA224_DIGEST_LENGTH], SHA256_CTX *ctx) {
  return sha256_final_impl(out, SHA224_DIGEST_LENGTH, ctx);
}
// sha256_get_state_impl is the implementation of
// SHA256_get_state and SHA224_get_state
// Note that the state out_h is always SHA256_CHAINING_LENGTH-byte long
static int sha256_get_state_impl(SHA256_CTX *ctx,
                                 uint8_t out_h[SHA256_CHAINING_LENGTH],
                                 uint64_t *out_n) {
  // The chaining value is only meaningful on a block boundary; Nl holds the
  // low 32 bits of the bit count, which is enough for this check.
  if (ctx->Nl % ((uint64_t)SHA256_CBLOCK * 8) != 0) {
    return 0;
  }
  // Serialize h[0..7] big-endian.
  for (size_t word = 0; word < SHA256_CHAINING_LENGTH / 4; word++) {
    CRYPTO_store_u32_be(out_h + 4 * word, ctx->h[word]);
  }
  // Recombine the split 32-bit counters into the 64-bit bit count.
  *out_n = (((uint64_t)ctx->Nh) << 32) + ctx->Nl;
  return 1;
}
// SHA224_get_state serializes the current chaining value and bit count of a
// block-aligned SHA-224 context (see sha256_get_state_impl).
int SHA224_get_state(SHA256_CTX *ctx, uint8_t out_h[SHA224_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha256_get_state_impl(ctx, out_h, out_n);
}
// SHA256_get_state is the SHA-256 counterpart of the function above.
int SHA256_get_state(SHA256_CTX *ctx, uint8_t out_h[SHA256_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha256_get_state_impl(ctx, out_h, out_n);
}
#if !defined(SHA256_ASM)
#if !defined(SHA256_ASM_NOHW)
// K256 holds the 64 SHA-256 round constants of FIPS 180-4, section 4.2.2
// (the first 32 bits of the fractional parts of the cube roots of the
// first 64 primes).
static const uint32_t K256[64] = {
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
    0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
    0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
    0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL,
    0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL,
    0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL,
    0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL,
    0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL,
    0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL,
    0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL};
// See FIPS 180-4, section 4.1.2.
// Sigma0/Sigma1 act on the working variables in each round; sigma0/sigma1
// act on message-schedule words.
#define Sigma0(x)                                       \
  (CRYPTO_rotr_u32((x), 2) ^ CRYPTO_rotr_u32((x), 13) ^ \
   CRYPTO_rotr_u32((x), 22))
#define Sigma1(x)                                       \
  (CRYPTO_rotr_u32((x), 6) ^ CRYPTO_rotr_u32((x), 11) ^ \
   CRYPTO_rotr_u32((x), 25))
#define sigma0(x) \
  (CRYPTO_rotr_u32((x), 7) ^ CRYPTO_rotr_u32((x), 18) ^ ((x) >> 3))
#define sigma1(x) \
  (CRYPTO_rotr_u32((x), 17) ^ CRYPTO_rotr_u32((x), 19) ^ ((x) >> 10))
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// ROUND_00_15 performs one round; it relies on the caller having preloaded
// T1 with the current schedule word X[i] before invoking it.
#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
  do {                                           \
    T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; \
    h = Sigma0(a) + Maj(a, b, c);                \
    d += T1;                                     \
    h += T1;                                     \
  } while (0)
// ROUND_16_63 additionally expands the 16-word schedule window in place
// (X is used as a ring buffer indexed mod 16) before running the round.
#define ROUND_16_63(i, a, b, c, d, e, f, g, h, X)      \
  do {                                                 \
    s0 = X[(i + 1) & 0x0f];                            \
    s0 = sigma0(s0);                                   \
    s1 = X[(i + 14) & 0x0f];                           \
    s1 = sigma1(s1);                                   \
    T1 = X[(i) & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f]; \
    ROUND_00_15(i, a, b, c, d, e, f, g, h);            \
  } while (0)
// sha256_block_data_order_nohw is the portable C SHA-256 compression
// function: it hashes |num| 64-byte blocks from |data| into the eight-word
// chaining value |state|. The first 16 rounds are unrolled and interleaved
// with the big-endian loads; rounds 16-63 run eight at a time so the
// working-variable rotation lines up each iteration.
static void sha256_block_data_order_nohw(uint32_t state[8], const uint8_t *data,
                                         size_t num) {
  uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
  // 16-word message-schedule ring buffer (indexed mod 16 by ROUND_16_63).
  uint32_t X[16];
  int i;

  while (num--) {
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];

    // Rounds 0-15: T1 is preloaded with each schedule word, as
    // ROUND_00_15 expects.
    T1 = X[0] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(0, a, b, c, d, e, f, g, h);
    T1 = X[1] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(1, h, a, b, c, d, e, f, g);
    T1 = X[2] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(2, g, h, a, b, c, d, e, f);
    T1 = X[3] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(3, f, g, h, a, b, c, d, e);
    T1 = X[4] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(4, e, f, g, h, a, b, c, d);
    T1 = X[5] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(5, d, e, f, g, h, a, b, c);
    T1 = X[6] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(6, c, d, e, f, g, h, a, b);
    T1 = X[7] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(7, b, c, d, e, f, g, h, a);
    T1 = X[8] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(8, a, b, c, d, e, f, g, h);
    T1 = X[9] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(9, h, a, b, c, d, e, f, g);
    T1 = X[10] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(10, g, h, a, b, c, d, e, f);
    T1 = X[11] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(11, f, g, h, a, b, c, d, e);
    T1 = X[12] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(12, e, f, g, h, a, b, c, d);
    T1 = X[13] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(13, d, e, f, g, h, a, b, c);
    T1 = X[14] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(14, c, d, e, f, g, h, a, b);
    T1 = X[15] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(15, b, c, d, e, f, g, h, a);

    // Rounds 16-63, eight per iteration so the argument rotation repeats.
    for (i = 16; i < 64; i += 8) {
      ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X);
      ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X);
      ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X);
      ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X);
      ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X);
      ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X);
      ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X);
      ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X);
    }

    // Fold the working variables back into the chaining value.
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
  }
}
#endif // !defined(SHA256_ASM_NOHW)
// sha256_block_data_order dispatches to the fastest compiled-in SHA-256
// implementation the running CPU supports, in priority order: dedicated SHA
// instructions, AVX, SSSE3, NEON, then the portable C fallback.
static void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
                                    size_t num) {
#if defined(SHA256_ASM_HW)
  if (sha256_hw_capable()) {
    sha256_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha256_avx_capable()) {
    sha256_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_SSSE3)
  if (sha256_ssse3_capable()) {
    sha256_block_data_order_ssse3(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha256_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha256_block_data_order_nohw(state, data, num);
}
#endif // !defined(SHA256_ASM)
// SHA256_TransformBlocks runs the compression function over |num_blocks|
// consecutive 64-byte blocks, updating |state| in place. Unlike the
// SHA256_CTX-based API, it performs no buffering or length tracking.
void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data,
                            size_t num_blocks) {
  sha256_block_data_order(state, data, num_blocks);
}
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj
#undef ROUND_00_15
#undef ROUND_16_63

View File

@@ -0,0 +1,520 @@
// Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
#include "internal.h"
#include <string.h>
uint8_t *SHA3_224(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_224_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_224_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_256(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_256_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_256_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_384(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_384_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_384_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_512(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_512_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_512_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHAKE128(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len) {
  // Run Init/Absorb/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHAKE_Init(&ctx, SHAKE128_BLOCKSIZE) &&
                 SHAKE_Absorb(&ctx, data, in_len) &&
                 SHAKE_Final(out, &ctx, out_len);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHAKE256(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len) {
  // Run Init/Absorb/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHAKE_Init(&ctx, SHAKE256_BLOCKSIZE) &&
                 SHAKE_Absorb(&ctx, data, in_len) &&
                 SHAKE_Final(out, &ctx, out_len);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
/*
* FIPS202 APIs manage internal input/output buffer on top of Keccak1600 API layer
*/
// FIPS202_Reset clears the Keccak state and buffer bookkeeping in |ctx| and
// returns the context to the absorb phase.
static void FIPS202_Reset(KECCAK1600_CTX *ctx) {
  ctx->buf_load = 0;
  ctx->state = KECCAK1600_STATE_ABSORB;
  OPENSSL_memset(ctx->A, 0, sizeof(ctx->A));
}
// FIPS202_Init checks the correctness of the padding character and size of
// the internal buffer. It initialises the |ctx| fields and returns 1 on
// success and 0 on failure.
static int FIPS202_Init(KECCAK1600_CTX *ctx, uint8_t pad, size_t block_size, size_t bit_len) {
  // Only the two FIPS 202 domain-separation pad bytes are accepted.
  if (pad != SHA3_PAD_CHAR && pad != SHAKE_PAD_CHAR) {
    return 0;
  }
  // The rate (block size) must fit the internal staging buffer.
  if (block_size > sizeof(ctx->buf)) {
    return 0;
  }
  FIPS202_Reset(ctx);
  ctx->block_size = block_size;
  ctx->md_size = bit_len / 8;
  ctx->pad = pad;
  return 1;
}
// FIPS202_Update checks the state of the |ctx| and processes intermediate buffer from
// previous calls. It processes |data| in blocks through |Keccak1600_Absorb| and places
// the rest in the intermediate buffer. FIPS202_Update fails if called from inappropriate
// |ctx->state| or on |Keccak1600_Absorb| error. Otherwise, it returns 1.
static int FIPS202_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  // Fix: read the input through a const-qualified pointer instead of casting
  // away the caller's const; |data| is only ever read.
  const uint8_t *in = (const uint8_t *)data;
  size_t block_size = ctx->block_size;
  size_t num, rem;

  // Absorbing is only legal before squeezing/finalization has begun.
  if (ctx->state == KECCAK1600_STATE_SQUEEZE ||
      ctx->state == KECCAK1600_STATE_FINAL ) {
    return 0;
  }

  // Case |len| equals 0 is checked in SHA3/SHAKE higher level APIs
  // Process intermediate buffer.
  num = ctx->buf_load;
  if (num != 0) {
    rem = block_size - num;
    if (len < rem) {
      // Still short of a full block: just buffer the bytes.
      OPENSSL_memcpy(ctx->buf + num, in, len);
      ctx->buf_load += len;
      return 1;
    }
    // There is enough data to fill or overflow the intermediate
    // buffer. So we append |rem| bytes and process the block,
    // leaving the rest for later processing.
    OPENSSL_memcpy(ctx->buf + num, in, rem);
    in += rem, len -= rem;
    if (Keccak1600_Absorb(ctx->A, ctx->buf, block_size, block_size) != 0 ) {
      return 0;
    }
    ctx->buf_load = 0;
    // ctx->buf is processed, ctx->buf_load is guaranteed to be zero
  }

  // Absorb whole blocks directly from the input; |Keccak1600_Absorb|
  // returns the number of unprocessed tail bytes.
  if (len >= block_size) {
    rem = Keccak1600_Absorb(ctx->A, in, len, block_size);
  } else {
    rem = len;
  }

  // Stash any tail bytes for the next call or for finalization.
  if (rem != 0) {
    OPENSSL_memcpy(ctx->buf, in + len - rem, rem);
    ctx->buf_load = rem;
  }

  return 1;
}
// FIPS202_Finalize pads and absorbs the final partial input block, ending
// the absorb phase so squeezing can begin. It must be called at most once;
// it fails (returns 0) if squeezing has already started or on a
// |Keccak1600_Absorb| error. |md| is currently unused here; the caller
// performs the actual squeeze. Returns 1 on success.
static int FIPS202_Finalize(uint8_t *md, KECCAK1600_CTX *ctx) {
  const size_t block_size = ctx->block_size;
  const size_t used = ctx->buf_load;

  if (ctx->state == KECCAK1600_STATE_SQUEEZE ||
      ctx->state == KECCAK1600_STATE_FINAL) {
    return 0;
  }

  // Apply pad10*1 domain padding. When |used| == block_size - 1, the pad
  // byte and the final 0x80 are OR'd into the same byte.
  OPENSSL_memset(ctx->buf + used, 0, block_size - used);
  ctx->buf[used] = ctx->pad;
  ctx->buf[block_size - 1] |= 0x80;

  if (Keccak1600_Absorb(ctx->A, ctx->buf, block_size, block_size) != 0) {
    return 0;
  }

  // The padded block has been absorbed; the staging buffer is empty.
  ctx->buf_load = 0;
  return 1;
}
/*
* SHA3 APIs implement SHA3 functionalities on top of FIPS202 API layer
*/
// SHA3_Init initialises |ctx| for one of the four fixed-output SHA3
// variants selected by |bit_len|. Returns 1 on success, 0 otherwise.
int SHA3_Init(KECCAK1600_CTX *ctx, size_t bit_len) {
  if (ctx == NULL) {
    return 0;
  }
  switch (bit_len) {
    case SHA3_224_DIGEST_BITLENGTH:
    case SHA3_256_DIGEST_BITLENGTH:
    case SHA3_384_DIGEST_BITLENGTH:
    case SHA3_512_DIGEST_BITLENGTH:
      // The rate (block size) is derived from the digest length.
      return FIPS202_Init(ctx, SHA3_PAD_CHAR, SHA3_BLOCKSIZE(bit_len), bit_len);
    default:
      return 0;
  }
}
// SHA3_Update absorbs |len| bytes of |data|. A NULL |data| is only
// accepted for an empty update. Returns 1 on success, 0 on failure.
int SHA3_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  if (ctx == NULL || (data == NULL && len != 0)) {
    return 0;
  }
  if (len == 0) {
    return 1;  // Nothing to absorb.
  }
  return FIPS202_Update(ctx, data, len);
}
// SHA3_Final pads and absorbs the remaining input, then squeezes the
// |ctx->md_size|-byte digest into |md|. It must be called at most once;
// the context ends in the FINAL state. Returns 1 on success, 0 on failure.
int SHA3_Final(uint8_t *md, KECCAK1600_CTX *ctx) {
  if (md == NULL || ctx == NULL) {
    return 0;
  }
  if (ctx->md_size == 0) {
    // Zero-length digest: nothing to write.
    return 1;
  }
  if (!FIPS202_Finalize(md, ctx)) {
    return 0;
  }
  Keccak1600_Squeeze(ctx->A, md, ctx->md_size, ctx->block_size, ctx->state);
  ctx->state = KECCAK1600_STATE_FINAL;
  FIPS_service_indicator_update_state();
  return 1;
}
// The SHA3-224/256/384/512 entry points are thin wrappers around the
// generic SHA3_Init/Update/Final functions with the digest length fixed.

int SHA3_224_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_224_DIGEST_BITLENGTH);
}

int SHA3_224_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_224_Final(uint8_t out[SHA3_224_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_256_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_256_DIGEST_BITLENGTH);
}

int SHA3_256_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_256_Final(uint8_t out[SHA3_256_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_384_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_384_DIGEST_BITLENGTH);
}

int SHA3_384_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_384_Final(uint8_t out[SHA3_384_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_512_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_512_DIGEST_BITLENGTH);
}

int SHA3_512_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_512_Final(uint8_t out[SHA3_512_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}
/*
* SHAKE APIs implement SHAKE functionalities on top of FIPS202 API layer
*/
// SHAKE_Init initialises |ctx| for SHAKE128 or SHAKE256, selected by the
// rate |block_size|. The XOF output length is chosen at finalisation, so
// the digest bit length passed to FIPS202_Init is 0. Returns 1 on success.
int SHAKE_Init(KECCAK1600_CTX *ctx, size_t block_size) {
  if (ctx == NULL) {
    return 0;
  }
  // Only the two standard SHAKE rates are supported.
  if (block_size == SHAKE128_BLOCKSIZE || block_size == SHAKE256_BLOCKSIZE) {
    return FIPS202_Init(ctx, SHAKE_PAD_CHAR, block_size, 0);
  }
  return 0;
}
// SHAKE_Absorb absorbs |len| bytes of |data|. A NULL |data| is only
// accepted for an empty update. Returns 1 on success, 0 on failure.
int SHAKE_Absorb(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  if (ctx == NULL || (data == NULL && len != 0)) {
    return 0;
  }
  if (len == 0) {
    return 1;  // Nothing to absorb.
  }
  return FIPS202_Update(ctx, data, len);
}
// SHAKE_Final finalises the absorb phase and squeezes |len| output bytes
// into |md| in one shot, leaving |ctx| in the FINAL state so further
// squeezes are rejected. For incremental XOF output use |SHAKE_Squeeze|
// instead. Returns 1 on success, 0 on failure.
int SHAKE_Final(uint8_t *md, KECCAK1600_CTX *ctx, size_t len) {
  if (ctx == NULL || md == NULL) {
    return 0;
  }
  ctx->md_size = len;
  if (len == 0) {
    return 1;  // Zero-length output: nothing to write.
  }
  if (!FIPS202_Finalize(md, ctx)) {
    return 0;
  }
  Keccak1600_Squeeze(ctx->A, md, ctx->md_size, ctx->block_size, ctx->state);
  ctx->state = KECCAK1600_STATE_FINAL;
  FIPS_service_indicator_update_state();
  return 1;
}
// SHAKE_Squeeze can be called multiple times for incremental XOF output.
// On the first call it pads and absorbs the final input block; afterwards
// it serves output bytes, buffering any unconsumed tail of the last
// squeezed block in |ctx->buf| for subsequent calls. It fails after
// |SHAKE_Final| has been used (FINAL state). Returns 1 on success.
int SHAKE_Squeeze(uint8_t *md, KECCAK1600_CTX *ctx, size_t len) {
  size_t block_bytes;
  if (ctx == NULL || md == NULL) {
    return 0;
  }
  ctx->md_size = len;
  if (ctx->md_size == 0) {
    // Zero-length request: nothing to write.
    return 1;
  }
  if (ctx->state == KECCAK1600_STATE_FINAL) {
    return 0;
  }
  // Skip FIPS202_Finalize if the input has already been padded and
  // the last block has been processed (i.e. not the first squeeze).
  if (ctx->state == KECCAK1600_STATE_ABSORB) {
    if (FIPS202_Finalize(md, ctx) == 0) {
      return 0;
    }
  }
  // Serve bytes left over in the output buffer from a previous partial
  // squeeze, if any. The unconsumed tail sits at the end of |ctx->buf|.
  if (ctx->buf_load != 0) {
    if (len <= ctx->buf_load) {
      // The buffered bytes fully satisfy this request.
      OPENSSL_memcpy(md, ctx->buf + ctx->block_size - ctx->buf_load, len);
      ctx->buf_load -= len;
      return 1;
    } else {
      // Drain the buffer and fall through to squeeze fresh output.
      OPENSSL_memcpy(md, ctx->buf + ctx->block_size - ctx->buf_load, ctx->buf_load);
      md += ctx->buf_load;
      len -= ctx->buf_load;
      ctx->buf_load = 0;
    }
  }
  // Squeeze all requested whole blocks directly into |md|.
  if (len > ctx->block_size) {
    block_bytes = ctx->block_size * (len / ctx->block_size);
    Keccak1600_Squeeze(ctx->A, md, block_bytes, ctx->block_size, ctx->state);
    md += block_bytes;
    len -= block_bytes;
    ctx->state = KECCAK1600_STATE_SQUEEZE;
  }
  if (len > 0) {
    // Squeeze an additional block when the remaining request is not a
    // multiple of the block size. The block is generated into |ctx->buf|;
    // only the requested bytes are copied out and the 'unused' tail is
    // kept for a sequential (incremental byte-wise) SHAKE_Squeeze call.
    Keccak1600_Squeeze(ctx->A, ctx->buf, ctx->block_size, ctx->block_size, ctx->state);
    OPENSSL_memcpy(md, ctx->buf, len);
    ctx->buf_load = ctx->block_size - len;  // Bytes still buffered for later consumption.
    ctx->state = KECCAK1600_STATE_SQUEEZE;
  }
  //FIPS_service_indicator_update_state();
  return 1;
}
/*
* SHAKE batched (x4) APIs implement SHAKE functionalities in batches of four on top of SHAKE API layer
*/
// SHAKE128_Init_x4 zeroes all four lanes of the batched context.
// Always returns 1.
int SHAKE128_Init_x4(KECCAK1600_CTX_x4 *ctx) {
  OPENSSL_memset(ctx, 0, sizeof(*ctx));
  return 1;
}

// SHAKE128_Absorb_once_x4 absorbs |len| bytes from each of the four
// inputs in a single shot (no incremental absorbing). Always returns 1.
int SHAKE128_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
                            const void *data2, const void *data3, size_t len) {
  Keccak1600_Absorb_once_x4(ctx->A, data0, data1, data2, data3, len,
                            SHAKE128_BLOCKSIZE, SHAKE_PAD_CHAR);
  return 1;
}

// SHAKE128_Squeezeblocks_x4 squeezes |blks| whole SHAKE128 blocks into
// each of the four outputs. Always returns 1.
int SHAKE128_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
                              KECCAK1600_CTX_x4 *ctx, size_t blks) {
  Keccak1600_Squeezeblocks_x4(ctx->A, md0, md1, md2, md3, blks, SHAKE128_BLOCKSIZE);
  return 1;
}
// SHAKE256_Absorb_once_x4 absorbs |len| bytes from each of the four
// inputs in a single shot at the SHAKE256 rate. Always returns 1.
static int SHAKE256_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
                                   const void *data2, const void *data3, size_t len) {
  Keccak1600_Absorb_once_x4(ctx->A, data0, data1, data2, data3,
                            len, SHAKE256_BLOCKSIZE, SHAKE_PAD_CHAR);
  return 1;
}

// SHAKE256_Squeezeblocks_x4 squeezes |blks| whole SHAKE256 blocks into
// each of the four outputs. Always returns 1.
static int SHAKE256_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
                                     KECCAK1600_CTX_x4 *ctx, size_t blks) {
  Keccak1600_Squeezeblocks_x4(ctx->A, md0, md1, md2, md3, blks, SHAKE256_BLOCKSIZE);
  return 1;
}
// SHAKE256_x4 computes four independent SHAKE256 outputs of |out_len| bytes
// over four |in_len|-byte inputs in one batched pass. Whole blocks are
// squeezed directly into the outputs; a final partial block (if any) is
// squeezed into stack scratch and copied out. Always returns 1.
int SHAKE256_x4(const uint8_t *data0, const uint8_t *data1, const uint8_t *data2,
                const uint8_t *data3, const size_t in_len,
                uint8_t *out0, uint8_t *out1, uint8_t *out2,
                uint8_t *out3, size_t out_len) {
  KECCAK1600_CTX_x4 ctx;
  OPENSSL_memset(&ctx, 0, sizeof(ctx));

  uint8_t scratch0[SHAKE256_BLOCKSIZE];
  uint8_t scratch1[SHAKE256_BLOCKSIZE];
  uint8_t scratch2[SHAKE256_BLOCKSIZE];
  uint8_t scratch3[SHAKE256_BLOCKSIZE];

  const size_t nblocks = out_len / SHAKE256_BLOCKSIZE;
  const size_t whole_bytes = nblocks * SHAKE256_BLOCKSIZE;
  const size_t tail = out_len - whole_bytes;

  SHAKE256_Absorb_once_x4(&ctx, data0, data1, data2, data3, in_len);
  SHAKE256_Squeezeblocks_x4(out0, out1, out2, out3, &ctx, nblocks);

  if (tail > 0) {
    // Squeeze one more block and copy only the bytes still needed.
    SHAKE256_Squeezeblocks_x4(scratch0, scratch1, scratch2, scratch3, &ctx, 1);
    OPENSSL_memcpy(out0 + whole_bytes, scratch0, tail);
    OPENSSL_memcpy(out1 + whole_bytes, scratch1, tail);
    OPENSSL_memcpy(out2 + whole_bytes, scratch2, tail);
    OPENSSL_memcpy(out3 + whole_bytes, scratch3, tail);
  }

  // Scrub all key-stream material from the stack.
  OPENSSL_cleanse(scratch0, sizeof(scratch0));
  OPENSSL_cleanse(scratch1, sizeof(scratch1));
  OPENSSL_cleanse(scratch2, sizeof(scratch2));
  OPENSSL_cleanse(scratch3, sizeof(scratch3));
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return 1;
}

View File

@@ -0,0 +1,585 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#include <openssl/evp.h>
#include <openssl/rand.h>
#include <openssl/sha.h>
#include <gtest/gtest.h>
#include <openssl/digest.h>
#include "../../test/file_test.h"
#include "../../test/test_util.h"
#include "internal.h"
// Set values for input/output lengths used in
// |NISTTestVectors_SHAKESqueeze| test function
#define RAND_BYTES 256
#define RAND_OUT_BYTES 256
#define RAND_BYTES_x4 34
#define RAND_OUT_BLCKS 6
#define BATCHED_x4 4
#define NUM_TESTS 10
// Table containing the length of the output to squeeze for the
// initial call, followed by an output length for each subsequent call.
// The values straddle the SHAKE128 (168-byte) and SHAKE256 (136-byte)
// rates so that partial-block buffering and block-boundary paths of the
// incremental squeeze are all exercised.
static const struct {
  size_t startsz, incsz;
} stride_tests[] = {
  // Test Edge Cases for SHAKE128 with blocksize of 168B
  { 1, 1 },
  { 8, 8 },
  { 9, 9 },
  { 10, 10 },
  { 1, 168 },
  { 1, 168/2 },
  { 1, 168/2-1 },
  { 1, 168/2+1 },
  { 1, 168*3 },
  { 168/2 - 1, 168 },
  { 168/2 - 1, 168-1 },
  { 168/2 - 1, 168+1 },
  { 168/2, 168 },
  { 168/2, 168-1 },
  { 168/2, 168+1 },
  { 168/2 + 1, 168 },
  { 168/2 + 1, 168-1 },
  { 168/2 + 1, 168+1 },
  { 168, 2 },
  { 168, 168 },
  { 168-1, 168 },
  { 168-1, 168-1 },
  { 168-1, 168+1 },
  { 168+1, 168 },
  { 168+1, 168-1 },
  { 168+1, 168+1 },
  { 168*3, 168 },
  { 168*3, 168 + 1 },
  { 168*3, 168 - 1 },
  { 168*3, 168/2 },
  { 168*3, 168/2 + 1 },
  { 168*3, 168/2 - 1 },
  // Test Edge Cases for SHAKE256 with blocksize of 136B
  { 1, 136 },
  { 1, 136/2 },
  { 1, 136/2-1 },
  { 1, 136/2+1 },
  { 1, 136*3 },
  { 8, 8 },
  { 9, 9 },
  { 10, 10 },
  { 136/2 - 1, 136 },
  { 136/2 - 1, 136-1 },
  { 136/2 - 1, 136+1 },
  { 136/2, 136 },
  { 136/2, 136-1 },
  { 136/2, 136+1 },
  { 136/2 + 1, 136 },
  { 136/2 + 1, 136-1 },
  { 136/2 + 1, 136+1 },
  { 136, 2 },
  { 136, 136 },
  { 136-1, 136 },
  { 136-1, 136-1 },
  { 136-1, 136+1 },
  { 136+1, 136 },
  { 136+1, 136-1 },
  { 136+1, 136+1 },
  { 136*3, 136 },
  { 136*3, 136 + 1 },
  { 136*3, 136 - 1 },
  { 136*3, 136/2 },
  { 136*3, 136/2 + 1 },
  { 136*3, 136/2 - 1 }
};
// SHA3TestVector corresponds to one test case of the NIST published file
// SHA3_256ShortMsg.txt.
// https://csrc.nist.gov/projects/cryptographic-algorithm-validation-program/secure-hashing
class SHA3TestVector {
 public:
  explicit SHA3TestVector() = default;
  ~SHA3TestVector() = default;

  // Parses one test case (Outputlen/Len/Msg/MD attributes) from |t|.
  bool ReadFromFileTest(FileTest *t);

  // Checks a fixed-output SHA3 digest via the Init/Update/Final EVP APIs,
  // and asserts that the XOF-only APIs reject non-XOF digests.
  void NISTTestVectors(const EVP_MD *algorithm) const {
    uint32_t digest_length;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[EVP_MD_size(algorithm)]);
    bssl::ScopedEVP_MD_CTX ctx;
    // Test the correctness via the Init, Update and Final Digest APIs.
    // Note |len_| is the message length in bits, as in the NIST files.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), len_ / 8));
    ASSERT_TRUE(EVP_DigestFinal(ctx.get(), digest.get(), &digest_length));
    ASSERT_EQ(Bytes(digest.get(), EVP_MD_size(algorithm)),
              Bytes(digest_.data(), EVP_MD_size(algorithm)));

    // Test XOF-specific Digest functions with non XOF algorithms.
    // Assert failure when |EVP_DigestSqueeze| or |EVP_DigestFinalXOF|
    // are called with digests different from XOF digests.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_FALSE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
  }

  // Checks a fixed-output SHA3 digest via the one-shot |EVP_Digest| API.
  void NISTTestVectors_SingleShot(const EVP_MD *algorithm) const {
    uint32_t digest_length;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[EVP_MD_size(algorithm)]);
    // Test the correctness via the Single-Shot EVP_Digest APIs.
    ASSERT_TRUE(EVP_Digest(msg_.data(), len_ / 8, digest.get(), &digest_length,
                           algorithm, nullptr));
    ASSERT_EQ(Bytes(digest.get(), EVP_MD_size(algorithm)),
              Bytes(digest_.data(), EVP_MD_size(algorithm)));
  }

  // Checks a SHAKE XOF output via the incremental and one-shot EVP APIs.
  void NISTTestVectors_SHAKE(const EVP_MD *algorithm) const {
    uint32_t digest_length = out_len_ / 8;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[digest_length]);
    bssl::ScopedEVP_MD_CTX ctx;
    // Test the incremental EVP API.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    // Test the one-shot.
    ASSERT_TRUE(EVP_Digest(msg_.data(), msg_.size(), digest.get(),
                           &digest_length, algorithm, nullptr));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
  }

  // Test SHAKE Squeeze functionality through |EVP_Digest| APIs.
  void NISTTestVectors_SHAKESqueeze(const EVP_MD *algorithm) const {
    uint8_t random_bytes[RAND_BYTES];
    size_t sqd_bytes = 0, cur_test = 0, to_sq_bytes = 0;
    uint32_t digest_length = out_len_ / 8;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[digest_length]);
    std::unique_ptr<uint8_t[]> digest_stream(new uint8_t[RAND_OUT_BYTES]);
    std::unique_ptr<uint8_t[]> digest_single_shot(new uint8_t[RAND_OUT_BYTES]);
    bssl::ScopedEVP_MD_CTX ctx;

    // Test Final XOF
    // Assert fail when |EVP_DigestFinalXOF| is called as a streaming API
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));

    // Test the one-shot
    // Assert success when |EVP_Digest| is called
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_Digest(msg_.data(), msg_.size(), digest.get(),
                           &digest_length, algorithm, nullptr));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Final
    // Assert fail when |EVP_DigestFinal| is called for XOF algorithms
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_FALSE(EVP_DigestFinal(ctx.get(), digest.get(), &digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));

    // Test Final XOF after Squeeze
    // Assert fail when |EVP_DigestFinalXOF| is called after |EVP_DigestSqueeze|
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length/2));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get() + digest_length/2,
                                    digest_length/2));

    // Test Update after Squeeze
    // Assert fail when |EVP_DigestUpdate| is called after |EVP_DigestSqueeze|
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));

    // Test Absorb
    // Assert success when |EVP_DigestUpdate| is called byte-by-byte
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), nullptr, 0));
    for (const char p : msg_) {
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), &p, 1));
    }
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Squeeze
    // Assert success when |EVP_DigestSqueeze| is called byte-by-byte
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    for (size_t i = 0; i < digest_length; i++) {
      ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + i, 1));
    }
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Squeeze
    // Assert success when |EVP_DigestSqueeze| is called in set byte increments
    for (cur_test = 0, sqd_bytes = 0;
         cur_test < sizeof(stride_tests) / sizeof(stride_tests[0]);
         cur_test++, sqd_bytes = 0) {
      to_sq_bytes = stride_tests[cur_test].startsz;
      OPENSSL_memset(digest.get(), 0, digest_length);
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
      while (sqd_bytes < digest_length) {
        // Clamp the final squeeze to the remaining output.
        if ((sqd_bytes + to_sq_bytes) > digest_length) {
          to_sq_bytes = digest_length - sqd_bytes;
        }
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes, to_sq_bytes));
        sqd_bytes += to_sq_bytes;
        to_sq_bytes = stride_tests[cur_test].incsz;
      }
      EXPECT_EQ(Bytes(digest.get(), digest_length),
                Bytes(digest_.data(), digest_length));
    }

    // Test Squeeze Exhaustive
    // Assert success when |EVP_DigestSqueeze| is called in all possible byte increments
    for (to_sq_bytes = 1; to_sq_bytes < digest_length; to_sq_bytes++) {
      OPENSSL_memset(digest.get(), 0, digest_length);
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
      for (sqd_bytes = 0; sqd_bytes <= digest_length - to_sq_bytes;
           sqd_bytes += to_sq_bytes) {
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes, to_sq_bytes));
      }
      if ((digest_length - sqd_bytes) > 0) {
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes,
                                      digest_length - sqd_bytes));
      }
      EXPECT_EQ(Bytes(digest.get(), digest_length),
                Bytes(digest_.data(), digest_length));
    }

    // Test Squeeze with random Input
    // Assert success when |EVP_DigestSqueeze| is called on a random message
    ASSERT_TRUE(RAND_bytes(random_bytes, RAND_BYTES));
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
    for (size_t i = 0; i < RAND_OUT_BYTES; i++) {
      ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + i, 1));
    }
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
    EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
              EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));

    // Test Squeeze with random Input
    // Assert success when |EVP_DigestSqueeze| is called on a random message
    // in set byte increments
    for (cur_test = 0, sqd_bytes = 0;
         cur_test < sizeof(stride_tests) / sizeof(stride_tests[0]);
         cur_test++, sqd_bytes = 0) {
      to_sq_bytes = stride_tests[cur_test].startsz;
      OPENSSL_memset(digest_stream.get(), 0, RAND_OUT_BYTES);
      OPENSSL_memset(digest_single_shot.get(), 0, RAND_OUT_BYTES);
      ASSERT_TRUE(RAND_bytes(random_bytes, RAND_BYTES));
      // Incremental Squeezes
      ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
      while (sqd_bytes < RAND_OUT_BYTES) {
        if ((sqd_bytes + to_sq_bytes) > RAND_OUT_BYTES) {
          to_sq_bytes = RAND_OUT_BYTES - sqd_bytes;
        }
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + sqd_bytes, to_sq_bytes));
        sqd_bytes += to_sq_bytes;
        to_sq_bytes = stride_tests[cur_test].incsz;
      }
      // Single-Shot Squeeze
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
      ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
      EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
                EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));
    }

    // Test Final XOF / Squeeze without Update
    // Assert success: finalising and squeezing an empty message must
    // produce identical output.
    OPENSSL_memset(digest_single_shot.get(), 0, RAND_OUT_BYTES);
    OPENSSL_memset(digest_stream.get(), 0, RAND_OUT_BYTES);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get(), RAND_OUT_BYTES/2));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + RAND_OUT_BYTES/2,
                                  RAND_OUT_BYTES/2));
    EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
              EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));
  }

 private:
  uint32_t len_;       // Message length in bits (NIST "Len" attribute).
  uint32_t out_len_;   // XOF output length in bits (NIST "Outputlen" attribute).
  std::vector<uint8_t> msg_;
  std::vector<uint8_t> digest_;
};
// Read the |key| attribute from |file_test| and convert it to an integer.
// Returns true on success; on parse failure, gtest's ParseInt32 reports the
// supplied message.
// NOTE(review): |out| is cast to int*, so this assumes T is a 32-bit
// integer type (it is uint32_t at both call sites in this file) — confirm
// before reusing with other widths.
template <typename T>
bool FileTestReadInt(FileTest *file_test, T *out, const std::string &key) {
  std::string s;
  return file_test->GetAttribute(&s, key) &&
         testing::internal::ParseInt32(
             testing::Message() << "The value " << s.data()
                                << " is not convertable to an integer.",
             s.data(), (int *)out);
}
// Parses one NIST test case from |t|. "Outputlen" (XOF vectors) and "Len"
// (fixed-output vectors) are optional; "Msg" and "MD" are required.
bool SHA3TestVector::ReadFromFileTest(FileTest *t) {
  if (t->HasAttribute("Outputlen") &&
      !FileTestReadInt(t, &out_len_, "Outputlen")) {
    return false;
  }
  if (t->HasAttribute("Len") && !FileTestReadInt(t, &len_, "Len")) {
    return false;
  }
  return t->GetBytes(&msg_, "Msg") && t->GetBytes(&digest_, "MD");
}
TEST(SHA3Test, NISTTestVectors) {
  // Each NIST response file is replayed through the Init/Update/Final EVP
  // APIs for the matching SHA3 variant.
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kTests[] = {
      {"crypto/fipsmodule/sha/testvectors/SHA3_224ShortMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256ShortMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384ShortMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512ShortMsg.txt", EVP_sha3_512},
      {"crypto/fipsmodule/sha/testvectors/SHA3_224LongMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256LongMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384LongMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512LongMsg.txt", EVP_sha3_512},
  };
  for (const auto &test : kTests) {
    FileTestGTest(test.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors(test.md());
    });
  }
}
TEST(SHA3Test, NISTTestVectors_SingleShot) {
  // Each NIST short-message file is replayed through the one-shot
  // |EVP_Digest| API for the matching SHA3 variant.
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kTests[] = {
      {"crypto/fipsmodule/sha/testvectors/SHA3_224ShortMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256ShortMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384ShortMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512ShortMsg.txt", EVP_sha3_512},
  };
  for (const auto &test : kTests) {
    FileTestGTest(test.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SingleShot(test.md());
    });
  }
}
// Checks that |Keccak1600_Squeeze| writes exactly |out_len| bytes: a 0xff
// canary placed immediately after the requested output must be intact for
// a range of small and large (multi-block) output lengths.
TEST(KeccakInternalTest, SqueezeOutputBufferOverflow) {
  EVP_MD_unstable_sha3_enable(true);
  KECCAK1600_CTX ctx;
  std::vector<uint8_t> out;
  std::vector<uint8_t> canary(8);
  std::fill(canary.begin(), canary.end(), 0xff);
  const size_t out_lens[] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, (1 << 5), (1 << 16) + 1};
  for (auto out_len : out_lens) {
    EXPECT_TRUE(SHA3_Init(&ctx, SHA3_384_DIGEST_BITLENGTH));
    // The canary occupies the tail of the buffer, right past |out_len|.
    out.resize(out_len + canary.size());
    std::copy(canary.begin(), canary.end(), out.end() - canary.size());
    Keccak1600_Squeeze(ctx.A, out.data(), out_len, ctx.block_size, 1);
    EXPECT_TRUE(std::equal(out.end() - canary.size(), out.end(),
                           canary.begin()) == true);
  }
  EVP_MD_unstable_sha3_enable(false);
}
// Test x4 batched SHAKE against 4 consecutive SHAKE calls.
// Assert success when digest and digest_x4 values are equal.
TEST(SHAKETest_x4, RandomMessages) {
  KECCAK1600_CTX_x4 ctx;
  uint8_t random_in[BATCHED_x4][RAND_BYTES_x4];
  uint8_t digest[BATCHED_x4][RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE];
  uint8_t digest_x4[BATCHED_x4][RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE];
  // Test |SHAKE128_Init_x4|, |SHAKE128_Absorb_once_x4|, and |SHAKE128_Squeezeblocks_x4| functions.
  // Assert success when digest and digest_x4 values are equal.
  for (int i = 0; i < NUM_TESTS; i++) {
    // Compute four independent serial SHAKE128 digests as the reference.
    for (int j = 0; j < BATCHED_x4; j++) {
      OPENSSL_memset(digest[j], 0, RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE);
      OPENSSL_memset(digest_x4[j], 0, RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE);
      ASSERT_TRUE(RAND_bytes(random_in[j], RAND_BYTES_x4));
      ASSERT_TRUE(SHAKE128(random_in[j], RAND_BYTES_x4, digest[j],
                           RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE));
    }
    // Compute one batched x4 SHAKE128.
    ASSERT_TRUE(SHAKE128_Init_x4(&ctx));
    ASSERT_TRUE(SHAKE128_Absorb_once_x4(&ctx, random_in[0], random_in[1], random_in[2], random_in[3],
                                        RAND_BYTES_x4));
    ASSERT_TRUE(SHAKE128_Squeezeblocks_x4(digest_x4[0], digest_x4[1], digest_x4[2], digest_x4[3],
                                          &ctx, RAND_OUT_BLCKS));
    for (int j = 0; j < BATCHED_x4; j++) {
      EXPECT_EQ(Bytes(digest_x4[j], RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE),
                Bytes(digest[j], RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE));
    }
  }
  // Test |SHAKE256_x4| function.
  // Assert success when digest and digest_x4 values are equal.
  for (int i = 0; i < NUM_TESTS; i++) {
    // Compute four independent serial SHAKE256 digests as the reference.
    for (int j = 0; j < BATCHED_x4; j++) {
      OPENSSL_memset(digest[j], 0, RAND_OUT_BLCKS);
      OPENSSL_memset(digest_x4[j], 0, RAND_OUT_BLCKS);
      ASSERT_TRUE(RAND_bytes(random_in[j], RAND_BYTES_x4));
      SHAKE256(random_in[j], RAND_BYTES_x4, digest[j], RAND_OUT_BLCKS);
    }
    // Compute one batched x4 SHAKE256.
    ASSERT_TRUE(SHAKE256_x4(random_in[0], random_in[1], random_in[2], random_in[3], RAND_BYTES_x4,
                            digest_x4[0], digest_x4[1], digest_x4[2], digest_x4[3], RAND_OUT_BLCKS));
    for (int j = 0; j < BATCHED_x4; j++) {
      EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_x4[j], RAND_OUT_BLCKS)),
                EncodeHex(bssl::MakeConstSpan(digest[j], RAND_OUT_BLCKS)));
    }
  }
}
TEST(SHAKETest, NISTTestVectors) {
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kFiles[] = {
      {"crypto/fipsmodule/sha/testvectors/SHAKE128VariableOut.txt", EVP_shake128},
      {"crypto/fipsmodule/sha/testvectors/SHAKE256VariableOut.txt", EVP_shake256},
  };
  // Known-answer tests through the streaming and one-shot EVP APIs.
  for (const auto &file : kFiles) {
    FileTestGTest(file.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SHAKE(file.md());
    });
  }
  // Incremental squeeze tests over the same vectors.
  for (const auto &file : kFiles) {
    FileTestGTest(file.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SHAKESqueeze(file.md());
    });
  }
}

View File

@@ -0,0 +1,663 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// The 32-bit hash algorithms share a common byte-order neutral collector and
// padding function implementations that operate on unaligned data,
// ../digest/md32_common.h. SHA-512 is the only 64-bit hash algorithm, as of
// this writing, so there is no need for a common collector/padding
// implementation yet.
static int sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha);
// SHA384_Init sets |sha| to the SHA-384 initial hash value
// (FIPS 180-4, section 5.3.4). Always returns 1.
int SHA384_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0xcbbb9d5dc1059ed8), UINT64_C(0x629a292a367cd507),
      UINT64_C(0x9159015a3070dd17), UINT64_C(0x152fecd8f70e5939),
      UINT64_C(0x67332667ffc00b31), UINT64_C(0x8eb44a8768581511),
      UINT64_C(0xdb0c2e0d64f98fa7), UINT64_C(0x47b5481dbefa4fa4),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA384_DIGEST_LENGTH;
  return 1;
}
// SHA512_Init sets |sha| to the SHA-512 initial hash value
// (FIPS 180-4, section 5.3.5). Always returns 1.
int SHA512_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
      UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
      UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
      UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_DIGEST_LENGTH;
  return 1;
}
// SHA512_224_Init sets |sha| to the SHA-512/224 initial hash value
// (FIPS 180-4, section 5.3.6.1). Always returns 1.
int SHA512_224_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x8c3d37c819544da2), UINT64_C(0x73e1996689dcd4d6),
      UINT64_C(0x1dfab7ae32ff9c82), UINT64_C(0x679dd514582f9fcf),
      UINT64_C(0x0f6d2b697bd44da8), UINT64_C(0x77e36f7304c48942),
      UINT64_C(0x3f9d85a86a1d36c8), UINT64_C(0x1112e6ad91d692a1),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_224_DIGEST_LENGTH;
  return 1;
}
// SHA512_256_Init sets |sha| to the SHA-512/256 initial hash value
// (FIPS 180-4, section 5.3.6.2). Always returns 1.
int SHA512_256_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x22312194fc2bf72c), UINT64_C(0x9f555fa3c84c64c2),
      UINT64_C(0x2393b86b6f53b151), UINT64_C(0x963877195940eabd),
      UINT64_C(0x96283ee2a88effe3), UINT64_C(0xbe5e1e2553863992),
      UINT64_C(0x2b0199fc2c85b8aa), UINT64_C(0x0eb72ddc81c52ca2),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_256_DIGEST_LENGTH;
  return 1;
}
// Every SHA-512 family variant shares the same chaining-state size, which
// is what allows them to share |sha512_init_from_state_impl| below.
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA384_CHAINING_LENGTH,
sha512_and_sha384_have_same_chaining_length)
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA512_224_CHAINING_LENGTH,
sha512_and_sha512_224_have_same_chaining_length)
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA512_256_CHAINING_LENGTH,
sha512_and_sha512_256_have_same_chaining_length)
// sha512_init_from_state_impl backs the SHA{384,512,512_224,512_256}
// _Init_from_state functions. It resumes a hash from a serialized chaining
// value |h| (always SHA512_CHAINING_LENGTH bytes, big-endian words) after |n|
// bits of input. Fails (returns zero) unless |n| is a whole number of
// 128-byte blocks, since the partial-block buffer is not serialized.
static int sha512_init_from_state_impl(SHA512_CTX *sha, int md_len,
                                       const uint8_t h[SHA512_CHAINING_LENGTH],
                                       uint64_t n) {
  if (n % ((uint64_t)SHA512_CBLOCK * 8) != 0) {
    // |n| is not a multiple of the block size in bits.
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA512_CTX));
  sha->md_len = md_len;
  for (size_t word = 0; word < SHA512_CHAINING_LENGTH / 8; word++) {
    sha->h[word] = CRYPTO_load_u64_be(h + 8 * word);
  }
  // At most 2^64 bits are representable here, so the high counter is zero.
  sha->Nh = 0;
  sha->Nl = n;
  return 1;
}
// SHA384_Init_from_state resumes a SHA-384 computation from a serialized
// chaining value |h| after |n| bits of input. Returns one on success, zero
// if |n| is not a multiple of the block size in bits.
int SHA384_Init_from_state(SHA512_CTX *sha,
                           const uint8_t h[SHA384_CHAINING_LENGTH],
                           uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA384_DIGEST_LENGTH, h, n);
}
// SHA512_Init_from_state resumes a SHA-512 computation from a serialized
// chaining value |h| after |n| bits of input. Returns one on success, zero
// if |n| is not a multiple of the block size in bits.
int SHA512_Init_from_state(SHA512_CTX *sha,
                           const uint8_t h[SHA512_CHAINING_LENGTH],
                           uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_DIGEST_LENGTH, h, n);
}
// SHA512_224_Init_from_state resumes a SHA-512/224 computation from a
// serialized chaining value |h| after |n| bits of input. Returns one on
// success, zero if |n| is not a multiple of the block size in bits.
int SHA512_224_Init_from_state(SHA512_CTX *sha,
                               const uint8_t h[SHA512_224_CHAINING_LENGTH],
                               uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_224_DIGEST_LENGTH, h, n);
}
// SHA512_256_Init_from_state resumes a SHA-512/256 computation from a
// serialized chaining value |h| after |n| bits of input. Returns one on
// success, zero if |n| is not a multiple of the block size in bits.
int SHA512_256_Init_from_state(SHA512_CTX *sha,
                               const uint8_t h[SHA512_256_CHAINING_LENGTH],
                               uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_256_DIGEST_LENGTH, h, n);
}
// SHA384 writes the SHA-384 digest of |data| to |out| and returns |out|.
uint8_t *SHA384(const uint8_t *data, size_t len,
                uint8_t out[SHA384_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA384_Init(&ctx);
  ok = ok && SHA384_Update(&ctx, data, len);
  ok = ok && SHA384_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512 writes the SHA-512 digest of |data| to |out| and returns |out|.
uint8_t *SHA512(const uint8_t *data, size_t len,
                uint8_t out[SHA512_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_Init(&ctx);
  ok = ok && SHA512_Update(&ctx, data, len);
  ok = ok && SHA512_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512_224 writes the SHA-512/224 digest of |data| to |out| and returns
// |out|.
uint8_t *SHA512_224(const uint8_t *data, size_t len,
                    uint8_t out[SHA512_224_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_224_Init(&ctx);
  ok = ok && SHA512_224_Update(&ctx, data, len);
  ok = ok && SHA512_224_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512_256 writes the SHA-512/256 digest of |data| to |out| and returns
// |out|.
uint8_t *SHA512_256(const uint8_t *data, size_t len,
                    uint8_t out[SHA512_256_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_256_Init(&ctx);
  ok = ok && SHA512_256_Update(&ctx, data, len);
  ok = ok && SHA512_256_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA512_ASM)
// Forward declaration of the compression-function dispatcher defined near the
// bottom of this file. When SHA512_ASM is set, the symbol is provided by
// assembly instead.
static void sha512_block_data_order(uint64_t state[8], const uint8_t *in,
                                    size_t num_blocks);
#endif
// SHA384_Final finishes the computation and writes SHA384_DIGEST_LENGTH
// bytes to |out|. Returns one on success, zero if |out| is NULL.
int SHA384_Final(uint8_t out[SHA384_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA384_Init|, which sets |sha->md_len|
  // to |SHA384_DIGEST_LENGTH|.
  assert(sha->md_len == SHA384_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA384_DIGEST_LENGTH, sha);
}
// SHA384_Update adds |len| bytes from |data| to the hash. SHA-384 shares the
// SHA-512 compression and buffering logic; only init and final differ.
int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_224_Update adds |len| bytes from |data| to the hash. SHA-512/224
// shares the SHA-512 compression and buffering logic.
int SHA512_224_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_224_Final finishes the computation and writes
// SHA512_224_DIGEST_LENGTH bytes to |out|. Returns one on success, zero if
// |out| is NULL.
int SHA512_224_Final(uint8_t out[SHA512_224_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA512_224_Init|, which sets
  // |sha->md_len| to |SHA512_224_DIGEST_LENGTH|.
  assert(sha->md_len == SHA512_224_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA512_224_DIGEST_LENGTH, sha);
}
// SHA512_256_Update adds |len| bytes from |data| to the hash. SHA-512/256
// shares the SHA-512 compression and buffering logic.
int SHA512_256_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_256_Final finishes the computation and writes
// SHA512_256_DIGEST_LENGTH bytes to |out|. Returns one on success, zero if
// |out| is NULL.
int SHA512_256_Final(uint8_t out[SHA512_256_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA512_256_Init|, which sets
  // |sha->md_len| to |SHA512_256_DIGEST_LENGTH|.
  assert(sha->md_len == SHA512_256_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA512_256_DIGEST_LENGTH, sha);
}
// SHA512_Transform runs the compression function over exactly one 128-byte
// block, updating the chaining value in |c->h|. It does not touch the length
// counters or the partial-block buffer; callers handle padding themselves.
void SHA512_Transform(SHA512_CTX *c, const uint8_t block[SHA512_CBLOCK]) {
  sha512_block_data_order(c->h, block, 1);
}
// SHA512_Update feeds |len| bytes from |in_data| into the hash. Partial
// input is buffered in |c->p| until a full 128-byte block is available;
// whole blocks are compressed directly from the caller's buffer. Always
// returns one.
int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
  uint64_t l;
  uint8_t *p = c->p;
  const uint8_t *data = in_data;
  if (len == 0) {
    return 1;
  }
  // Update the 128-bit message bit counter Nh:Nl. The mask is a no-op on a
  // 64-bit value and kept for clarity.
  l = (c->Nl + (((uint64_t)len) << 3)) & UINT64_C(0xffffffffffffffff);
  if (l < c->Nl) {
    c->Nh++;  // carry out of the low word
  }
  if (sizeof(len) >= 8) {
    // On 64-bit platforms |len| << 3 can overflow; fold the lost high bits
    // of the bit count into Nh.
    c->Nh += (((uint64_t)len) >> 61);
  }
  c->Nl = l;
  if (c->num != 0) {
    // A partial block is pending; top it up from |data| first.
    size_t n = sizeof(c->p) - c->num;
    if (len < n) {
      // Still not a full block: stash the bytes and return.
      OPENSSL_memcpy(p + c->num, data, len);
      c->num += (unsigned int)len;
      return 1;
    } else {
      // Complete the buffered block and compress it.
      OPENSSL_memcpy(p + c->num, data, n), c->num = 0;
      len -= n;
      data += n;
      sha512_block_data_order(c->h, p, 1);
    }
  }
  if (len >= sizeof(c->p)) {
    // Compress all remaining whole blocks straight from |data|.
    sha512_block_data_order(c->h, data, len / sizeof(c->p));
    data += len;
    len %= sizeof(c->p);
    data -= len;  // |data| now points at the leftover tail
  }
  if (len != 0) {
    // Buffer the trailing partial block for the next call or final.
    OPENSSL_memcpy(p, data, len);
    c->num = (int)len;
  }
  return 1;
}
// SHA512_Final finishes the computation and writes |sha->md_len| bytes to
// |out|. Returns one on success, zero if |out| is NULL.
int SHA512_Final(uint8_t out[SHA512_DIGEST_LENGTH], SHA512_CTX *sha) {
  // Ideally we would assert |sha->md_len| is |SHA512_DIGEST_LENGTH| to match
  // the size hint, but calling code often pairs |SHA384_Init| with
  // |SHA512_Final| and expects |sha->md_len| to carry the size over.
  //
  // TODO(davidben): Add an assert and fix code to match them up.
  return sha512_final_impl(out, sha->md_len, sha);
}
// sha512_final_impl applies the FIPS 180-4 padding (0x80, zeros, then the
// 128-bit big-endian bit count), runs the final compression, and writes the
// first |md_len| bytes of the chaining value to |out| in big-endian order.
// Returns one on success, zero if |out| is NULL.
static int sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha) {
  uint8_t *p = sha->p;
  size_t n = sha->num;
  p[n] = 0x80;  // There always is a room for one
  n++;
  if (n > (sizeof(sha->p) - 16)) {
    // Not enough room in this block for the 16-byte length field: pad it
    // out, compress, and start a fresh all-padding block.
    OPENSSL_memset(p + n, 0, sizeof(sha->p) - n);
    n = 0;
    sha512_block_data_order(sha->h, p, 1);
  }
  // Zero-fill up to the length field, then append the bit count (Nh:Nl).
  OPENSSL_memset(p + n, 0, sizeof(sha->p) - 16 - n);
  CRYPTO_store_u64_be(p + sizeof(sha->p) - 16, sha->Nh);
  CRYPTO_store_u64_be(p + sizeof(sha->p) - 8, sha->Nl);
  sha512_block_data_order(sha->h, p, 1);
  if (out == NULL) {
    // TODO(davidben): This NULL check is absent in other low-level hash 'final'
    // functions and is one of the few places one can fail.
    return 0;
  }
  // Emit whole 8-byte words of the chaining value, big-endian.
  const size_t out_words = md_len / 8;
  assert(md_len % 8 == 0 || md_len == SHA512_224_DIGEST_LENGTH);
  for (size_t i = 0; i < out_words; i++) {
    CRYPTO_store_u64_be(out, sha->h[i]);
    out += 8;
  }
  // SHA-512 and SHA-512/256 are aligned to 8-byte words, SHA-512/224 is not.
  // If the digest size is not aligned to 8-byte words, we need to process the
  // non-word-aligned "trailer".
  if (md_len == SHA512_224_DIGEST_LENGTH) {
    uint64_t trailer;
    CRYPTO_store_u64_be(&trailer, sha->h[out_words]);
    OPENSSL_memcpy(out, &trailer, SHA512_224_DIGEST_LENGTH % 8);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
// sha512_get_state_impl backs the SHA{384,512,512_224,512_256}_get_state
// functions. It serializes the chaining value into |out_h| (always
// SHA512_CHAINING_LENGTH bytes, big-endian words) and the processed bit
// count into |*out_n|. Fails (returns zero) if input is not block-aligned
// (the partial-block buffer is not serialized) or if more than 2^64 bits
// have been hashed.
static int sha512_get_state_impl(SHA512_CTX *ctx,
                                 uint8_t out_h[SHA512_CHAINING_LENGTH],
                                 uint64_t *out_n) {
  if (ctx->Nl % ((uint64_t)SHA512_CBLOCK * 8) != 0) {
    // The bit count is not a whole number of blocks.
    return 0;
  }
  if (ctx->Nh != 0) {
    // A non-zero high counter means more than 2^64 bits were processed,
    // which |*out_n| cannot represent.
    return 0;
  }
  for (size_t word = 0; word < SHA512_CHAINING_LENGTH / 8; word++) {
    CRYPTO_store_u64_be(out_h + 8 * word, ctx->h[word]);
  }
  *out_n = ctx->Nl;  // Nh is known to be zero here.
  return 1;
}
// SHA384_get_state serializes the SHA-384 chaining value and bit count.
// Returns one on success, zero if the state is not block-aligned or exceeds
// 2^64 bits.
int SHA384_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA384_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_get_state serializes the SHA-512 chaining value and bit count.
// Returns one on success, zero if the state is not block-aligned or exceeds
// 2^64 bits.
int SHA512_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_224_get_state serializes the SHA-512/224 chaining value and bit
// count. Returns one on success, zero if the state is not block-aligned or
// exceeds 2^64 bits.
int SHA512_224_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_224_CHAINING_LENGTH],
                         uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_256_get_state serializes the SHA-512/256 chaining value and bit
// count. Returns one on success, zero if the state is not block-aligned or
// exceeds 2^64 bits.
int SHA512_256_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENGTH],
                         uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
#if !defined(SHA512_ASM)
#if !defined(SHA512_ASM_NOHW)
// K512 holds the 80 SHA-512 round constants from FIPS 180-4, section 4.2.2.
static const uint64_t K512[80] = {
    UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd),
    UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc),
    UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019),
    UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118),
    UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe),
    UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2),
    UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1),
    UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694),
    UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3),
    UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65),
    UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483),
    UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5),
    UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210),
    UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4),
    UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725),
    UINT64_C(0x06ca6351e003826f), UINT64_C(0x142929670a0e6e70),
    UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926),
    UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df),
    UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8),
    UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b),
    UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001),
    UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30),
    UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910),
    UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8),
    UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53),
    UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8),
    UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb),
    UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3),
    UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60),
    UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec),
    UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9),
    UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b),
    UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207),
    UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178),
    UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6),
    UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b),
    UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493),
    UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c),
    UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a),
    UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817),
};
// The SHA-512 logical functions from FIPS 180-4, section 4.1.3:
// Sigma0/Sigma1 mix the working state, sigma0/sigma1 expand the message
// schedule, Ch is the bitwise choice function and Maj the bitwise majority.
#define Sigma0(x)                                        \
  (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 34) ^ \
   CRYPTO_rotr_u64((x), 39))
#define Sigma1(x)                                        \
  (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \
   CRYPTO_rotr_u64((x), 41))
#define sigma0(x) \
  (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7))
#define sigma1(x) \
  (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6))
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
// This code should give better results on 32-bit CPU with less than
// ~24 registers, both size and performance wise...
//
// The working variables b,c,d,f,g,h live in a sliding window |F| over the
// array |X|; each round decrements |F|, so writing F[0]/F[4]/F[8] records
// this round's a, e and message word while the previous seven values remain
// addressable at fixed offsets. Only |A| and |E| are kept in locals.
static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in,
                                         size_t num) {
  uint64_t A, E, T;
  uint64_t X[9 + 80], *F;
  int i;
  while (num--) {
    F = X + 80;
    // Load the chaining value; b..d and f..h start in the window.
    A = state[0];
    F[1] = state[1];
    F[2] = state[2];
    F[3] = state[3];
    E = state[4];
    F[5] = state[5];
    F[6] = state[6];
    F[7] = state[7];
    // Rounds 0-15 consume the message block directly.
    for (i = 0; i < 16; i++, F--) {
      T = CRYPTO_load_u64_be(in + i * 8);
      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }
    // Rounds 16-79 expand the message schedule from earlier words, which sit
    // at fixed offsets behind the sliding window.
    for (; i < 80; i++, F--) {
      T = sigma0(F[8 + 16 - 1]);
      T += sigma1(F[8 + 16 - 14]);
      T += F[8 + 16] + F[8 + 16 - 9];
      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }
    // Fold the final working variables back into the chaining value.
    state[0] += A;
    state[1] += F[1];
    state[2] += F[2];
    state[3] += F[3];
    state[4] += E;
    state[5] += F[5];
    state[6] += F[6];
    state[7] += F[7];
    in += 16 * 8;
  }
}
#else
// ROUND_00_15 performs one SHA-512 round; |T1| must already hold this
// round's message-schedule word W[i] on entry.
#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
  do {                                           \
    T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \
    h = Sigma0(a) + Maj(a, b, c);                \
    d += T1;                                     \
    h += T1;                                     \
  } while (0)
// ROUND_16_80 computes W[i+j] in-place in the 16-word circular buffer |X|
// and then performs the round.
#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X)   \
  do {                                                 \
    s0 = X[(j + 1) & 0x0f];                            \
    s0 = sigma0(s0);                                   \
    s1 = X[(j + 14) & 0x0f];                           \
    s1 = sigma1(s1);                                   \
    T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \
    ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
  } while (0)
// Portable C compression function. The 80 rounds are unrolled in groups of
// eight so the working variables rotate through the argument list instead of
// being shuffled at runtime.
static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in,
                                         size_t num) {
  uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
  uint64_t X[16];
  int i;
  while (num--) {
    // Load the chaining value into the working variables.
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];
    // Rounds 0-15: message words are read big-endian from the input block.
    T1 = X[0] = CRYPTO_load_u64_be(in);
    ROUND_00_15(0, a, b, c, d, e, f, g, h);
    T1 = X[1] = CRYPTO_load_u64_be(in + 8);
    ROUND_00_15(1, h, a, b, c, d, e, f, g);
    T1 = X[2] = CRYPTO_load_u64_be(in + 2 * 8);
    ROUND_00_15(2, g, h, a, b, c, d, e, f);
    T1 = X[3] = CRYPTO_load_u64_be(in + 3 * 8);
    ROUND_00_15(3, f, g, h, a, b, c, d, e);
    T1 = X[4] = CRYPTO_load_u64_be(in + 4 * 8);
    ROUND_00_15(4, e, f, g, h, a, b, c, d);
    T1 = X[5] = CRYPTO_load_u64_be(in + 5 * 8);
    ROUND_00_15(5, d, e, f, g, h, a, b, c);
    T1 = X[6] = CRYPTO_load_u64_be(in + 6 * 8);
    ROUND_00_15(6, c, d, e, f, g, h, a, b);
    T1 = X[7] = CRYPTO_load_u64_be(in + 7 * 8);
    ROUND_00_15(7, b, c, d, e, f, g, h, a);
    T1 = X[8] = CRYPTO_load_u64_be(in + 8 * 8);
    ROUND_00_15(8, a, b, c, d, e, f, g, h);
    T1 = X[9] = CRYPTO_load_u64_be(in + 9 * 8);
    ROUND_00_15(9, h, a, b, c, d, e, f, g);
    T1 = X[10] = CRYPTO_load_u64_be(in + 10 * 8);
    ROUND_00_15(10, g, h, a, b, c, d, e, f);
    T1 = X[11] = CRYPTO_load_u64_be(in + 11 * 8);
    ROUND_00_15(11, f, g, h, a, b, c, d, e);
    T1 = X[12] = CRYPTO_load_u64_be(in + 12 * 8);
    ROUND_00_15(12, e, f, g, h, a, b, c, d);
    T1 = X[13] = CRYPTO_load_u64_be(in + 13 * 8);
    ROUND_00_15(13, d, e, f, g, h, a, b, c);
    T1 = X[14] = CRYPTO_load_u64_be(in + 14 * 8);
    ROUND_00_15(14, c, d, e, f, g, h, a, b);
    T1 = X[15] = CRYPTO_load_u64_be(in + 15 * 8);
    ROUND_00_15(15, b, c, d, e, f, g, h, a);
    // Rounds 16-79: message schedule expansion, 16 rounds per iteration.
    for (i = 16; i < 80; i += 16) {
      ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
      ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X);
      ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X);
      ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X);
      ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X);
      ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X);
      ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X);
      ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X);
      ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X);
      ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X);
      ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X);
      ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X);
      ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X);
      ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X);
      ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X);
      ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
    }
    // Fold the working variables back into the chaining value.
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
    in += 16 * 8;
  }
}
#endif
#endif  // !SHA512_ASM_NOHW
// sha512_block_data_order dispatches to the fastest compiled-in compression
// implementation — hardware extensions, then AVX, then NEON, then the
// portable C fallback. |num| is the number of 128-byte blocks at |data|.
static void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
                                    size_t num) {
#if defined(SHA512_ASM_HW)
  if (sha512_hw_capable()) {
    sha512_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha512_avx_capable()) {
    sha512_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA512_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha512_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha512_block_data_order_nohw(state, data, num);
}
#endif  // !SHA512_ASM
// Scrub the round-helper macros so their short names cannot collide with any
// code compiled after this file.
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj
#undef ROUND_00_15
#undef ROUND_16_80
// ---------------------------------------------------------------------------
// NOTE(review): an extraction artifact was removed here ("View File" and the
// diff hunk header "@@ -0,0 +1,119 @@"). Everything below appears to be a
// separate C++ ABI-test source file concatenated into this one — confirm and
// split into its own file.
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/sha.h>
#include <vector>
#include <gtest/gtest.h>
#include "../../test/abi_test.h"
#include "internal.h"
#if defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC)
// Runs every compiled-in SHA-1 block function under the ABI checker for 1, 2,
// 4 and 8 blocks of zeros. Which functions exist depends on the platform's
// SHA1_ASM_* macros; runtime capability checks gate the CPU-specific ones.
TEST(SHATest, SHA1ABI) {
  SHA_CTX ctx;
  SHA1_Init(&ctx);
  static const uint8_t kBuf[SHA_CBLOCK * 8] = {0};
  for (size_t blocks : {1, 2, 4, 8}) {
#if defined(SHA1_ASM)
    CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA1_ASM_HW)
    if (sha1_hw_capable()) {
      CHECK_ABI(sha1_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha1_avx2_capable()) {
      CHECK_ABI(sha1_block_data_order_avx2, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha1_avx_capable()) {
      CHECK_ABI(sha1_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_SSSE3)
    if (sha1_ssse3_capable()) {
      CHECK_ABI(sha1_block_data_order_ssse3, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_NOHW)
    CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
// Runs every compiled-in SHA-256 block function under the ABI checker for 1,
// 2, 4 and 8 blocks of zeros, gated by the platform's SHA256_ASM_* macros and
// runtime capability checks.
TEST(SHATest, SHA256ABI) {
  SHA256_CTX ctx;
  SHA256_Init(&ctx);
  static const uint8_t kBuf[SHA256_CBLOCK * 8] = {0};
  for (size_t blocks : {1, 2, 4, 8}) {
#if defined(SHA256_ASM)
    CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA256_ASM_HW)
    if (sha256_hw_capable()) {
      CHECK_ABI(sha256_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha256_avx_capable()) {
      CHECK_ABI(sha256_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_SSSE3)
    if (sha256_ssse3_capable()) {
      CHECK_ABI(sha256_block_data_order_ssse3, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_NOHW)
    CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
// Runs every compiled-in SHA-512 block function under the ABI checker for 1,
// 2, 3 and 4 blocks of zeros, gated by the platform's SHA512_ASM_* macros and
// runtime capability checks.
TEST(SHATest, SHA512ABI) {
  SHA512_CTX ctx;
  SHA512_Init(&ctx);
  static const uint8_t kBuf[SHA512_CBLOCK * 4] = {0};
  for (size_t blocks : {1, 2, 3, 4}) {
#if defined(SHA512_ASM)
    CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA512_ASM_HW)
    if (sha512_hw_capable()) {
      CHECK_ABI(sha512_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha512_avx_capable()) {
      CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_NOHW)
    CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
#endif // defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC)