chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,867 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
// Round-constant and permutation table used by the AES key-schedule
// routines below (loaded via adrp/:lo12: as Lrcon).
.section .rodata
.align 5
Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl aes_hw_set_encrypt_key
.def aes_hw_set_encrypt_key
.type 32
.endef
.align 5
// int aes_hw_set_encrypt_key(user_key=x0, bits=w1, key=x2)
// Expands a 128/192/256-bit AES key into the round-key schedule at x2
// using the ARMv8 AES instructions, and stores the round count
// (10/12/14) at the end of the schedule. Returns in x0: 0 on success,
// -1 if either pointer argument is NULL, -2 if w1 is not a supported
// bit length.
aes_hw_set_encrypt_key:
.cfi_startproc
Lenc_key:
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#3] // kFlag_aes_hw_set_encrypt_key
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
// x3 holds the prospective return code while arguments are validated.
mov x3,#-1
cmp x0,#0
b.eq Lenc_key_abort
cmp x2,#0
b.eq Lenc_key_abort
mov x3,#-2
cmp w1,#128
b.lt Lenc_key_abort
cmp w1,#256
b.gt Lenc_key_abort
tst w1,#0x3f // bits must be a multiple of 64 within [128,256]
b.ne Lenc_key_abort
// Load the rcon/rotate tables and the first 16 key bytes, then
// dispatch on key size (w1 < 192 -> 128, == 192, else 256).
adrp x3,Lrcon
add x3,x3,:lo12:Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt Loop128
b.eq L192
b L256
.align 4
// 128-bit key: v2 holds the rotate-and-splat permutation, v1 the
// current round constant (doubled by shl each iteration).
Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b // SubBytes via aese with zero round key
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10 // 10 rounds for AES-128
b Ldone
.align 4
// 192-bit key: key material advances in 24-byte steps.
L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne Loop192
mov w12,#12 // 12 rounds for AES-192
add x2,x2,#0x20
b Ldone
.align 4
// 256-bit key: alternate halves of the key are expanded per iteration.
L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14 // 14 rounds for AES-256
st1 {v3.4s},[x2],#16
Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b Loop256
Ldone:
str w12,[x2] // store round count after the schedule
mov x3,#0 // success
Lenc_key_abort:
mov x0,x3 // return value
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.globl aes_hw_set_decrypt_key
.def aes_hw_set_decrypt_key
.type 32
.endef
.align 5
// int aes_hw_set_decrypt_key(user_key=x0, bits=w1, key=x2)
// Builds the decryption key schedule: expands the encryption schedule
// via Lenc_key, then reverses the round-key order and applies aesimc
// (InvMixColumns) to the inner round keys. Returns 0 on success in x0,
// otherwise the error code propagated from Lenc_key.
aes_hw_set_decrypt_key:
.cfi_startproc
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
bl Lenc_key
cmp x0,#0
b.ne Ldec_key_abort
sub x2,x2,#240 // restore original x2
mov x4,#-16 // x0 walks backwards in 16-byte steps
add x0,x2,x12,lsl#4 // end of key schedule
// Swap the outermost pair of round keys; these are not transformed.
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
// Walk inward from both ends, swapping round keys and applying
// InvMixColumns to each.
Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi Loop_imc
// Middle round key: transform in place.
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
Ldec_key_abort:
ldp x29,x30,[sp],#16
.cfi_restore x29
.cfi_restore x30
.cfi_def_cfa_offset 0
AARCH64_VALIDATE_LINK_REGISTER
ret
.cfi_endproc
.globl aes_hw_encrypt
.def aes_hw_encrypt
.type 32
.endef
.align 5
// void aes_hw_encrypt(in=x0, out=x1, key=x2)
// Encrypts a single 16-byte block. w3 is loaded with the round count
// from key[240]; the loop consumes two round keys per iteration.
aes_hw_encrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240] // round count
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_enc:
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_enc
// Final two rounds: the last aese is not followed by aesmc; the last
// round key is applied with a plain eor.
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.globl aes_hw_decrypt
.def aes_hw_decrypt
.type 32
.endef
.align 5
// void aes_hw_decrypt(in=x0, out=x1, key=x2)
// Decrypts a single 16-byte block with the (InvMixColumns-transformed)
// decryption key schedule; mirror image of aes_hw_encrypt using
// aesd/aesimc.
aes_hw_decrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
// NOTE(review): stores to the same flag byte (#1) as aes_hw_encrypt;
// confirm against the kFlag_* indices declared for
// BORINGSSL_function_hit in BoringSSL's aes headers.
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240] // round count
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_dec:
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_dec
// Final two rounds: last aesd has no aesimc; last round key applied
// with a plain eor.
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.globl aes_hw_cbc_encrypt
.def aes_hw_cbc_encrypt
.type 32
.endef
.align 5
// void aes_hw_cbc_encrypt(in=x0, out=x1, len=x2, key=x3, ivec=x4, enc=w5)
// CBC-mode en/decryption; len is truncated to a multiple of 16 bytes.
// The IV is read from x4 and the final chaining value is written back
// there. w5 non-zero selects encryption, zero selects decryption.
aes_hw_cbc_encrypt:
.cfi_startproc
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
subs x2,x2,#16
mov x8,#16 // x8 = input step; zeroed for the last block
b.lo Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4] // v6 = IV / chaining value
ld1 {v0.16b},[x0],x8
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7] // v7 = last round key
add x7,x3,#32
mov w6,w5
b.eq Lcbc_dec
cmp w5,#2
// Encryption path: chain the IV into the first block; v5 pre-combines
// rndkey[0] with the last round key so the next input can be folded in.
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq Lcbc_enc128
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b Lenter_cbc_enc
.align 4
Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq Lcbc_enc192
// AES-256 only: two extra rounds before joining the common tail.
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
// AES-128 encryption: all round keys stay resident in registers.
Lcbc_enc128:
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b Lenter_cbc_enc128
Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
// Decryption path: interleaves three blocks per iteration.
Lcbc_dec:
ld1 {v18.16b},[x0],#16
subs x2,x2,#32 // bias
add w6,w5,#2
orr v3.16b,v0.16b,v0.16b
orr v1.16b,v0.16b,v0.16b
orr v19.16b,v18.16b,v18.16b
b.lo Lcbc_dec_tail
orr v1.16b,v18.16b,v18.16b
ld1 {v18.16b},[x0],#16
orr v2.16b,v0.16b,v0.16b
orr v3.16b,v1.16b,v1.16b
orr v19.16b,v18.16b,v18.16b
Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b // fold chaining values with last key
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs Loop3x_cbc_dec
cmn x2,#0x30
b.eq Lcbc_done
nop
// Tail: one or two remaining blocks, decrypted in v1 and v18.
Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Lcbc_dec_tail
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b Lcbc_done
Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
Lcbc_done:
st1 {v6.16b},[x4] // write back final chaining value
Lcbc_abort:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.globl aes_hw_ctr32_encrypt_blocks
.def aes_hw_ctr32_encrypt_blocks
.type 32
.endef
.align 5
// void aes_hw_ctr32_encrypt_blocks(in=x0, out=x1, blocks=x2, key=x3, ivec=x4)
// CTR-mode encryption of x2 full 16-byte blocks. The 32-bit big-endian
// counter occupies the last word of the IV ([x4,#12]); the main loop
// processes three counter blocks per iteration.
aes_hw_ctr32_encrypt_blocks:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9] // kFlag_aes_hw_ctr32_encrypt_blocks
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
ldr w5,[x3,#240]
ldr w8, [x4, #12] // w8 = counter word of the IV
ld1 {v0.4s},[x4]
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7] // v7 = last round key
add x7,x3,#32
mov w6,w5
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips these bugs.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __AARCH64EB__
rev w8, w8 // counter is big-endian in memory
#endif
add w10, w8, #1
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
orr v18.16b,v6.16b,v6.16b
b Loop3x_ctr32
.align 4
// Main loop: v0/v1/v18 hold three consecutive counter blocks.
Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_ctr32
// Final rounds, interleaved with loading the next three input blocks
// and preparing the next three counter values.
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs Loop3x_ctr32
adds x2,x2,#3
b.eq Lctr32_done
// Tail: one or two remaining blocks, keystreamed in v0 and v1.
Lctr32_tail:
cmp x2,#1
b.lt Lctr32_done // if len = 0, go to done
mov x12,#16
csel x12,xzr,x12,eq
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
cbz x12,Lctr32_done // if step = 0 (len = 1), go to done
st1 {v3.16b},[x1]
Lctr32_done:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,93 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
// Adds num 64-bit words of ap and bp into rp; returns the final carry
// (0 or 1) in x0. The loop counter is maintained with sub/cbnz, which
// do not touch the flags, so the adcs carry chain is preserved across
// iterations.
.globl bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, Ladd_tail
Ladd_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
adcs x4, x4, x6
adcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, Ladd_loop
Ladd_tail:
# Handle the odd trailing word, if any.
cbz x3, Ladd_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
adcs x4, x4, x6
str x4, [x0], #8
Ladd_exit:
cset x0, cs // return the carry out
ret
.cfi_endproc
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
// Subtracts num 64-bit words of bp from ap into rp; returns 1 in x0 if
// the subtraction borrowed, else 0. As above, the loop counter uses
// sub/cbnz so the sbcs borrow chain is preserved across iterations.
.globl bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, Lsub_tail
Lsub_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
sbcs x4, x4, x6
sbcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, Lsub_loop
Lsub_tail:
# Handle the odd trailing word, if any.
cbz x3, Lsub_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
sbcs x4, x4, x6
str x4, [x0], #8
Lsub_exit:
cset x0, cc // C clear = a borrow occurred
ret
.cfi_endproc
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,341 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
.globl gcm_init_neon
.def gcm_init_neon
.type 32
.endef
.align 4
// void gcm_init_neon(Htable=x0, H=x1)
// Precomputes the "twisted" GHASH key: H shifted left by one bit and
// conditionally folded with the reduction constant 0xc2...01, stored
// as Htable[0].
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
shl v19.2d, v19.2d, #57 // 0xc2.0
ext v3.16b, v17.16b, v17.16b, #8
ushr v18.2d, v19.2d, #63
dup v17.4s, v17.s[1]
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
ushr v18.2d, v3.2d, #63
sshr v17.4s, v17.4s, #31 // broadcast carry bit
and v18.16b, v18.16b, v16.16b
shl v3.2d, v3.2d, #1
ext v18.16b, v18.16b, v18.16b, #8
and v16.16b, v16.16b, v17.16b
orr v3.16b, v3.16b, v18.16b // H<<<=1
eor v5.16b, v3.16b, v16.16b // twisted H
st1 {v5.2d}, [x0] // store Htable[0]
ret
.globl gcm_gmult_neon
.def gcm_gmult_neon
.type 32
.endef
.align 4
// void gcm_gmult_neon(Xi=x0, Htable=x1)
// Multiplies the 128-bit accumulator Xi by the twisted H, in place.
// Sets x3 = 16 and tail-branches into the shared Lgmult_neon body
// inside gcm_ghash_neon below, so exactly one iteration runs.
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks // load constants
add x9, x9, :lo12:Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v3.16b, v3.16b // byteswap Xi
ext v3.16b, v3.16b, v3.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
mov x3, #16
b Lgmult_neon
.globl gcm_ghash_neon
.def gcm_ghash_neon
.type 32
.endef
.align 4
// void gcm_ghash_neon(Xi=x0, Htable=x1, inp=x2, len=x3)
// Folds len bytes (multiple of 16) of inp into the GHASH accumulator
// Xi. Uses only baseline NEON 8-bit polynomial multiplies (pmull .8b)
// plus Karatsuba, i.e. no PMULL/crypto extension needed.
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks // load constants
add x9, x9, :lo12:Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v0.16b, v0.16b // byteswap Xi
ext v0.16b, v0.16b, v0.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
// Main loop: absorb one 16-byte input block per iteration.
Loop_neon:
ld1 {v3.16b}, [x2], #16 // load inp
rev64 v3.16b, v3.16b // byteswap inp
ext v3.16b, v3.16b, v3.16b, #8
eor v3.16b, v3.16b, v0.16b // inp ^= Xi
// Entry point shared with gcm_gmult_neon (x3 = remaining byte count).
Lgmult_neon:
// Split the input into v3 and v4. (The upper halves are unused,
// so it is okay to leave them alone.)
ins v4.d[0], v3.d[1]
ext v16.8b, v5.8b, v5.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v0.8b, v3.8b, v3.8b, #1 // B1
pmull v0.8h, v5.8b, v0.8b // E = A*B1
ext v17.8b, v5.8b, v5.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v5.8b, v19.8b // G = A*B2
ext v18.8b, v5.8b, v5.8b, #3 // A3
eor v16.16b, v16.16b, v0.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v0.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v0.8h, v5.8b, v0.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v0.16b // N = I + J
pmull v19.8h, v5.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v0.8h, v5.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v0.16b, v0.16b, v16.16b
eor v0.16b, v0.16b, v18.16b
// Middle Karatsuba term: (lo^hi of input) times (lo^hi of H), in v7.
eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
ext v16.8b, v7.8b, v7.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v1.8b, v3.8b, v3.8b, #1 // B1
pmull v1.8h, v7.8b, v1.8b // E = A*B1
ext v17.8b, v7.8b, v7.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v7.8b, v19.8b // G = A*B2
ext v18.8b, v7.8b, v7.8b, #3 // A3
eor v16.16b, v16.16b, v1.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v1.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v1.8h, v7.8b, v1.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v1.16b // N = I + J
pmull v19.8h, v7.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v1.8h, v7.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v1.16b, v1.16b, v16.16b
eor v1.16b, v1.16b, v18.16b
// High Karatsuba term: input high half (v4) times H high half (v6).
ext v16.8b, v6.8b, v6.8b, #1 // A1
pmull v16.8h, v16.8b, v4.8b // F = A1*B
ext v2.8b, v4.8b, v4.8b, #1 // B1
pmull v2.8h, v6.8b, v2.8b // E = A*B1
ext v17.8b, v6.8b, v6.8b, #2 // A2
pmull v17.8h, v17.8b, v4.8b // H = A2*B
ext v19.8b, v4.8b, v4.8b, #2 // B2
pmull v19.8h, v6.8b, v19.8b // G = A*B2
ext v18.8b, v6.8b, v6.8b, #3 // A3
eor v16.16b, v16.16b, v2.16b // L = E + F
pmull v18.8h, v18.8b, v4.8b // J = A3*B
ext v2.8b, v4.8b, v4.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v2.8h, v6.8b, v2.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v4.8b, v4.8b, #4 // B4
eor v18.16b, v18.16b, v2.16b // N = I + J
pmull v19.8h, v6.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v2.8h, v6.8b, v4.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v2.16b, v2.16b, v16.16b
eor v2.16b, v2.16b, v18.16b
// Karatsuba recombination of the three partial products.
ext v16.16b, v0.16b, v2.16b, #8
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
eor v1.16b, v1.16b, v2.16b
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins v2.d[0], v1.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl v17.2d, v0.2d, #57 // 1st phase
shl v18.2d, v0.2d, #62
eor v18.16b, v18.16b, v17.16b //
shl v17.2d, v0.2d, #63
eor v18.16b, v18.16b, v17.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor v18.16b, v18.16b, v1.16b
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
ushr v18.2d, v0.2d, #1 // 2nd phase
eor v2.16b, v2.16b,v0.16b
eor v0.16b, v0.16b,v18.16b //
ushr v18.2d, v18.2d, #6
ushr v0.2d, v0.2d, #1 //
eor v0.16b, v0.16b, v2.16b //
eor v0.16b, v0.16b, v18.16b //
subs x3, x3, #16
bne Loop_neon
rev64 v0.16b, v0.16b // byteswap Xi and write
ext v0.16b, v0.16b, v0.16b, #8
st1 {v0.16b}, [x0]
ret
.section .rodata
.align 4
// Bit masks consumed by the Karatsuba split in the NEON GHASH code
// above (loaded into v24/v25 via Lmasks).
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
// ASCII: "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>"
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,673 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.globl gcm_init_v8
.def gcm_init_v8
.type 32
.endef
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
ext v20.16b, v20.16b, v20.16b, #8
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull2 v0.1q,v20.2d,v20.2d
eor v16.16b,v16.16b,v20.16b
pmull v2.1q,v20.1d,v20.1d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v17.16b,v0.16b,v18.16b
ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d},[x0],#16 //store Htable[1..2]
st1 {v22.2d},[x0],#16 //store Htable[1..2]
//calculate H^3 and H^4
pmull2 v0.1q,v20.2d, v22.2d
pmull2 v5.1q,v22.2d,v22.2d
pmull v2.1q,v20.1d, v22.1d
pmull v7.1q,v22.1d,v22.1d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b, v0.16b,v18.16b //H^3
eor v17.16b, v5.16b,v4.16b //H^4
ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v25.16b,v17.16b,v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v23.16b
eor v17.16b,v17.16b,v25.16b
eor v18.16b,v18.16b,v22.16b
ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
//calculate H^5 and H^6
pmull2 v0.1q,v22.2d, v23.2d
pmull2 v5.1q,v23.2d,v23.2d
pmull v2.1q,v22.1d, v23.1d
pmull v7.1q,v23.1d,v23.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v16.1d,v16.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^5
eor v17.16b,v5.16b,v4.16b //H^6
ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing
ext v28.16b, v17.16b, v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v26.16b
eor v17.16b,v17.16b,v28.16b
eor v18.16b,v18.16b,v22.16b
ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
//calculate H^7 and H^8
pmull2 v0.1q,v22.2d,v26.2d
pmull2 v5.1q,v22.2d,v28.2d
pmull v2.1q,v22.1d,v26.1d
pmull v7.1q,v22.1d,v28.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v17.1d,v18.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^7
eor v17.16b,v5.16b,v4.16b //H^8
ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v31.16b,v17.16b,v17.16b,#8
eor v16.16b,v16.16b,v29.16b
eor v17.16b,v17.16b,v31.16b
ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11]
ret
//-----------------------------------------------------------------------
// gcm_gmult_v8 -- multiply the GHASH accumulator Xi by the hash key H
// once in GF(2^128), using the Armv8 Crypto Extension polynomial
// multiplier (pmull/pmull2) with a Karatsuba 3-multiply scheme.
//
// In:   x0 = Xi (16-byte accumulator, updated in place)
//       x1 = Htable: twisted H in the first 16 bytes, followed by the
//            pre-computed (H.lo^H.hi) halves used for the Karatsuba
//            middle product (as stored by the init code in this file)
// Out:  Xi at [x0] overwritten with Xi*H reduced mod the GHASH poly
// Clobbers: v0-v3, v17-v21, no general-purpose registers
//-----------------------------------------------------------------------
.globl gcm_gmult_v8
.def gcm_gmult_v8
.type 32
.endef
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
ext v20.16b,v20.16b,v20.16b,#8
shl v19.2d,v19.2d,#57 //compose 0xc2.0 reduction constant
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b //byte-swap Xi on little-endian
#endif
ext v3.16b,v17.16b,v17.16b,#8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b //middle 128 bits of the 256-bit product
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b //back to memory byte order
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
//-----------------------------------------------------------------------
// gcm_ghash_v8 -- fold len bytes of input into the GHASH accumulator:
// for each 16-byte block I, Xi = (Xi ^ I) * H, all in GF(2^128).
//
// In:   x0 = Xi (16-byte accumulator, updated in place)
//       x1 = Htable (twisted H, (H.lo^H.hi), twisted H^2)
//       x2 = inp (input blocks)
//       x3 = len in bytes; assumed a multiple of 16, per the
//            block-wise loads below -- confirm against callers
//
// For len >= 64 this tail-calls the 4-blocks-per-iteration version;
// otherwise it runs a modulo-scheduled loop processing two blocks per
// iteration (using H^2), with a one-block odd tail.
// Clobbers: v0-v7, v16-v22, x12, flags.
//-----------------------------------------------------------------------
.globl gcm_ghash_v8
.def gcm_ghash_v8
.type 32
.endef
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
cmp x3,#64
b.hs Lgcm_ghash_v8_4x //large input: use 4-block-stride code
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
ext v22.16b,v22.16b,v22.16b,#8
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b Loop_mod2x_v8
.align 4
//Two blocks per iteration: (Xi^I[i])·H^2 ^ I[i+1]·H, sharing one
//reduction.  Loads for the next pair are interleaved with the math.
Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq Ldone_v8 //is x3 zero?
//One final block: classic single multiply by H plus reduction.
Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
Ldone_v8:
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b //back to memory byte order
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
//-----------------------------------------------------------------------
// gcm_ghash_v8_4x -- GHASH inner routine processing four 16-byte blocks
// per iteration using precomputed powers H..H^4, so that
//   Xi = (((Xi^I0)·H^4) ^ I1·H^3 ^ I2·H^2 ^ I3·H)
// is evaluated with a single reduction per four blocks.
//
// Local (not .globl) entry, reached from gcm_ghash_v8 when len >= 64.
// In:   x0 = Xi, x1 = Htable (twisted H, (H.lo^H.hi), H^2, then
//       H^3, (H^3.lo^H^3.hi), H^4), x2 = inp, x3 = len (bytes).
// Tail code after Loop4x handles a remainder of 1, 2 or 3 blocks.
// Clobbers: v0-v7, v16-v31, flags.
//-----------------------------------------------------------------------
.def gcm_ghash_v8_4x
.type 32
.endef
.align 4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
ext v22.16b,v22.16b,v22.16b,#8
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
ext v26.16b,v26.16b,v26.16b,#8
ext v28.16b,v28.16b,v28.16b,#8
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 //load first four blocks I[0..3]
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
//Pre-compute the partial products of I[1..3] with H..H^3; they are
//accumulated into v29/v30/v31 and folded in on the next iteration.
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#128 //four blocks consumed + four blocks look-ahead
b.lo Ltail4x
b Loop4x
.align 4
//Steady state: finish (Xi^I[i])·H^4, fold in the pending v29/v30/v31
//sums, reduce, and overlap the partial products for the next four
//blocks with the reduction of the current group.
Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#64
b.hs Loop4x
//Close out the last full group of four blocks, then dispatch on the
//number of leftover blocks (0..3).
Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
adds x3,x3,#64 //undo look-ahead subtraction
b.eq Ldone4x
cmp x3,#32
b.lo Lone
b.eq Ltwo
//Three leftover blocks: reduce current sum, then (Xi^I0)·H^3 ^ I1·H^2 ^ I2·H.
Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x
.align 4
//Two leftover blocks: reduce current sum, then (Xi^I0)·H^2 ^ I1·H.
Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x
.align 4
//One leftover block: reduce current sum, then (Xi^I0)·H.
Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v20.1d,v3.1d //H·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d
//Final Karatsuba recombination + reduction, then store Xi.
Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
ret
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,251 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
//Round-constant ("iota") table for the 24 rounds of Keccak-f[1600].
//The 256-byte alignment plus 64 bytes of zero padding place the 24
//8-byte constants (192 bytes) so the table ends exactly on the next
//256-byte boundary: the consuming loop below detects completion by
//testing the low byte of its post-incremented pointer (tst ...,#255)
//instead of keeping a round counter.
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
.quad 0,0,0,0,0,0,0,0 // padding so iotas_hw+24*8 is 256-aligned
iotas_hw:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
//-----------------------------------------------------------------------
// KeccakF1600_int -- inner Keccak-f[1600] permutation, entirely in
// general-purpose registers.  The 25 64-bit lanes live in
// x0-x17, x25, x19-x24 (note: x18, the platform register, is skipped;
// x25 takes its lane).  Scratch: x26-x28, x30.
//
// Calling convention is internal: the caller must have reserved at
// least 32 bytes at sp -- [sp,#0..15] is used to spill a lane pair
// during Theta, [sp,#16] holds the advancing iotas_hw pointer, and
// [sp,#24] keeps the return address (x30 doubles as scratch inside
// the loop).  Runs all 24 rounds; the end-of-table test relies on
// iotas_hw's 256-byte alignment trick (see the table above).
//-----------------------------------------------------------------------
.def KeccakF1600_int
.type 32
.endef
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr x28,iotas_hw
stp x28,x30,[sp,#16] // 32 bytes on top are mine
b Loop
.align 4
Loop:
////////////////////////////////////////// Theta
//Column parities C[x] accumulate in x26,x27,x28,x30,x4.
eor x26,x0,x5
stp x4,x9,[sp,#0] // offload pair...
eor x27,x1,x6
eor x28,x2,x7
eor x30,x3,x8
eor x4,x4,x9
eor x26,x26,x10
eor x27,x27,x11
eor x28,x28,x12
eor x30,x30,x13
eor x4,x4,x14
eor x26,x26,x15
eor x27,x27,x16
eor x28,x28,x17
eor x30,x30,x25
eor x4,x4,x19
eor x26,x26,x20
eor x28,x28,x22
eor x27,x27,x21
eor x30,x30,x23
eor x4,x4,x24
//D[x] = C[x-1] ^ rot(C[x+1],1), applied to each column.
eor x9,x26,x28,ror#63
eor x1,x1,x9
eor x6,x6,x9
eor x11,x11,x9
eor x16,x16,x9
eor x21,x21,x9
eor x9,x27,x30,ror#63
eor x28,x28,x4,ror#63
eor x30,x30,x26,ror#63
eor x4,x4,x27,ror#63
eor x27, x2,x9 // mov x27,x2
eor x7,x7,x9
eor x12,x12,x9
eor x17,x17,x9
eor x22,x22,x9
eor x0,x0,x4
eor x5,x5,x4
eor x10,x10,x4
eor x15,x15,x4
eor x20,x20,x4
ldp x4,x9,[sp,#0] // re-load offloaded data
eor x26, x3,x28 // mov x26,x3
eor x8,x8,x28
eor x13,x13,x28
eor x25,x25,x28
eor x23,x23,x28
eor x28, x4,x30 // mov x28,x4
eor x9,x9,x30
eor x14,x14,x30
eor x19,x19,x30
eor x24,x24,x30
////////////////////////////////////////// Rho+Pi
//Lane rotations fused with the Pi permutation; note the ror amounts
//are 64 minus the left-rotation offsets of the specification.
mov x30,x1
ror x1,x6,#20
//mov x27,x2
ror x2,x12,#21
//mov x26,x3
ror x3,x25,#43
//mov x28,x4
ror x4,x24,#50
ror x6,x9,#44
ror x12,x13,#39
ror x25,x17,#49
ror x24,x21,#62
ror x9,x22,#3
ror x13,x19,#56
ror x17,x11,#54
ror x21,x8,#9
ror x22,x14,#25
ror x19,x23,#8
ror x11,x7,#58
ror x8,x16,#19
ror x14,x20,#46
ror x23,x15,#23
ror x7,x10,#61
ror x16,x5,#28
ror x5,x26,#36
ror x10,x30,#63
ror x15,x28,#37
ror x20,x27,#2
////////////////////////////////////////// Chi+Iota
//A[x] ^= ~A[x+1] & A[x+2], row by row; the round constant load and
//end-of-table test are interleaved with the first row.
bic x26,x2,x1
bic x27,x3,x2
bic x28,x0,x4
bic x30,x1,x0
eor x0,x0,x26
bic x26,x4,x3
eor x1,x1,x27
ldr x27,[sp,#16]
eor x3,x3,x28
eor x4,x4,x30
eor x2,x2,x26
ldr x30,[x27],#8 // Iota[i++]
bic x26,x7,x6
tst x27,#255 // are we done?
str x27,[sp,#16]
bic x27,x8,x7
bic x28,x5,x9
eor x0,x0,x30 // A[0][0] ^= Iota
bic x30,x6,x5
eor x5,x5,x26
bic x26,x9,x8
eor x6,x6,x27
eor x8,x8,x28
eor x9,x9,x30
eor x7,x7,x26
bic x26,x12,x11
bic x27,x13,x12
bic x28,x10,x14
bic x30,x11,x10
eor x10,x10,x26
bic x26,x14,x13
eor x11,x11,x27
eor x13,x13,x28
eor x14,x14,x30
eor x12,x12,x26
bic x26,x17,x16
bic x27,x25,x17
bic x28,x15,x19
bic x30,x16,x15
eor x15,x15,x26
bic x26,x19,x25
eor x16,x16,x27
eor x25,x25,x28
eor x19,x19,x30
eor x17,x17,x26
bic x26,x22,x21
bic x27,x23,x22
bic x28,x20,x24
bic x30,x21,x20
eor x20,x20,x26
bic x26,x24,x23
eor x21,x21,x27
eor x23,x23,x28
eor x24,x24,x30
eor x22,x22,x26
bne Loop // more iotas => more rounds
ldr x30,[sp,#24] // restore return address
AARCH64_VALIDATE_LINK_REGISTER
ret
//-----------------------------------------------------------------------
// KeccakF1600_hw -- public entry: apply Keccak-f[1600] to a 25-lane
// (200-byte) state in memory.
//
// In:   x0 = pointer to the state A[25] (uint64 lanes, updated in place)
// Saves all callee-saved GPRs (x19-x28, x29/x30), loads the 25 lanes
// into registers (x0..x17, x25, x19..x24 -- matching the register
// assignment expected by KeccakF1600_int), reserves the 48-byte
// scratch area the inner routine uses, runs the permutation, and
// writes the lanes back.
//-----------------------------------------------------------------------
.globl KeccakF1600_hw
.def KeccakF1600_hw
.type 32
.endef
.align 5
KeccakF1600_hw:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48 // scratch for KeccakF1600_int
str x0,[sp,#32] // offload argument
mov x26,x0 // x0 itself becomes lane A[0][0]
ldp x0,x1,[x0,#16*0]
ldp x2,x3,[x26,#16*1]
ldp x4,x5,[x26,#16*2]
ldp x6,x7,[x26,#16*3]
ldp x8,x9,[x26,#16*4]
ldp x10,x11,[x26,#16*5]
ldp x12,x13,[x26,#16*6]
ldp x14,x15,[x26,#16*7]
ldp x16,x17,[x26,#16*8]
ldp x25,x19,[x26,#16*9] // x25, not x18: platform register avoided
ldp x20,x21,[x26,#16*10]
ldp x22,x23,[x26,#16*11]
ldr x24,[x26,#16*12]
bl KeccakF1600_int
ldr x26,[sp,#32] // recover state pointer
stp x0,x1,[x26,#16*0]
stp x2,x3,[x26,#16*1]
stp x4,x5,[x26,#16*2]
stp x6,x7,[x26,#16*3]
stp x8,x9,[x26,#16*4]
stp x10,x11,[x26,#16*5]
stp x12,x13,[x26,#16*6]
stp x14,x15,[x26,#16*7]
stp x16,x17,[x26,#16*8]
stp x25,x19,[x26,#16*9]
stp x20,x21,[x26,#16*10]
stp x22,x23,[x26,#16*11]
str x24,[x26,#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.byte 75,101,99,99,97,107,45,49,54,48,48,32,112,101,114,109,117,116,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,687 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
.text
.globl md5_block_asm_data_order
md5_block_asm_data_order:
.cfi_startproc
// Save all callee-saved registers
stp x19,x20,[sp,#-80]!
.cfi_def_cfa_offset 80
.cfi_offset x19, -80
.cfi_offset x20, -72
stp x21,x22,[sp,#16]
.cfi_offset x21, -64
.cfi_offset x22, -56
stp x23,x24,[sp,#32]
.cfi_offset x23, -48
.cfi_offset x24, -40
stp x25,x26,[sp,#48]
.cfi_offset x25, -32
.cfi_offset x26, -24
stp x27,x28,[sp,#64]
.cfi_offset x27, -16
.cfi_offset x28, -8
ldp w10, w11, [x0, #0] // Load MD5 state->A and state->B
ldp w12, w13, [x0, #8] // Load MD5 state->C and state->D
.align 5
Lmd5_blocks_loop:
eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
ldp x15, x3, [x1] // Load 4 words of input data0 M[0]/0
eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xa478 // Load lower half of constant 0xd76aa478
movk x9, #0xd76a, lsl #16 // Load upper half of constant 0xd76aa478
add w8, w10, w15 // Add dest value
add w7, w8, w9 // Add constant 0xd76aa478
add w6, w7, w14 // Add aux function result
ror w6, w6, #25 // Rotate left s=7 bits
eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xb756 // Load lower half of constant 0xe8c7b756
movk x16, #0xe8c7, lsl #16 // Load upper half of constant 0xe8c7b756
lsr x20, x15, #32 // Right shift high input value containing M[1]
add w9, w13, w20 // Add dest value
add w7, w9, w16 // Add constant 0xe8c7b756
add w14, w7, w17 // Add aux function result
ror w14, w14, #20 // Rotate left s=12 bits
eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0x70db // Load lower half of constant 0x242070db
movk x16, #0x2420, lsl #16 // Load upper half of constant 0x242070db
add w7, w12, w3 // Add dest value
add w17, w7, w16 // Add constant 0x242070db
add w14, w17, w9 // Add aux function result
ror w14, w14, #15 // Rotate left s=17 bits
eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xceee // Load lower half of constant 0xc1bdceee
movk x9, #0xc1bd, lsl #16 // Load upper half of constant 0xc1bdceee
lsr x21, x3, #32 // Right shift high input value containing M[3]
add w14, w11, w21 // Add dest value
add w6, w14, w9 // Add constant 0xc1bdceee
add w7, w6, w16 // Add aux function result
ror w7, w7, #10 // Rotate left s=22 bits
eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
ldp x14, x7, [x1, #16] // Load 4 words of input data0 M[4]/0w
and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xfaf // Load lower half of constant 0xf57c0faf
movk x16, #0xf57c, lsl #16 // Load upper half of constant 0xf57c0faf
add w17, w4, w14 // Add dest value
add w16, w17, w16 // Add constant 0xf57c0faf
add w4, w16, w6 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4])
and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0xc62a // Load lower half of constant 0x4787c62a
movk x4, #0x4787, lsl #16 // Load upper half of constant 0x4787c62a
lsr x22, x14, #32 // Right shift high input value containing M[5]
add w16, w5, w22 // Add dest value
add w16, w16, w4 // Add constant 0x4787c62a
add w5, w16, w6 // Add aux function result
ror w5, w5, #20 // Rotate left s=12 bits
eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5])
and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0x4613 // Load lower half of constant 0xa8304613
movk x4, #0xa830, lsl #16 // Load upper half of constant 0xa8304613
add w6, w8, w7 // Add dest value
add w8, w6, w4 // Add constant 0xa8304613
add w4, w8, w5 // Add aux function result
ror w4, w4, #15 // Rotate left s=17 bits
eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6])
and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x6, #0x9501 // Load lower half of constant 0xfd469501
movk x6, #0xfd46, lsl #16 // Load upper half of constant 0xfd469501
lsr x23, x7, #32 // Right shift high input value containing M[7]
add w9, w9, w23 // Add dest value
add w5, w9, w6 // Add constant 0xfd469501
add w9, w5, w4 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7])
ldp x5, x16, [x1, #32] // Load 4 words of input data0 M[8]/0
and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0x98d8 // Load lower half of constant 0x698098d8
movk x9, #0x6980, lsl #16 // Load upper half of constant 0x698098d8
add w17, w17, w5 // Add dest value
add w9, w17, w9 // Add constant 0x698098d8
add w17, w9, w6 // Add aux function result
ror w17, w17, #25 // Rotate left s=7 bits
eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8])
and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x17, #0xf7af // Load lower half of constant 0x8b44f7af
movk x17, #0x8b44, lsl #16 // Load upper half of constant 0x8b44f7af
lsr x24, x5, #32 // Right shift high input value containing M[9]
add w19, w19, w24 // Add dest value
add w17, w19, w17 // Add constant 0x8b44f7af
add w19, w17, w9 // Add aux function result
ror w19, w19, #20 // Rotate left s=12 bits
eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9])
and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0x5bb1 // Load lower half of constant 0xffff5bb1
movk x11, #0xffff, lsl #16 // Load upper half of constant 0xffff5bb1
add w8, w8, w16 // Add dest value
add w8, w8, w11 // Add constant 0xffff5bb1
add w8, w8, w9 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10])
and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0xd7be // Load lower half of constant 0x895cd7be
movk x11, #0x895c, lsl #16 // Load upper half of constant 0x895cd7be
lsr x25, x16, #32 // Right shift high input value containing M[11]
add w4, w4, w25 // Add dest value
add w4, w4, w11 // Add constant 0x895cd7be
add w9, w4, w9 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11])
ldp x11, x12, [x1, #48] // Load 4 words of input data0 M[12]/0
and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x1122 // Load lower half of constant 0x6b901122
movk x19, #0x6b90, lsl #16 // Load upper half of constant 0x6b901122
add w6, w6, w11 // Add dest value
add w6, w6, w19 // Add constant 0x6b901122
add w4, w6, w4 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12])
and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x7193 // Load lower half of constant 0xfd987193
movk x19, #0xfd98, lsl #16 // Load upper half of constant 0xfd987193
lsr x26, x11, #32 // Right shift high input value containing M[13]
add w17, w17, w26 // Add dest value
add w17, w17, w19 // Add constant 0xfd987193
add w17, w17, w6 // Add aux function result
ror w17, w17, #20 // Rotate left s=12 bits
eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13])
and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x438e // Load lower half of constant 0xa679438e
movk x13, #0xa679, lsl #16 // Load upper half of constant 0xa679438e
add w8, w8, w12 // Add dest value
add w8, w8, w13 // Add constant 0xa679438e
add w8, w8, w6 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14])
and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x821 // Load lower half of constant 0x49b40821
movk x13, #0x49b4, lsl #16 // Load upper half of constant 0x49b40821
lsr x27, x12, #32 // Right shift high input value containing M[15]
add w9, w9, w27 // Add dest value
add w9, w9, w13 // Add constant 0x49b40821
add w9, w9, w6 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
movz x13, #0x2562 // Load lower half of constant 0xf61e2562
movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562
add w4, w4, w20 // Add dest value
add w4, w4, w13 // Add constant 0xf61e2562
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
movz x13, #0xb340 // Load lower half of constant 0xc040b340
movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340
add w17, w17, w7 // Add dest value
add w17, w17, w13 // Add constant 0xc040b340
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
movz x13, #0x5a51 // Load lower half of constant 0x265e5a51
movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51
add w8, w8, w25 // Add dest value
add w8, w8, w13 // Add constant 0x265e5a51
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa
movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa
add w9, w9, w15 // Add dest value
add w9, w9, w13 // Add constant 0xe9b6c7aa
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
movz x13, #0x105d // Load lower half of constant 0xd62f105d
movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d
add w4, w4, w22 // Add dest value
add w4, w4, w13 // Add constant 0xd62f105d
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
movz x13, #0x1453 // Load lower half of constant 0x2441453
movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453
add w17, w17, w16 // Add dest value
add w17, w17, w13 // Add constant 0x2441453
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
movz x13, #0xe681 // Load lower half of constant 0xd8a1e681
movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681
add w8, w8, w27 // Add dest value
add w8, w8, w13 // Add constant 0xd8a1e681
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8
movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8
add w9, w9, w14 // Add dest value
add w9, w9, w13 // Add constant 0xe7d3fbc8
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6
movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6
add w4, w4, w24 // Add dest value
add w4, w4, w13 // Add constant 0x21e1cde6
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
movz x13, #0x7d6 // Load lower half of constant 0xc33707d6
movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6
add w17, w17, w12 // Add dest value
add w17, w17, w13 // Add constant 0xc33707d6
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
movz x13, #0xd87 // Load lower half of constant 0xf4d50d87
movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87
add w8, w8, w21 // Add dest value
add w8, w8, w13 // Add constant 0xf4d50d87
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
movz x13, #0x14ed // Load lower half of constant 0x455a14ed
movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed
add w9, w9, w5 // Add dest value
add w9, w9, w13 // Add constant 0x455a14ed
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
movz x13, #0xe905 // Load lower half of constant 0xa9e3e905
movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905
add w4, w4, w26 // Add dest value
add w4, w4, w13 // Add constant 0xa9e3e905
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8
movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8
add w17, w17, w3 // Add dest value
add w17, w17, w13 // Add constant 0xfcefa3f8
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
movz x13, #0x2d9 // Load lower half of constant 0x676f02d9
movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9
add w8, w8, w23 // Add dest value
add w8, w8, w13 // Add constant 0x676f02d9
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a
movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a
add w9, w9, w11 // Add dest value
add w9, w9, w13 // Add constant 0x8d2a4c8a
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #12 // Rotate left s=20 bits
movz x10, #0x3942 // Load lower half of constant 0xfffa3942
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12])
movk x10, #0xfffa, lsl #16 // Load upper half of constant 0xfffa3942
add w4, w4, w22 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xfffa3942
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xf681 // Load lower half of constant 0x8771f681
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5])
movk x10, #0x8771, lsl #16 // Load upper half of constant 0x8771f681
add w17, w17, w5 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x8771f681
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x6122 // Load lower half of constant 0x6d9d6122
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8])
movk x13, #0x6d9d, lsl #16 // Load upper half of constant 0x6d9d6122
add w8, w8, w25 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x6d9d6122
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x380c // Load lower half of constant 0xfde5380c
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11])
movk x13, #0xfde5, lsl #16 // Load upper half of constant 0xfde5380c
add w9, w9, w12 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xfde5380c
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xea44 // Load lower half of constant 0xa4beea44
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14])
movk x10, #0xa4be, lsl #16 // Load upper half of constant 0xa4beea44
add w4, w4, w20 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xa4beea44
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xcfa9 // Load lower half of constant 0x4bdecfa9
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1])
movk x10, #0x4bde, lsl #16 // Load upper half of constant 0x4bdecfa9
add w17, w17, w14 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x4bdecfa9
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x4b60 // Load lower half of constant 0xf6bb4b60
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4])
movk x13, #0xf6bb, lsl #16 // Load upper half of constant 0xf6bb4b60
add w8, w8, w23 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xf6bb4b60
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0xbc70 // Load lower half of constant 0xbebfbc70
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7])
movk x13, #0xbebf, lsl #16 // Load upper half of constant 0xbebfbc70
add w9, w9, w16 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xbebfbc70
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0x7ec6 // Load lower half of constant 0x289b7ec6
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10])
movk x10, #0x289b, lsl #16 // Load upper half of constant 0x289b7ec6
add w4, w4, w26 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0x289b7ec6
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x27fa // Load lower half of constant 0xeaa127fa
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13])
movk x10, #0xeaa1, lsl #16 // Load upper half of constant 0xeaa127fa
add w17, w17, w15 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xeaa127fa
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x3085 // Load lower half of constant 0xd4ef3085
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0])
movk x13, #0xd4ef, lsl #16 // Load upper half of constant 0xd4ef3085
add w8, w8, w21 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xd4ef3085
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x1d05 // Load lower half of constant 0x4881d05
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3])
movk x13, #0x488, lsl #16 // Load upper half of constant 0x4881d05
add w9, w9, w7 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0x4881d05
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xd039 // Load lower half of constant 0xd9d4d039
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6])
movk x10, #0xd9d4, lsl #16 // Load upper half of constant 0xd9d4d039
add w4, w4, w24 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xd9d4d039
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x99e5 // Load lower half of constant 0xe6db99e5
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9])
movk x10, #0xe6db, lsl #16 // Load upper half of constant 0xe6db99e5
add w17, w17, w11 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xe6db99e5
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x7cf8 // Load lower half of constant 0x1fa27cf8
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12])
movk x13, #0x1fa2, lsl #16 // Load upper half of constant 0x1fa27cf8
add w8, w8, w27 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x1fa27cf8
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x5665 // Load lower half of constant 0xc4ac5665
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15])
movk x13, #0xc4ac, lsl #16 // Load upper half of constant 0xc4ac5665
add w9, w9, w3 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xc4ac5665
add w9, w9, w6 // Add aux function result
ror w9, w9, #9 // Rotate left s=23 bits
movz x6, #0x2244 // Load lower half of constant 0xf4292244
movk x6, #0xf429, lsl #16 // Load upper half of constant 0xf4292244
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2])
add w4, w4, w15 // Add dest value
orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add constant 0xf4292244
eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add aux function result
ror w4, w4, #26 // Rotate left s=6 bits
movz x6, #0xff97 // Load lower half of constant 0x432aff97
movk x6, #0x432a, lsl #16 // Load upper half of constant 0x432aff97
add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0])
orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w23 // Add dest value
eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w6 // Add constant 0x432aff97
add w6, w17, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x17, #0x23a7 // Load lower half of constant 0xab9423a7
movk x17, #0xab94, lsl #16 // Load upper half of constant 0xab9423a7
add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7])
add w8, w8, w12 // Add dest value
orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add constant 0xab9423a7
eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x17, #0xa039 // Load lower half of constant 0xfc93a039
movk x17, #0xfc93, lsl #16 // Load upper half of constant 0xfc93a039
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14])
orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w22 // Add dest value
eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w17 // Add constant 0xfc93a039
add w17, w9, w13 // Add aux function result
ror w17, w17, #11 // Rotate left s=21 bits
movz x9, #0x59c3 // Load lower half of constant 0x655b59c3
movk x9, #0x655b, lsl #16 // Load upper half of constant 0x655b59c3
add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5])
add w4, w4, w11 // Add dest value
orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w4, w9 // Add constant 0x655b59c3
eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x4, #0xcc92 // Load lower half of constant 0x8f0ccc92
movk x4, #0x8f0c, lsl #16 // Load upper half of constant 0x8f0ccc92
add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12])
orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w21 // Add dest value
eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w6, w4 // Add constant 0x8f0ccc92
add w6, w4, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x4, #0xf47d // Load lower half of constant 0xffeff47d
movk x4, #0xffef, lsl #16 // Load upper half of constant 0xffeff47d
add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3])
add w8, w8, w16 // Add dest value
orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add constant 0xffeff47d
eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x4, #0x5dd1 // Load lower half of constant 0x85845dd1
movk x4, #0x8584, lsl #16 // Load upper half of constant 0x85845dd1
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10])
orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w17, w20 // Add dest value
eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w15, w4 // Add constant 0x85845dd1
add w4, w15, w17 // Add aux function result
ror w4, w4, #11 // Rotate left s=21 bits
movz x15, #0x7e4f // Load lower half of constant 0x6fa87e4f
movk x15, #0x6fa8, lsl #16 // Load upper half of constant 0x6fa87e4f
add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1])
add w4, w9, w5 // Add dest value
orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w15 // Add constant 0x6fa87e4f
eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w15, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x15, #0xe6e0 // Load lower half of constant 0xfe2ce6e0
movk x15, #0xfe2c, lsl #16 // Load upper half of constant 0xfe2ce6e0
add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8])
orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w27 // Add dest value
eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w6, w15 // Add constant 0xfe2ce6e0
add w6, w15, w9 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x9, #0x4314 // Load lower half of constant 0xa3014314
movk x9, #0xa301, lsl #16 // Load upper half of constant 0xa3014314
add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15])
add w6, w8, w7 // Add dest value
orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w6, w9 // Add constant 0xa3014314
eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w8, w9 // Add aux function result
ror w6, w6, #17 // Rotate left s=15 bits
movz x7, #0x11a1 // Load lower half of constant 0x4e0811a1
movk x7, #0x4e08, lsl #16 // Load upper half of constant 0x4e0811a1
add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6])
orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w17, w26 // Add dest value
eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w6, w7 // Add constant 0x4e0811a1
add w7, w9, w17 // Add aux function result
ror w7, w7, #11 // Rotate left s=21 bits
movz x6, #0x7e82 // Load lower half of constant 0xf7537e82
movk x6, #0xf753, lsl #16 // Load upper half of constant 0xf7537e82
add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13])
add w17, w4, w14 // Add dest value
orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w14, w17, w6 // Add constant 0xf7537e82
eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w14, w4 // Add aux function result
ror w17, w17, #26 // Rotate left s=6 bits
movz x6, #0xf235 // Load lower half of constant 0xbd3af235
movk x6, #0xbd3a, lsl #16 // Load upper half of constant 0xbd3af235
add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4])
orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w15, w25 // Add dest value
eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w6 // Add constant 0xbd3af235
add w16, w15, w17 // Add aux function result
ror w16, w16, #22 // Rotate left s=10 bits
movz x14, #0xd2bb // Load lower half of constant 0x2ad7d2bb
movk x14, #0x2ad7, lsl #16 // Load upper half of constant 0x2ad7d2bb
add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11])
add w6, w8, w3 // Add dest value
orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w6, w14 // Add constant 0x2ad7d2bb
eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w17, w16 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x3, #0xd391 // Load lower half of constant 0xeb86d391
movk x3, #0xeb86, lsl #16 // Load upper half of constant 0xeb86d391
add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2])
orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w9, w24 // Add dest value
eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w16, w15, w3 // Add constant 0xeb86d391
add w8, w16, w17 // Add aux function result
ror w8, w8, #11 // Rotate left s=21 bits
ldp w6, w15, [x0] // Reload MD5 state->A and state->B
ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D
add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9])
add w13, w4, w9 // Add result of MD5 rounds to state->D
add w12, w14, w5 // Add result of MD5 rounds to state->C
add w10, w7, w6 // Add result of MD5 rounds to state->A
add w11, w3, w15 // Add result of MD5 rounds to state->B
stp w12, w13, [x0, #8] // Store MD5 states C,D
stp w10, w11, [x0] // Store MD5 states A,B
add x1, x1, #64 // Increment data pointer
subs w2, w2, #1 // Decrement block counter
b.ne Lmd5_blocks_loop
ldp x21,x22,[sp,#16]
.cfi_restore x21
.cfi_restore x22
ldp x23,x24,[sp,#32]
.cfi_restore x23
.cfi_restore x24
ldp x25,x26,[sp,#48]
.cfi_restore x25
.cfi_restore x26
ldp x27,x28,[sp,#64]
.cfi_restore x27
.cfi_restore x28
ldp x19,x20,[sp],#80
.cfi_restore x19
.cfi_restore x20
.cfi_def_cfa_offset 0
ret
.cfi_endproc
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,309 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include "openssl/arm_arch.h"
.text
//-----------------------------------------------------------------------------
// beeu_mod_inverse_vartime
//
// Binary (right-shift) extended Euclidean algorithm computing a modular
// inverse: on success, out = a^(-1) mod n (see the invariants kept in the
// loop comments below).  Variable-time: execution depends on the bit
// pattern of the inputs, so this must only be used on public values.
//
// NOTE(review): C prototype inferred from register use in this body
// (x0 = out[4], x1 = a[4], x2 = n[4], 256-bit little-endian limb arrays;
// returns 1 in x0 on success, 0 on failure) — confirm against the C
// declaration in the BoringSSL headers.
//
// Register roles inside the loop:
//   x25..x28 = B (b0..b3)        x21..x24 = A (a0..a3)
//   x3..x7   = X (x0..x4)        x8..x12  = Y (y0..y4)
//   x0,x1,x2,x30 = n0..n3        x13..x15,x19,x20 = scratch
// x30 (LR) is repurposed to hold n3; the real return address is saved in
// the frame below and restored in the epilogue before ret.
//-----------------------------------------------------------------------------
.globl beeu_mod_inverse_vartime
.align 4
beeu_mod_inverse_vartime:
// Reserve enough space for 14 8-byte registers on the stack
// in the first stp call for x29, x30.
// Then store the remaining callee-saved registers.
//
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
// ^ ^
// sp <------------------- 112 bytes ----------------> old sp
// x29 (FP)
//
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-112]!
add x29,sp,#0
// Save all callee-saved registers used below (x19-x28) plus the out
// pointer (x0) and n pointer (x2), which are overwritten by the loop.
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x2,[sp,#96]
// B = b3..b0 := a
ldp x25,x26,[x1]
ldp x27,x28,[x1,#16]
// n3..n0 := n
// Note: the value of input params are changed in the following.
ldp x0,x1,[x2]
// n3 lands in x30 (the saved LR), freeing a register for the main loop.
ldp x2,x30,[x2,#16]
// A = a3..a0 := n
mov x21, x0
mov x22, x1
mov x23, x2
mov x24, x30
// X = x4..x0 := 1
mov x3, #1
eor x4, x4, x4
eor x5, x5, x5
eor x6, x6, x6
eor x7, x7, x7
// Y = y4..y0 := 0
eor x8, x8, x8
eor x9, x9, x9
eor x10, x10, x10
eor x11, x11, x11
eor x12, x12, x12
Lbeeu_loop:
// if B == 0, jump to .Lbeeu_loop_end
orr x14, x25, x26
orr x14, x14, x27
// reverse the bit order of x25. This is needed for clz after this macro
rbit x15, x25
orr x14, x14, x28
cbz x14,Lbeeu_loop_end
// 0 < B < |n|,
// 0 < A <= |n|,
// (1) X*a == B (mod |n|),
// (2) (-1)*Y*a == A (mod |n|)
// Now divide B by the maximum possible power of two in the
// integers, and divide X by the same value mod |n|.
// When we're done, (1) still holds.
// shift := number of trailing 0s in x25
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
clz x13, x15
// If there is no shift, goto shift_A_Y
cbz x13, Lbeeu_shift_A_Y
// Shift B right by "x13" bits
// neg gives the complementary left-shift count for carrying bits down
// between adjacent limbs (x15/x19/x20 hold the carried-in bits).
neg x14, x13
lsr x25, x25, x13
lsl x15, x26, x14
lsr x26, x26, x13
lsl x19, x27, x14
orr x25, x25, x15
lsr x27, x27, x13
lsl x20, x28, x14
orr x26, x26, x19
lsr x28, x28, x13
orr x27, x27, x20
// Shift X right by "x13" bits, adding n whenever X becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
Lbeeu_shift_loop_X:
// If X is even (bit 0 clear), skip the +n adjustment and just halve it.
tbz x3, #0, Lshift1_0
adds x3, x3, x0
adcs x4, x4, x1
adcs x5, x5, x2
adcs x6, x6, x30
adc x7, x7, x14
Lshift1_0:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x3, x4, x3, #1
extr x4, x5, x4, #1
extr x5, x6, x5, #1
extr x6, x7, x6, #1
lsr x7, x7, #1
subs x13, x13, #1
bne Lbeeu_shift_loop_X
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
// with the following differences:
// - "x13" is set directly to the number of trailing 0s in B
// (using rbit and clz instructions)
// - The loop is only used to call SHIFT1(X)
// and x13 is decreased while executing the X loop.
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
Lbeeu_shift_A_Y:
// Same for A and Y.
// Afterwards, (2) still holds.
// Reverse the bit order of x21
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
rbit x15, x21
clz x13, x15
// If there is no shift, goto |B-A|, X+Y update
cbz x13, Lbeeu_update_B_X_or_A_Y
// Shift A right by "x13" bits
neg x14, x13
lsr x21, x21, x13
lsl x15, x22, x14
lsr x22, x22, x13
lsl x19, x23, x14
orr x21, x21, x15
lsr x23, x23, x13
lsl x20, x24, x14
orr x22, x22, x19
lsr x24, x24, x13
orr x23, x23, x20
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
Lbeeu_shift_loop_Y:
// If Y is even (bit 0 clear), skip the +n adjustment and just halve it.
tbz x8, #0, Lshift1_1
adds x8, x8, x0
adcs x9, x9, x1
adcs x10, x10, x2
adcs x11, x11, x30
adc x12, x12, x14
Lshift1_1:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x8, x9, x8, #1
extr x9, x10, x9, #1
extr x10, x11, x10, #1
extr x11, x12, x11, #1
lsr x12, x12, #1
subs x13, x13, #1
bne Lbeeu_shift_loop_Y
Lbeeu_update_B_X_or_A_Y:
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
// without taking a sign bit if generated. The lack of a carry would
// indicate a negative result. See, for example,
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
subs x14, x25, x21
sbcs x15, x26, x22
sbcs x19, x27, x23
sbcs x20, x28, x24
bcs Lbeeu_B_greater_than_A
// Else A > B =>
// A := A - B; Y := Y + X; goto beginning of the loop
subs x21, x21, x25
sbcs x22, x22, x26
sbcs x23, x23, x27
sbcs x24, x24, x28
adds x8, x8, x3
adcs x9, x9, x4
adcs x10, x10, x5
adcs x11, x11, x6
adc x12, x12, x7
b Lbeeu_loop
Lbeeu_B_greater_than_A:
// Continue with B > A =>
// B := B - A; X := X + Y; goto beginning of the loop
// T (the B - A computed above) is already in x14,x15,x19,x20.
mov x25, x14
mov x26, x15
mov x27, x19
mov x28, x20
adds x3, x3, x8
adcs x4, x4, x9
adcs x5, x5, x10
adcs x6, x6, x11
adc x7, x7, x12
b Lbeeu_loop
Lbeeu_loop_end:
// The Euclid's algorithm loop ends when A == gcd(a,n);
// this would be 1, when a and n are co-prime (i.e. do not have a common factor).
// Since (-1)*Y*a == A (mod |n|), Y>0
// then out = -Y mod n
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
// Is A-1 == 0?
// If not, fail.
sub x14, x21, #1
orr x14, x14, x22
orr x14, x14, x23
orr x14, x14, x24
cbnz x14, Lbeeu_err
// If Y>n ==> Y:=Y-n
Lbeeu_reduction_loop:
// x_i := y_i - n_i (X is no longer needed, use it as temp)
// (x14 = 0 from above)
subs x3, x8, x0
sbcs x4, x9, x1
sbcs x5, x10, x2
sbcs x6, x11, x30
sbcs x7, x12, x14
// If result is non-negative (i.e., cs = carry set = no borrow),
// y_i := x_i; goto reduce again
// else
// y_i := y_i; continue
csel x8, x3, x8, cs
csel x9, x4, x9, cs
csel x10, x5, x10, cs
csel x11, x6, x11, cs
csel x12, x7, x12, cs
bcs Lbeeu_reduction_loop
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
// out = -Y = n-Y
subs x8, x0, x8
sbcs x9, x1, x9
sbcs x10, x2, x10
sbcs x11, x30, x11
// Save Y in output (out (x0) was saved on the stack)
ldr x3, [sp,#96]
stp x8, x9, [x3]
stp x10, x11, [x3,#16]
// return 1 (success)
mov x0, #1
b Lbeeu_finish
Lbeeu_err:
// return 0 (error)
eor x0, x0, x0
Lbeeu_finish:
// Restore callee-saved registers, except x0, x2
add sp,x29,#0
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
// Restores the real x30 (LR) saved in the prologue, replacing the n3 copy.
ldp x29,x30,[sp],#112
AARCH64_VALIDATE_LINK_REGISTER
ret
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,39 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.arch armv8-a
.text
# int CRYPTO_rndr_multiple8(uint8_t *out, const size_t len)
//-----------------------------------------------------------------------------
// int CRYPTO_rndr_multiple8(uint8_t *out, const size_t len)
//
// Fills |out| with |len| bytes from the Armv8.5-A RNDR random-number
// system register, 8 bytes per read.
// In:    x0 = out, x1 = len (must be a non-zero multiple of 8)
// Out:   x0 = 1 on success, 0 on failure (len == 0 or RNDR read failed)
// Clobb: x1, x2, flags
//
// NOTE(review): if len is NOT a multiple of 8, the repeated
// 'sub x1, x1, #8' never reaches zero, x1 wraps around and the loop
// overruns |out| — the "multiple8" contract must be enforced by callers.
// NOTE(review): failure is detected by a zero value in x2; the
// architectural failure signal for RNDR is PSTATE.Z with a zero result,
// so a genuinely random all-zero word (p = 2^-64) is also treated as an
// error here — presumably intentional; confirm against the generator.
//-----------------------------------------------------------------------------
.globl CRYPTO_rndr_multiple8
.def CRYPTO_rndr_multiple8
.type 32
.endef
.align 4
CRYPTO_rndr_multiple8:
cbz x1, Lrndr_multiple8_error // len = 0 is not supported
Lrndr_multiple8_loop:
// s3_3_c2_c4_0 is the raw encoding of RNDR (named form needs newer tools).
mrs x2, s3_3_c2_c4_0 // rndr instruction https://developer.arm.com/documentation/ddi0601/2024-09/Index-by-Encoding
cbz x2, Lrndr_multiple8_error // Check if rndr failed
str x2, [x0], #8 // Copy 8 bytes to *out and increment pointer by 8
sub x1, x1, #8
cbz x1, Lrndr_multiple8_done // If multiple of 8 this will be 0 eventually
b Lrndr_multiple8_loop
Lrndr_multiple8_done:
mov x0, #1 // Return value success
ret
Lrndr_multiple8_error:
mov x0, #0 // Return value error
ret
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff