chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,867 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
// Round-constant and permutation table used by the AES key-schedule
// routines below (loaded via adrp/:lo12: as Lrcon).
.section .rodata
.align 5
Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl aes_hw_set_encrypt_key
.def aes_hw_set_encrypt_key
.type 32
.endef
.align 5
// int aes_hw_set_encrypt_key(user_key=x0, bits=w1, key=x2)
// Expands a 128/192/256-bit AES key into the round-key schedule at x2
// using the ARMv8 AES instructions, and stores the round count
// (10/12/14) at the end of the schedule. Returns in x0: 0 on success,
// -1 if either pointer argument is NULL, -2 if w1 is not a supported
// bit length.
aes_hw_set_encrypt_key:
.cfi_startproc
Lenc_key:
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#3] // kFlag_aes_hw_set_encrypt_key
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
// x3 holds the prospective return code while arguments are validated.
mov x3,#-1
cmp x0,#0
b.eq Lenc_key_abort
cmp x2,#0
b.eq Lenc_key_abort
mov x3,#-2
cmp w1,#128
b.lt Lenc_key_abort
cmp w1,#256
b.gt Lenc_key_abort
tst w1,#0x3f // bits must be a multiple of 64 within [128,256]
b.ne Lenc_key_abort
// Load the rcon/rotate tables and the first 16 key bytes, then
// dispatch on key size (w1 < 192 -> 128, == 192, else 256).
adrp x3,Lrcon
add x3,x3,:lo12:Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt Loop128
b.eq L192
b L256
.align 4
// 128-bit key: v2 holds the rotate-and-splat permutation, v1 the
// current round constant (doubled by shl each iteration).
Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b // SubBytes via aese with zero round key
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10 // 10 rounds for AES-128
b Ldone
.align 4
// 192-bit key: key material advances in 24-byte steps.
L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne Loop192
mov w12,#12 // 12 rounds for AES-192
add x2,x2,#0x20
b Ldone
.align 4
// 256-bit key: alternate halves of the key are expanded per iteration.
L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14 // 14 rounds for AES-256
st1 {v3.4s},[x2],#16
Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b Loop256
Ldone:
str w12,[x2] // store round count after the schedule
mov x3,#0 // success
Lenc_key_abort:
mov x0,x3 // return value
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.globl aes_hw_set_decrypt_key
.def aes_hw_set_decrypt_key
.type 32
.endef
.align 5
// int aes_hw_set_decrypt_key(user_key=x0, bits=w1, key=x2)
// Builds the decryption key schedule: expands the encryption schedule
// via Lenc_key, then reverses the round-key order and applies aesimc
// (InvMixColumns) to the inner round keys. Returns 0 on success in x0,
// otherwise the error code propagated from Lenc_key.
aes_hw_set_decrypt_key:
.cfi_startproc
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
bl Lenc_key
cmp x0,#0
b.ne Ldec_key_abort
sub x2,x2,#240 // restore original x2
mov x4,#-16 // x0 walks backwards in 16-byte steps
add x0,x2,x12,lsl#4 // end of key schedule
// Swap the outermost pair of round keys; these are not transformed.
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
// Walk inward from both ends, swapping round keys and applying
// InvMixColumns to each.
Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi Loop_imc
// Middle round key: transform in place.
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
Ldec_key_abort:
ldp x29,x30,[sp],#16
.cfi_restore x29
.cfi_restore x30
.cfi_def_cfa_offset 0
AARCH64_VALIDATE_LINK_REGISTER
ret
.cfi_endproc
.globl aes_hw_encrypt
.def aes_hw_encrypt
.type 32
.endef
.align 5
// void aes_hw_encrypt(in=x0, out=x1, key=x2)
// Encrypts a single 16-byte block. w3 is loaded with the round count
// from key[240]; the loop consumes two round keys per iteration.
aes_hw_encrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240] // round count
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_enc:
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_enc
// Final two rounds: the last aese is not followed by aesmc; the last
// round key is applied with a plain eor.
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.globl aes_hw_decrypt
.def aes_hw_decrypt
.type 32
.endef
.align 5
// void aes_hw_decrypt(in=x0, out=x1, key=x2)
// Decrypts a single 16-byte block with the (InvMixColumns-transformed)
// decryption key schedule; mirror image of aes_hw_encrypt using
// aesd/aesimc.
aes_hw_decrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
// NOTE(review): stores to the same flag byte (#1) as aes_hw_encrypt;
// confirm against the kFlag_* indices declared for
// BORINGSSL_function_hit in BoringSSL's aes headers.
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240] // round count
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_dec:
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_dec
// Final two rounds: last aesd has no aesimc; last round key applied
// with a plain eor.
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.globl aes_hw_cbc_encrypt
.def aes_hw_cbc_encrypt
.type 32
.endef
.align 5
// void aes_hw_cbc_encrypt(in=x0, out=x1, len=x2, key=x3, ivec=x4, enc=w5)
// CBC-mode en/decryption; len is truncated to a multiple of 16 bytes.
// The IV is read from x4 and the final chaining value is written back
// there. w5 non-zero selects encryption, zero selects decryption.
aes_hw_cbc_encrypt:
.cfi_startproc
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
subs x2,x2,#16
mov x8,#16 // x8 = input step; zeroed for the last block
b.lo Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4] // v6 = IV / chaining value
ld1 {v0.16b},[x0],x8
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7] // v7 = last round key
add x7,x3,#32
mov w6,w5
b.eq Lcbc_dec
cmp w5,#2
// Encryption path: chain the IV into the first block; v5 pre-combines
// rndkey[0] with the last round key so the next input can be folded in.
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq Lcbc_enc128
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b Lenter_cbc_enc
.align 4
Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq Lcbc_enc192
// AES-256 only: two extra rounds before joining the common tail.
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
// AES-128 encryption: all round keys stay resident in registers.
Lcbc_enc128:
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b Lenter_cbc_enc128
Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
// Decryption path: interleaves three blocks per iteration.
Lcbc_dec:
ld1 {v18.16b},[x0],#16
subs x2,x2,#32 // bias
add w6,w5,#2
orr v3.16b,v0.16b,v0.16b
orr v1.16b,v0.16b,v0.16b
orr v19.16b,v18.16b,v18.16b
b.lo Lcbc_dec_tail
orr v1.16b,v18.16b,v18.16b
ld1 {v18.16b},[x0],#16
orr v2.16b,v0.16b,v0.16b
orr v3.16b,v1.16b,v1.16b
orr v19.16b,v18.16b,v18.16b
Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b // fold chaining values with last key
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such way that
// at exit from the loop v1.16b-v18.16b
// are loaded with last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs Loop3x_cbc_dec
cmn x2,#0x30
b.eq Lcbc_done
nop
// Tail: one or two remaining blocks, decrypted in v1 and v18.
Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Lcbc_dec_tail
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b Lcbc_done
Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
Lcbc_done:
st1 {v6.16b},[x4] // write back final chaining value
Lcbc_abort:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.globl aes_hw_ctr32_encrypt_blocks
.def aes_hw_ctr32_encrypt_blocks
.type 32
.endef
.align 5
// void aes_hw_ctr32_encrypt_blocks(in=x0, out=x1, blocks=x2, key=x3, ivec=x4)
// CTR-mode encryption of x2 full 16-byte blocks. The 32-bit big-endian
// counter occupies the last word of the IV ([x4,#12]); the main loop
// processes three counter blocks per iteration.
aes_hw_ctr32_encrypt_blocks:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9] // kFlag_aes_hw_ctr32_encrypt_blocks
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
ldr w5,[x3,#240]
ldr w8, [x4, #12] // w8 = counter word of the IV
ld1 {v0.4s},[x4]
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7] // v7 = last round key
add x7,x3,#32
mov w6,w5
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips these bugs.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __AARCH64EB__
rev w8, w8 // counter is big-endian in memory
#endif
add w10, w8, #1
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
orr v18.16b,v6.16b,v6.16b
b Loop3x_ctr32
.align 4
// Main loop: v0/v1/v18 hold three consecutive counter blocks.
Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_ctr32
// Final rounds, interleaved with loading the next three input blocks
// and preparing the next three counter values.
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs Loop3x_ctr32
adds x2,x2,#3
b.eq Lctr32_done
// Tail: one or two remaining blocks, keystreamed in v0 and v1.
Lctr32_tail:
cmp x2,#1
b.lt Lctr32_done // if len = 0, go to done
mov x12,#16
csel x12,xzr,x12,eq
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
cbz x12,Lctr32_done // if step = 0 (len = 1), go to done
st1 {v3.16b},[x1]
Lctr32_done:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,93 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
// Adds num 64-bit words of ap and bp into rp; returns the final carry
// (0 or 1) in x0. The loop counter is maintained with sub/cbnz, which
// do not touch the flags, so the adcs carry chain is preserved across
// iterations.
.globl bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, Ladd_tail
Ladd_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
adcs x4, x4, x6
adcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, Ladd_loop
Ladd_tail:
# Handle the odd trailing word, if any.
cbz x3, Ladd_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
adcs x4, x4, x6
str x4, [x0], #8
Ladd_exit:
cset x0, cs // return the carry out
ret
.cfi_endproc
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
// Subtracts num 64-bit words of bp from ap into rp; returns 1 in x0 if
// the subtraction borrowed, else 0. As above, the loop counter uses
// sub/cbnz so the sbcs borrow chain is preserved across iterations.
.globl bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations
# at a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, Lsub_tail
Lsub_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
sbcs x4, x4, x6
sbcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, Lsub_loop
Lsub_tail:
# Handle the odd trailing word, if any.
cbz x3, Lsub_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
sbcs x4, x4, x6
str x4, [x0], #8
Lsub_exit:
cset x0, cc // C clear = a borrow occurred
ret
.cfi_endproc
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,341 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
.globl gcm_init_neon
.def gcm_init_neon
.type 32
.endef
.align 4
// void gcm_init_neon(Htable=x0, H=x1)
// Precomputes the "twisted" GHASH key: H shifted left by one bit and
// conditionally folded with the reduction constant 0xc2...01, stored
// as Htable[0].
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
shl v19.2d, v19.2d, #57 // 0xc2.0
ext v3.16b, v17.16b, v17.16b, #8
ushr v18.2d, v19.2d, #63
dup v17.4s, v17.s[1]
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
ushr v18.2d, v3.2d, #63
sshr v17.4s, v17.4s, #31 // broadcast carry bit
and v18.16b, v18.16b, v16.16b
shl v3.2d, v3.2d, #1
ext v18.16b, v18.16b, v18.16b, #8
and v16.16b, v16.16b, v17.16b
orr v3.16b, v3.16b, v18.16b // H<<<=1
eor v5.16b, v3.16b, v16.16b // twisted H
st1 {v5.2d}, [x0] // store Htable[0]
ret
.globl gcm_gmult_neon
.def gcm_gmult_neon
.type 32
.endef
.align 4
// void gcm_gmult_neon(Xi=x0, Htable=x1)
// Multiplies the 128-bit accumulator Xi by the twisted H, in place.
// Sets x3 = 16 and tail-branches into the shared Lgmult_neon body
// inside gcm_ghash_neon below, so exactly one iteration runs.
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks // load constants
add x9, x9, :lo12:Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v3.16b, v3.16b // byteswap Xi
ext v3.16b, v3.16b, v3.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
mov x3, #16
b Lgmult_neon
.globl gcm_ghash_neon
.def gcm_ghash_neon
.type 32
.endef
.align 4
// void gcm_ghash_neon(Xi=x0, Htable=x1, inp=x2, len=x3)
// Folds len bytes (multiple of 16) of inp into the GHASH accumulator
// Xi. Uses only baseline NEON 8-bit polynomial multiplies (pmull .8b)
// plus Karatsuba, i.e. no PMULL/crypto extension needed.
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks // load constants
add x9, x9, :lo12:Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v0.16b, v0.16b // byteswap Xi
ext v0.16b, v0.16b, v0.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
// Main loop: absorb one 16-byte input block per iteration.
Loop_neon:
ld1 {v3.16b}, [x2], #16 // load inp
rev64 v3.16b, v3.16b // byteswap inp
ext v3.16b, v3.16b, v3.16b, #8
eor v3.16b, v3.16b, v0.16b // inp ^= Xi
// Entry point shared with gcm_gmult_neon (x3 = remaining byte count).
Lgmult_neon:
// Split the input into v3 and v4. (The upper halves are unused,
// so it is okay to leave them alone.)
ins v4.d[0], v3.d[1]
ext v16.8b, v5.8b, v5.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v0.8b, v3.8b, v3.8b, #1 // B1
pmull v0.8h, v5.8b, v0.8b // E = A*B1
ext v17.8b, v5.8b, v5.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v5.8b, v19.8b // G = A*B2
ext v18.8b, v5.8b, v5.8b, #3 // A3
eor v16.16b, v16.16b, v0.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v0.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v0.8h, v5.8b, v0.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v0.16b // N = I + J
pmull v19.8h, v5.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v0.8h, v5.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v0.16b, v0.16b, v16.16b
eor v0.16b, v0.16b, v18.16b
// Middle Karatsuba term: (lo^hi of input) times (lo^hi of H), in v7.
eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
ext v16.8b, v7.8b, v7.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v1.8b, v3.8b, v3.8b, #1 // B1
pmull v1.8h, v7.8b, v1.8b // E = A*B1
ext v17.8b, v7.8b, v7.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v7.8b, v19.8b // G = A*B2
ext v18.8b, v7.8b, v7.8b, #3 // A3
eor v16.16b, v16.16b, v1.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v1.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v1.8h, v7.8b, v1.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v1.16b // N = I + J
pmull v19.8h, v7.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v1.8h, v7.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v1.16b, v1.16b, v16.16b
eor v1.16b, v1.16b, v18.16b
// High Karatsuba term: input high half (v4) times H high half (v6).
ext v16.8b, v6.8b, v6.8b, #1 // A1
pmull v16.8h, v16.8b, v4.8b // F = A1*B
ext v2.8b, v4.8b, v4.8b, #1 // B1
pmull v2.8h, v6.8b, v2.8b // E = A*B1
ext v17.8b, v6.8b, v6.8b, #2 // A2
pmull v17.8h, v17.8b, v4.8b // H = A2*B
ext v19.8b, v4.8b, v4.8b, #2 // B2
pmull v19.8h, v6.8b, v19.8b // G = A*B2
ext v18.8b, v6.8b, v6.8b, #3 // A3
eor v16.16b, v16.16b, v2.16b // L = E + F
pmull v18.8h, v18.8b, v4.8b // J = A3*B
ext v2.8b, v4.8b, v4.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v2.8h, v6.8b, v2.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v4.8b, v4.8b, #4 // B4
eor v18.16b, v18.16b, v2.16b // N = I + J
pmull v19.8h, v6.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v2.8h, v6.8b, v4.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v2.16b, v2.16b, v16.16b
eor v2.16b, v2.16b, v18.16b
// Karatsuba recombination of the three partial products.
ext v16.16b, v0.16b, v2.16b, #8
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
eor v1.16b, v1.16b, v2.16b
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins v2.d[0], v1.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl v17.2d, v0.2d, #57 // 1st phase
shl v18.2d, v0.2d, #62
eor v18.16b, v18.16b, v17.16b //
shl v17.2d, v0.2d, #63
eor v18.16b, v18.16b, v17.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor v18.16b, v18.16b, v1.16b
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
ushr v18.2d, v0.2d, #1 // 2nd phase
eor v2.16b, v2.16b,v0.16b
eor v0.16b, v0.16b,v18.16b //
ushr v18.2d, v18.2d, #6
ushr v0.2d, v0.2d, #1 //
eor v0.16b, v0.16b, v2.16b //
eor v0.16b, v0.16b, v18.16b //
subs x3, x3, #16
bne Loop_neon
rev64 v0.16b, v0.16b // byteswap Xi and write
ext v0.16b, v0.16b, v0.16b, #8
st1 {v0.16b}, [x0]
ret
.section .rodata
.align 4
// Bit masks consumed by the Karatsuba split in the NEON GHASH code
// above (loaded into v24/v25 via Lmasks).
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
// ASCII: "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>"
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,673 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.globl gcm_init_v8
.def gcm_init_v8
.type 32
.endef
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
ext v20.16b, v20.16b, v20.16b, #8
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull2 v0.1q,v20.2d,v20.2d
eor v16.16b,v16.16b,v20.16b
pmull v2.1q,v20.1d,v20.1d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v17.16b,v0.16b,v18.16b
ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d},[x0],#16 //store Htable[1..2]
st1 {v22.2d},[x0],#16 //store Htable[1..2]
//calculate H^3 and H^4
pmull2 v0.1q,v20.2d, v22.2d
pmull2 v5.1q,v22.2d,v22.2d
pmull v2.1q,v20.1d, v22.1d
pmull v7.1q,v22.1d,v22.1d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b, v0.16b,v18.16b //H^3
eor v17.16b, v5.16b,v4.16b //H^4
ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v25.16b,v17.16b,v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v23.16b
eor v17.16b,v17.16b,v25.16b
eor v18.16b,v18.16b,v22.16b
ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
//calculate H^5 and H^6
pmull2 v0.1q,v22.2d, v23.2d
pmull2 v5.1q,v23.2d,v23.2d
pmull v2.1q,v22.1d, v23.1d
pmull v7.1q,v23.1d,v23.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v16.1d,v16.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^5
eor v17.16b,v5.16b,v4.16b //H^6
ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing
ext v28.16b, v17.16b, v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v26.16b
eor v17.16b,v17.16b,v28.16b
eor v18.16b,v18.16b,v22.16b
ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
//calculate H^7 and H^8
pmull2 v0.1q,v22.2d,v26.2d
pmull2 v5.1q,v22.2d,v28.2d
pmull v2.1q,v22.1d,v26.1d
pmull v7.1q,v22.1d,v28.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v17.1d,v18.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^7
eor v17.16b,v5.16b,v4.16b //H^8
ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v31.16b,v17.16b,v17.16b,#8
eor v16.16b,v16.16b,v29.16b
eor v17.16b,v17.16b,v31.16b
ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11]
ret
//-----------------------------------------------------------------------
// gcm_gmult_v8 -- multiply the GHASH accumulator Xi by the hash key H
// once in GF(2^128), using the Armv8 Crypto Extension polynomial
// multiplier (pmull/pmull2) with a Karatsuba 3-multiply scheme.
//
// In:   x0 = Xi (16-byte accumulator, updated in place)
//       x1 = Htable: twisted H in the first 16 bytes, followed by the
//            pre-computed (H.lo^H.hi) halves used for the Karatsuba
//            middle product (as stored by the init code in this file)
// Out:  Xi at [x0] overwritten with Xi*H reduced mod the GHASH poly
// Clobbers: v0-v3, v17-v21, no general-purpose registers
//-----------------------------------------------------------------------
.globl gcm_gmult_v8
.def gcm_gmult_v8
.type 32
.endef
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
ext v20.16b,v20.16b,v20.16b,#8
shl v19.2d,v19.2d,#57 //compose 0xc2.0 reduction constant
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b //byte-swap Xi on little-endian
#endif
ext v3.16b,v17.16b,v17.16b,#8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b //middle 128 bits of the 256-bit product
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b //back to memory byte order
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
//-----------------------------------------------------------------------
// gcm_ghash_v8 -- fold len bytes of input into the GHASH accumulator:
// for each 16-byte block I, Xi = (Xi ^ I) * H, all in GF(2^128).
//
// In:   x0 = Xi (16-byte accumulator, updated in place)
//       x1 = Htable (twisted H, (H.lo^H.hi), twisted H^2)
//       x2 = inp (input blocks)
//       x3 = len in bytes; assumed a multiple of 16, per the
//            block-wise loads below -- confirm against callers
//
// For len >= 64 this tail-calls the 4-blocks-per-iteration version;
// otherwise it runs a modulo-scheduled loop processing two blocks per
// iteration (using H^2), with a one-block odd tail.
// Clobbers: v0-v7, v16-v22, x12, flags.
//-----------------------------------------------------------------------
.globl gcm_ghash_v8
.def gcm_ghash_v8
.type 32
.endef
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
cmp x3,#64
b.hs Lgcm_ghash_v8_4x //large input: use 4-block-stride code
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
ext v22.16b,v22.16b,v22.16b,#8
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b Loop_mod2x_v8
.align 4
//Two blocks per iteration: (Xi^I[i])·H^2 ^ I[i+1]·H, sharing one
//reduction.  Loads for the next pair are interleaved with the math.
Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq Ldone_v8 //is x3 zero?
//One final block: classic single multiply by H plus reduction.
Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
Ldone_v8:
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b //back to memory byte order
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
//-----------------------------------------------------------------------
// gcm_ghash_v8_4x -- GHASH inner routine processing four 16-byte blocks
// per iteration using precomputed powers H..H^4, so that
//   Xi = (((Xi^I0)·H^4) ^ I1·H^3 ^ I2·H^2 ^ I3·H)
// is evaluated with a single reduction per four blocks.
//
// Local (not .globl) entry, reached from gcm_ghash_v8 when len >= 64.
// In:   x0 = Xi, x1 = Htable (twisted H, (H.lo^H.hi), H^2, then
//       H^3, (H^3.lo^H^3.hi), H^4), x2 = inp, x3 = len (bytes).
// Tail code after Loop4x handles a remainder of 1, 2 or 3 blocks.
// Clobbers: v0-v7, v16-v31, flags.
//-----------------------------------------------------------------------
.def gcm_ghash_v8_4x
.type 32
.endef
.align 4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
ext v22.16b,v22.16b,v22.16b,#8
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
ext v26.16b,v26.16b,v26.16b,#8
ext v28.16b,v28.16b,v28.16b,#8
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 //load first four blocks I[0..3]
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
//Pre-compute the partial products of I[1..3] with H..H^3; they are
//accumulated into v29/v30/v31 and folded in on the next iteration.
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#128 //four blocks consumed + four blocks look-ahead
b.lo Ltail4x
b Loop4x
.align 4
//Steady state: finish (Xi^I[i])·H^4, fold in the pending v29/v30/v31
//sums, reduce, and overlap the partial products for the next four
//blocks with the reduction of the current group.
Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#64
b.hs Loop4x
//Close out the last full group of four blocks, then dispatch on the
//number of leftover blocks (0..3).
Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
adds x3,x3,#64 //undo look-ahead subtraction
b.eq Ldone4x
cmp x3,#32
b.lo Lone
b.eq Ltwo
//Three leftover blocks: reduce current sum, then (Xi^I0)·H^3 ^ I1·H^2 ^ I2·H.
Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x
.align 4
//Two leftover blocks: reduce current sum, then (Xi^I0)·H^2 ^ I1·H.
Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b Ldone4x
.align 4
//One leftover block: reduce current sum, then (Xi^I0)·H.
Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v20.1d,v3.1d //H·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d
//Final Karatsuba recombination + reduction, then store Xi.
Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
ret
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,251 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.text
//Round-constant ("iota") table for the 24 rounds of Keccak-f[1600].
//The 256-byte alignment plus 64 bytes of zero padding place the 24
//8-byte constants (192 bytes) so the table ends exactly on the next
//256-byte boundary: the consuming loop below detects completion by
//testing the low byte of its post-incremented pointer (tst ...,#255)
//instead of keeping a round counter.
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
.quad 0,0,0,0,0,0,0,0 // padding so iotas_hw+24*8 is 256-aligned
iotas_hw:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
//-----------------------------------------------------------------------
// KeccakF1600_int -- inner Keccak-f[1600] permutation, entirely in
// general-purpose registers.  The 25 64-bit lanes live in
// x0-x17, x25, x19-x24 (note: x18, the platform register, is skipped;
// x25 takes its lane).  Scratch: x26-x28, x30.
//
// Calling convention is internal: the caller must have reserved at
// least 32 bytes at sp -- [sp,#0..15] is used to spill a lane pair
// during Theta, [sp,#16] holds the advancing iotas_hw pointer, and
// [sp,#24] keeps the return address (x30 doubles as scratch inside
// the loop).  Runs all 24 rounds; the end-of-table test relies on
// iotas_hw's 256-byte alignment trick (see the table above).
//-----------------------------------------------------------------------
.def KeccakF1600_int
.type 32
.endef
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr x28,iotas_hw
stp x28,x30,[sp,#16] // 32 bytes on top are mine
b Loop
.align 4
Loop:
////////////////////////////////////////// Theta
//Column parities C[x] accumulate in x26,x27,x28,x30,x4.
eor x26,x0,x5
stp x4,x9,[sp,#0] // offload pair...
eor x27,x1,x6
eor x28,x2,x7
eor x30,x3,x8
eor x4,x4,x9
eor x26,x26,x10
eor x27,x27,x11
eor x28,x28,x12
eor x30,x30,x13
eor x4,x4,x14
eor x26,x26,x15
eor x27,x27,x16
eor x28,x28,x17
eor x30,x30,x25
eor x4,x4,x19
eor x26,x26,x20
eor x28,x28,x22
eor x27,x27,x21
eor x30,x30,x23
eor x4,x4,x24
//D[x] = C[x-1] ^ rot(C[x+1],1), applied to each column.
eor x9,x26,x28,ror#63
eor x1,x1,x9
eor x6,x6,x9
eor x11,x11,x9
eor x16,x16,x9
eor x21,x21,x9
eor x9,x27,x30,ror#63
eor x28,x28,x4,ror#63
eor x30,x30,x26,ror#63
eor x4,x4,x27,ror#63
eor x27, x2,x9 // mov x27,x2
eor x7,x7,x9
eor x12,x12,x9
eor x17,x17,x9
eor x22,x22,x9
eor x0,x0,x4
eor x5,x5,x4
eor x10,x10,x4
eor x15,x15,x4
eor x20,x20,x4
ldp x4,x9,[sp,#0] // re-load offloaded data
eor x26, x3,x28 // mov x26,x3
eor x8,x8,x28
eor x13,x13,x28
eor x25,x25,x28
eor x23,x23,x28
eor x28, x4,x30 // mov x28,x4
eor x9,x9,x30
eor x14,x14,x30
eor x19,x19,x30
eor x24,x24,x30
////////////////////////////////////////// Rho+Pi
//Lane rotations fused with the Pi permutation; note the ror amounts
//are 64 minus the left-rotation offsets of the specification.
mov x30,x1
ror x1,x6,#20
//mov x27,x2
ror x2,x12,#21
//mov x26,x3
ror x3,x25,#43
//mov x28,x4
ror x4,x24,#50
ror x6,x9,#44
ror x12,x13,#39
ror x25,x17,#49
ror x24,x21,#62
ror x9,x22,#3
ror x13,x19,#56
ror x17,x11,#54
ror x21,x8,#9
ror x22,x14,#25
ror x19,x23,#8
ror x11,x7,#58
ror x8,x16,#19
ror x14,x20,#46
ror x23,x15,#23
ror x7,x10,#61
ror x16,x5,#28
ror x5,x26,#36
ror x10,x30,#63
ror x15,x28,#37
ror x20,x27,#2
////////////////////////////////////////// Chi+Iota
//A[x] ^= ~A[x+1] & A[x+2], row by row; the round constant load and
//end-of-table test are interleaved with the first row.
bic x26,x2,x1
bic x27,x3,x2
bic x28,x0,x4
bic x30,x1,x0
eor x0,x0,x26
bic x26,x4,x3
eor x1,x1,x27
ldr x27,[sp,#16]
eor x3,x3,x28
eor x4,x4,x30
eor x2,x2,x26
ldr x30,[x27],#8 // Iota[i++]
bic x26,x7,x6
tst x27,#255 // are we done?
str x27,[sp,#16]
bic x27,x8,x7
bic x28,x5,x9
eor x0,x0,x30 // A[0][0] ^= Iota
bic x30,x6,x5
eor x5,x5,x26
bic x26,x9,x8
eor x6,x6,x27
eor x8,x8,x28
eor x9,x9,x30
eor x7,x7,x26
bic x26,x12,x11
bic x27,x13,x12
bic x28,x10,x14
bic x30,x11,x10
eor x10,x10,x26
bic x26,x14,x13
eor x11,x11,x27
eor x13,x13,x28
eor x14,x14,x30
eor x12,x12,x26
bic x26,x17,x16
bic x27,x25,x17
bic x28,x15,x19
bic x30,x16,x15
eor x15,x15,x26
bic x26,x19,x25
eor x16,x16,x27
eor x25,x25,x28
eor x19,x19,x30
eor x17,x17,x26
bic x26,x22,x21
bic x27,x23,x22
bic x28,x20,x24
bic x30,x21,x20
eor x20,x20,x26
bic x26,x24,x23
eor x21,x21,x27
eor x23,x23,x28
eor x24,x24,x30
eor x22,x22,x26
bne Loop // more iotas => more rounds
ldr x30,[sp,#24] // restore return address
AARCH64_VALIDATE_LINK_REGISTER
ret
//-----------------------------------------------------------------------
// KeccakF1600_hw -- public entry: apply Keccak-f[1600] to a 25-lane
// (200-byte) state in memory.
//
// In:   x0 = pointer to the state A[25] (uint64 lanes, updated in place)
// Saves all callee-saved GPRs (x19-x28, x29/x30), loads the 25 lanes
// into registers (x0..x17, x25, x19..x24 -- matching the register
// assignment expected by KeccakF1600_int), reserves the 48-byte
// scratch area the inner routine uses, runs the permutation, and
// writes the lanes back.
//-----------------------------------------------------------------------
.globl KeccakF1600_hw
.def KeccakF1600_hw
.type 32
.endef
.align 5
KeccakF1600_hw:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48 // scratch for KeccakF1600_int
str x0,[sp,#32] // offload argument
mov x26,x0 // x0 itself becomes lane A[0][0]
ldp x0,x1,[x0,#16*0]
ldp x2,x3,[x26,#16*1]
ldp x4,x5,[x26,#16*2]
ldp x6,x7,[x26,#16*3]
ldp x8,x9,[x26,#16*4]
ldp x10,x11,[x26,#16*5]
ldp x12,x13,[x26,#16*6]
ldp x14,x15,[x26,#16*7]
ldp x16,x17,[x26,#16*8]
ldp x25,x19,[x26,#16*9] // x25, not x18: platform register avoided
ldp x20,x21,[x26,#16*10]
ldp x22,x23,[x26,#16*11]
ldr x24,[x26,#16*12]
bl KeccakF1600_int
ldr x26,[sp,#32] // recover state pointer
stp x0,x1,[x26,#16*0]
stp x2,x3,[x26,#16*1]
stp x4,x5,[x26,#16*2]
stp x6,x7,[x26,#16*3]
stp x8,x9,[x26,#16*4]
stp x10,x11,[x26,#16*5]
stp x12,x13,[x26,#16*6]
stp x14,x15,[x26,#16*7]
stp x16,x17,[x26,#16*8]
stp x25,x19,[x26,#16*9]
stp x20,x21,[x26,#16*10]
stp x22,x23,[x26,#16*11]
str x24,[x26,#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.byte 75,101,99,99,97,107,45,49,54,48,48,32,112,101,114,109,117,116,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,687 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
.text
.globl md5_block_asm_data_order
md5_block_asm_data_order:
.cfi_startproc
// Save all callee-saved registers
stp x19,x20,[sp,#-80]!
.cfi_def_cfa_offset 80
.cfi_offset x19, -80
.cfi_offset x20, -72
stp x21,x22,[sp,#16]
.cfi_offset x21, -64
.cfi_offset x22, -56
stp x23,x24,[sp,#32]
.cfi_offset x23, -48
.cfi_offset x24, -40
stp x25,x26,[sp,#48]
.cfi_offset x25, -32
.cfi_offset x26, -24
stp x27,x28,[sp,#64]
.cfi_offset x27, -16
.cfi_offset x28, -8
ldp w10, w11, [x0, #0] // Load MD5 state->A and state->B
ldp w12, w13, [x0, #8] // Load MD5 state->C and state->D
.align 5
Lmd5_blocks_loop:
eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
ldp x15, x3, [x1] // Load 4 words of input data0 M[0]/0
eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xa478 // Load lower half of constant 0xd76aa478
movk x9, #0xd76a, lsl #16 // Load upper half of constant 0xd76aa478
add w8, w10, w15 // Add dest value
add w7, w8, w9 // Add constant 0xd76aa478
add w6, w7, w14 // Add aux function result
ror w6, w6, #25 // Rotate left s=7 bits
eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xb756 // Load lower half of constant 0xe8c7b756
movk x16, #0xe8c7, lsl #16 // Load upper half of constant 0xe8c7b756
lsr x20, x15, #32 // Right shift high input value containing M[1]
add w9, w13, w20 // Add dest value
add w7, w9, w16 // Add constant 0xe8c7b756
add w14, w7, w17 // Add aux function result
ror w14, w14, #20 // Rotate left s=12 bits
eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0x70db // Load lower half of constant 0x242070db
movk x16, #0x2420, lsl #16 // Load upper half of constant 0x242070db
add w7, w12, w3 // Add dest value
add w17, w7, w16 // Add constant 0x242070db
add w14, w17, w9 // Add aux function result
ror w14, w14, #15 // Rotate left s=17 bits
eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xceee // Load lower half of constant 0xc1bdceee
movk x9, #0xc1bd, lsl #16 // Load upper half of constant 0xc1bdceee
lsr x21, x3, #32 // Right shift high input value containing M[3]
add w14, w11, w21 // Add dest value
add w6, w14, w9 // Add constant 0xc1bdceee
add w7, w6, w16 // Add aux function result
ror w7, w7, #10 // Rotate left s=22 bits
eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
ldp x14, x7, [x1, #16] // Load 4 words of input data0 M[4]/0w
and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xfaf // Load lower half of constant 0xf57c0faf
movk x16, #0xf57c, lsl #16 // Load upper half of constant 0xf57c0faf
add w17, w4, w14 // Add dest value
add w16, w17, w16 // Add constant 0xf57c0faf
add w4, w16, w6 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4])
and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0xc62a // Load lower half of constant 0x4787c62a
movk x4, #0x4787, lsl #16 // Load upper half of constant 0x4787c62a
lsr x22, x14, #32 // Right shift high input value containing M[5]
add w16, w5, w22 // Add dest value
add w16, w16, w4 // Add constant 0x4787c62a
add w5, w16, w6 // Add aux function result
ror w5, w5, #20 // Rotate left s=12 bits
eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5])
and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0x4613 // Load lower half of constant 0xa8304613
movk x4, #0xa830, lsl #16 // Load upper half of constant 0xa8304613
add w6, w8, w7 // Add dest value
add w8, w6, w4 // Add constant 0xa8304613
add w4, w8, w5 // Add aux function result
ror w4, w4, #15 // Rotate left s=17 bits
eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6])
and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x6, #0x9501 // Load lower half of constant 0xfd469501
movk x6, #0xfd46, lsl #16 // Load upper half of constant 0xfd469501
lsr x23, x7, #32 // Right shift high input value containing M[7]
add w9, w9, w23 // Add dest value
add w5, w9, w6 // Add constant 0xfd469501
add w9, w5, w4 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7])
ldp x5, x16, [x1, #32] // Load 4 words of input data0 M[8]/0
and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0x98d8 // Load lower half of constant 0x698098d8
movk x9, #0x6980, lsl #16 // Load upper half of constant 0x698098d8
add w17, w17, w5 // Add dest value
add w9, w17, w9 // Add constant 0x698098d8
add w17, w9, w6 // Add aux function result
ror w17, w17, #25 // Rotate left s=7 bits
eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8])
and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x17, #0xf7af // Load lower half of constant 0x8b44f7af
movk x17, #0x8b44, lsl #16 // Load upper half of constant 0x8b44f7af
lsr x24, x5, #32 // Right shift high input value containing M[9]
add w19, w19, w24 // Add dest value
add w17, w19, w17 // Add constant 0x8b44f7af
add w19, w17, w9 // Add aux function result
ror w19, w19, #20 // Rotate left s=12 bits
eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9])
and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0x5bb1 // Load lower half of constant 0xffff5bb1
movk x11, #0xffff, lsl #16 // Load upper half of constant 0xffff5bb1
add w8, w8, w16 // Add dest value
add w8, w8, w11 // Add constant 0xffff5bb1
add w8, w8, w9 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10])
and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0xd7be // Load lower half of constant 0x895cd7be
movk x11, #0x895c, lsl #16 // Load upper half of constant 0x895cd7be
lsr x25, x16, #32 // Right shift high input value containing M[11]
add w4, w4, w25 // Add dest value
add w4, w4, w11 // Add constant 0x895cd7be
add w9, w4, w9 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11])
ldp x11, x12, [x1, #48] // Load 4 words of input data0 M[12]/0
and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x1122 // Load lower half of constant 0x6b901122
movk x19, #0x6b90, lsl #16 // Load upper half of constant 0x6b901122
add w6, w6, w11 // Add dest value
add w6, w6, w19 // Add constant 0x6b901122
add w4, w6, w4 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12])
and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x7193 // Load lower half of constant 0xfd987193
movk x19, #0xfd98, lsl #16 // Load upper half of constant 0xfd987193
lsr x26, x11, #32 // Right shift high input value containing M[13]
add w17, w17, w26 // Add dest value
add w17, w17, w19 // Add constant 0xfd987193
add w17, w17, w6 // Add aux function result
ror w17, w17, #20 // Rotate left s=12 bits
eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13])
and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x438e // Load lower half of constant 0xa679438e
movk x13, #0xa679, lsl #16 // Load upper half of constant 0xa679438e
add w8, w8, w12 // Add dest value
add w8, w8, w13 // Add constant 0xa679438e
add w8, w8, w6 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14])
and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x821 // Load lower half of constant 0x49b40821
movk x13, #0x49b4, lsl #16 // Load upper half of constant 0x49b40821
lsr x27, x12, #32 // Right shift high input value containing M[15]
add w9, w9, w27 // Add dest value
add w9, w9, w13 // Add constant 0x49b40821
add w9, w9, w6 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
movz x13, #0x2562 // Load lower half of constant 0xf61e2562
movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562
add w4, w4, w20 // Add dest value
add w4, w4, w13 // Add constant 0xf61e2562
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
movz x13, #0xb340 // Load lower half of constant 0xc040b340
movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340
add w17, w17, w7 // Add dest value
add w17, w17, w13 // Add constant 0xc040b340
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
movz x13, #0x5a51 // Load lower half of constant 0x265e5a51
movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51
add w8, w8, w25 // Add dest value
add w8, w8, w13 // Add constant 0x265e5a51
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa
movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa
add w9, w9, w15 // Add dest value
add w9, w9, w13 // Add constant 0xe9b6c7aa
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
movz x13, #0x105d // Load lower half of constant 0xd62f105d
movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d
add w4, w4, w22 // Add dest value
add w4, w4, w13 // Add constant 0xd62f105d
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
movz x13, #0x1453 // Load lower half of constant 0x2441453
movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453
add w17, w17, w16 // Add dest value
add w17, w17, w13 // Add constant 0x2441453
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
movz x13, #0xe681 // Load lower half of constant 0xd8a1e681
movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681
add w8, w8, w27 // Add dest value
add w8, w8, w13 // Add constant 0xd8a1e681
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8
movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8
add w9, w9, w14 // Add dest value
add w9, w9, w13 // Add constant 0xe7d3fbc8
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6
movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6
add w4, w4, w24 // Add dest value
add w4, w4, w13 // Add constant 0x21e1cde6
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
movz x13, #0x7d6 // Load lower half of constant 0xc33707d6
movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6
add w17, w17, w12 // Add dest value
add w17, w17, w13 // Add constant 0xc33707d6
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
movz x13, #0xd87 // Load lower half of constant 0xf4d50d87
movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87
add w8, w8, w21 // Add dest value
add w8, w8, w13 // Add constant 0xf4d50d87
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
movz x13, #0x14ed // Load lower half of constant 0x455a14ed
movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed
add w9, w9, w5 // Add dest value
add w9, w9, w13 // Add constant 0x455a14ed
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
movz x13, #0xe905 // Load lower half of constant 0xa9e3e905
movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905
add w4, w4, w26 // Add dest value
add w4, w4, w13 // Add constant 0xa9e3e905
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8
movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8
add w17, w17, w3 // Add dest value
add w17, w17, w13 // Add constant 0xfcefa3f8
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
movz x13, #0x2d9 // Load lower half of constant 0x676f02d9
movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9
add w8, w8, w23 // Add dest value
add w8, w8, w13 // Add constant 0x676f02d9
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a
movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a
add w9, w9, w11 // Add dest value
add w9, w9, w13 // Add constant 0x8d2a4c8a
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #12 // Rotate left s=20 bits
movz x10, #0x3942 // Load lower half of constant 0xfffa3942
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12])
movk x10, #0xfffa, lsl #16 // Load upper half of constant 0xfffa3942
add w4, w4, w22 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xfffa3942
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xf681 // Load lower half of constant 0x8771f681
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5])
movk x10, #0x8771, lsl #16 // Load upper half of constant 0x8771f681
add w17, w17, w5 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x8771f681
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x6122 // Load lower half of constant 0x6d9d6122
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8])
movk x13, #0x6d9d, lsl #16 // Load upper half of constant 0x6d9d6122
add w8, w8, w25 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x6d9d6122
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x380c // Load lower half of constant 0xfde5380c
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11])
movk x13, #0xfde5, lsl #16 // Load upper half of constant 0xfde5380c
add w9, w9, w12 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xfde5380c
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xea44 // Load lower half of constant 0xa4beea44
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14])
movk x10, #0xa4be, lsl #16 // Load upper half of constant 0xa4beea44
add w4, w4, w20 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xa4beea44
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xcfa9 // Load lower half of constant 0x4bdecfa9
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1])
movk x10, #0x4bde, lsl #16 // Load upper half of constant 0x4bdecfa9
add w17, w17, w14 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x4bdecfa9
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x4b60 // Load lower half of constant 0xf6bb4b60
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4])
movk x13, #0xf6bb, lsl #16 // Load upper half of constant 0xf6bb4b60
add w8, w8, w23 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xf6bb4b60
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0xbc70 // Load lower half of constant 0xbebfbc70
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7])
movk x13, #0xbebf, lsl #16 // Load upper half of constant 0xbebfbc70
add w9, w9, w16 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xbebfbc70
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0x7ec6 // Load lower half of constant 0x289b7ec6
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10])
movk x10, #0x289b, lsl #16 // Load upper half of constant 0x289b7ec6
add w4, w4, w26 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0x289b7ec6
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x27fa // Load lower half of constant 0xeaa127fa
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13])
movk x10, #0xeaa1, lsl #16 // Load upper half of constant 0xeaa127fa
add w17, w17, w15 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xeaa127fa
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x3085 // Load lower half of constant 0xd4ef3085
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0])
movk x13, #0xd4ef, lsl #16 // Load upper half of constant 0xd4ef3085
add w8, w8, w21 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xd4ef3085
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x1d05 // Load lower half of constant 0x4881d05
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3])
movk x13, #0x488, lsl #16 // Load upper half of constant 0x4881d05
add w9, w9, w7 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0x4881d05
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xd039 // Load lower half of constant 0xd9d4d039
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6])
movk x10, #0xd9d4, lsl #16 // Load upper half of constant 0xd9d4d039
add w4, w4, w24 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xd9d4d039
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x99e5 // Load lower half of constant 0xe6db99e5
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9])
movk x10, #0xe6db, lsl #16 // Load upper half of constant 0xe6db99e5
add w17, w17, w11 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xe6db99e5
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x7cf8 // Load lower half of constant 0x1fa27cf8
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12])
movk x13, #0x1fa2, lsl #16 // Load upper half of constant 0x1fa27cf8
add w8, w8, w27 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x1fa27cf8
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x5665 // Load lower half of constant 0xc4ac5665
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15])
movk x13, #0xc4ac, lsl #16 // Load upper half of constant 0xc4ac5665
add w9, w9, w3 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xc4ac5665
add w9, w9, w6 // Add aux function result
ror w9, w9, #9 // Rotate left s=23 bits
movz x6, #0x2244 // Load lower half of constant 0xf4292244
movk x6, #0xf429, lsl #16 // Load upper half of constant 0xf4292244
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2])
add w4, w4, w15 // Add dest value
orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add constant 0xf4292244
eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add aux function result
ror w4, w4, #26 // Rotate left s=6 bits
movz x6, #0xff97 // Load lower half of constant 0x432aff97
movk x6, #0x432a, lsl #16 // Load upper half of constant 0x432aff97
add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0])
orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w23 // Add dest value
eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w6 // Add constant 0x432aff97
add w6, w17, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x17, #0x23a7 // Load lower half of constant 0xab9423a7
movk x17, #0xab94, lsl #16 // Load upper half of constant 0xab9423a7
add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7])
add w8, w8, w12 // Add dest value
orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add constant 0xab9423a7
eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x17, #0xa039 // Load lower half of constant 0xfc93a039
movk x17, #0xfc93, lsl #16 // Load upper half of constant 0xfc93a039
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14])
orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w22 // Add dest value
eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w17 // Add constant 0xfc93a039
add w17, w9, w13 // Add aux function result
ror w17, w17, #11 // Rotate left s=21 bits
movz x9, #0x59c3 // Load lower half of constant 0x655b59c3
movk x9, #0x655b, lsl #16 // Load upper half of constant 0x655b59c3
add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5])
add w4, w4, w11 // Add dest value
orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w4, w9 // Add constant 0x655b59c3
eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x4, #0xcc92 // Load lower half of constant 0x8f0ccc92
movk x4, #0x8f0c, lsl #16 // Load upper half of constant 0x8f0ccc92
add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12])
orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w21 // Add dest value
eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w6, w4 // Add constant 0x8f0ccc92
add w6, w4, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x4, #0xf47d // Load lower half of constant 0xffeff47d
movk x4, #0xffef, lsl #16 // Load upper half of constant 0xffeff47d
add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3])
add w8, w8, w16 // Add dest value
orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add constant 0xffeff47d
eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x4, #0x5dd1 // Load lower half of constant 0x85845dd1
movk x4, #0x8584, lsl #16 // Load upper half of constant 0x85845dd1
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10])
orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w17, w20 // Add dest value
eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w15, w4 // Add constant 0x85845dd1
add w4, w15, w17 // Add aux function result
ror w4, w4, #11 // Rotate left s=21 bits
movz x15, #0x7e4f // Load lower half of constant 0x6fa87e4f
movk x15, #0x6fa8, lsl #16 // Load upper half of constant 0x6fa87e4f
add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1])
add w4, w9, w5 // Add dest value
orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w15 // Add constant 0x6fa87e4f
eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w15, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x15, #0xe6e0 // Load lower half of constant 0xfe2ce6e0
movk x15, #0xfe2c, lsl #16 // Load upper half of constant 0xfe2ce6e0
add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8])
orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w27 // Add dest value
eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w6, w15 // Add constant 0xfe2ce6e0
add w6, w15, w9 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x9, #0x4314 // Load lower half of constant 0xa3014314
movk x9, #0xa301, lsl #16 // Load upper half of constant 0xa3014314
add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15])
add w6, w8, w7 // Add dest value
orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w6, w9 // Add constant 0xa3014314
eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w8, w9 // Add aux function result
ror w6, w6, #17 // Rotate left s=15 bits
movz x7, #0x11a1 // Load lower half of constant 0x4e0811a1
movk x7, #0x4e08, lsl #16 // Load upper half of constant 0x4e0811a1
add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6])
orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w17, w26 // Add dest value
eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w6, w7 // Add constant 0x4e0811a1
add w7, w9, w17 // Add aux function result
ror w7, w7, #11 // Rotate left s=21 bits
movz x6, #0x7e82 // Load lower half of constant 0xf7537e82
movk x6, #0xf753, lsl #16 // Load upper half of constant 0xf7537e82
add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13])
add w17, w4, w14 // Add dest value
orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w14, w17, w6 // Add constant 0xf7537e82
eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w14, w4 // Add aux function result
ror w17, w17, #26 // Rotate left s=6 bits
movz x6, #0xf235 // Load lower half of constant 0xbd3af235
movk x6, #0xbd3a, lsl #16 // Load upper half of constant 0xbd3af235
add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4])
orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w15, w25 // Add dest value
eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w6 // Add constant 0xbd3af235
add w16, w15, w17 // Add aux function result
ror w16, w16, #22 // Rotate left s=10 bits
movz x14, #0xd2bb // Load lower half of constant 0x2ad7d2bb
movk x14, #0x2ad7, lsl #16 // Load upper half of constant 0x2ad7d2bb
add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11])
add w6, w8, w3 // Add dest value
orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w6, w14 // Add constant 0x2ad7d2bb
eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w17, w16 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x3, #0xd391 // Load lower half of constant 0xeb86d391
movk x3, #0xeb86, lsl #16 // Load upper half of constant 0xeb86d391
add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2])
orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w9, w24 // Add dest value
eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w16, w15, w3 // Add constant 0xeb86d391
add w8, w16, w17 // Add aux function result
ror w8, w8, #11 // Rotate left s=21 bits
ldp w6, w15, [x0] // Reload MD5 state->A and state->B
ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D
add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9])
add w13, w4, w9 // Add result of MD5 rounds to state->D
add w12, w14, w5 // Add result of MD5 rounds to state->C
add w10, w7, w6 // Add result of MD5 rounds to state->A
add w11, w3, w15 // Add result of MD5 rounds to state->B
stp w12, w13, [x0, #8] // Store MD5 states C,D
stp w10, w11, [x0] // Store MD5 states A,B
add x1, x1, #64 // Increment data pointer
subs w2, w2, #1 // Decrement block counter
b.ne Lmd5_blocks_loop
ldp x21,x22,[sp,#16]
.cfi_restore x21
.cfi_restore x22
ldp x23,x24,[sp,#32]
.cfi_restore x23
.cfi_restore x24
ldp x25,x26,[sp,#48]
.cfi_restore x25
.cfi_restore x26
ldp x27,x28,[sp,#64]
.cfi_restore x27
.cfi_restore x28
ldp x19,x20,[sp],#80
.cfi_restore x19
.cfi_restore x20
.cfi_def_cfa_offset 0
ret
.cfi_endproc
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,309 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include "openssl/arm_arch.h"
.text
//-----------------------------------------------------------------------------
// beeu_mod_inverse_vartime
//
// Binary (right-shift) extended Euclidean algorithm computing a modular
// inverse: on success, out = a^(-1) mod n (see the invariants kept in the
// loop comments below).  Variable-time: execution depends on the bit
// pattern of the inputs, so this must only be used on public values.
//
// NOTE(review): C prototype inferred from register use in this body
// (x0 = out[4], x1 = a[4], x2 = n[4], 256-bit little-endian limb arrays;
// returns 1 in x0 on success, 0 on failure) — confirm against the C
// declaration in the BoringSSL headers.
//
// Register roles inside the loop:
//   x25..x28 = B (b0..b3)        x21..x24 = A (a0..a3)
//   x3..x7   = X (x0..x4)        x8..x12  = Y (y0..y4)
//   x0,x1,x2,x30 = n0..n3        x13..x15,x19,x20 = scratch
// x30 (LR) is repurposed to hold n3; the real return address is saved in
// the frame below and restored in the epilogue before ret.
//-----------------------------------------------------------------------------
.globl beeu_mod_inverse_vartime
.align 4
beeu_mod_inverse_vartime:
// Reserve enough space for 14 8-byte registers on the stack
// in the first stp call for x29, x30.
// Then store the remaining callee-saved registers.
//
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
// ^ ^
// sp <------------------- 112 bytes ----------------> old sp
// x29 (FP)
//
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-112]!
add x29,sp,#0
// Save all callee-saved registers used below (x19-x28) plus the out
// pointer (x0) and n pointer (x2), which are overwritten by the loop.
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x2,[sp,#96]
// B = b3..b0 := a
ldp x25,x26,[x1]
ldp x27,x28,[x1,#16]
// n3..n0 := n
// Note: the value of input params are changed in the following.
ldp x0,x1,[x2]
// n3 lands in x30 (the saved LR), freeing a register for the main loop.
ldp x2,x30,[x2,#16]
// A = a3..a0 := n
mov x21, x0
mov x22, x1
mov x23, x2
mov x24, x30
// X = x4..x0 := 1
mov x3, #1
eor x4, x4, x4
eor x5, x5, x5
eor x6, x6, x6
eor x7, x7, x7
// Y = y4..y0 := 0
eor x8, x8, x8
eor x9, x9, x9
eor x10, x10, x10
eor x11, x11, x11
eor x12, x12, x12
Lbeeu_loop:
// if B == 0, jump to .Lbeeu_loop_end
orr x14, x25, x26
orr x14, x14, x27
// reverse the bit order of x25. This is needed for clz after this macro
rbit x15, x25
orr x14, x14, x28
cbz x14,Lbeeu_loop_end
// 0 < B < |n|,
// 0 < A <= |n|,
// (1) X*a == B (mod |n|),
// (2) (-1)*Y*a == A (mod |n|)
// Now divide B by the maximum possible power of two in the
// integers, and divide X by the same value mod |n|.
// When we're done, (1) still holds.
// shift := number of trailing 0s in x25
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
clz x13, x15
// If there is no shift, goto shift_A_Y
cbz x13, Lbeeu_shift_A_Y
// Shift B right by "x13" bits
// neg gives the complementary left-shift count for carrying bits down
// between adjacent limbs (x15/x19/x20 hold the carried-in bits).
neg x14, x13
lsr x25, x25, x13
lsl x15, x26, x14
lsr x26, x26, x13
lsl x19, x27, x14
orr x25, x25, x15
lsr x27, x27, x13
lsl x20, x28, x14
orr x26, x26, x19
lsr x28, x28, x13
orr x27, x27, x20
// Shift X right by "x13" bits, adding n whenever X becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
Lbeeu_shift_loop_X:
// If X is even (bit 0 clear), skip the +n adjustment and just halve it.
tbz x3, #0, Lshift1_0
adds x3, x3, x0
adcs x4, x4, x1
adcs x5, x5, x2
adcs x6, x6, x30
adc x7, x7, x14
Lshift1_0:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x3, x4, x3, #1
extr x4, x5, x4, #1
extr x5, x6, x5, #1
extr x6, x7, x6, #1
lsr x7, x7, #1
subs x13, x13, #1
bne Lbeeu_shift_loop_X
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
// with the following differences:
// - "x13" is set directly to the number of trailing 0s in B
// (using rbit and clz instructions)
// - The loop is only used to call SHIFT1(X)
// and x13 is decreased while executing the X loop.
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
Lbeeu_shift_A_Y:
// Same for A and Y.
// Afterwards, (2) still holds.
// Reverse the bit order of x21
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
rbit x15, x21
clz x13, x15
// If there is no shift, goto |B-A|, X+Y update
cbz x13, Lbeeu_update_B_X_or_A_Y
// Shift A right by "x13" bits
neg x14, x13
lsr x21, x21, x13
lsl x15, x22, x14
lsr x22, x22, x13
lsl x19, x23, x14
orr x21, x21, x15
lsr x23, x23, x13
lsl x20, x24, x14
orr x22, x22, x19
lsr x24, x24, x13
orr x23, x23, x20
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
Lbeeu_shift_loop_Y:
// If Y is even (bit 0 clear), skip the +n adjustment and just halve it.
tbz x8, #0, Lshift1_1
adds x8, x8, x0
adcs x9, x9, x1
adcs x10, x10, x2
adcs x11, x11, x30
adc x12, x12, x14
Lshift1_1:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x8, x9, x8, #1
extr x9, x10, x9, #1
extr x10, x11, x10, #1
extr x11, x12, x11, #1
lsr x12, x12, #1
subs x13, x13, #1
bne Lbeeu_shift_loop_Y
Lbeeu_update_B_X_or_A_Y:
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
// without taking a sign bit if generated. The lack of a carry would
// indicate a negative result. See, for example,
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
subs x14, x25, x21
sbcs x15, x26, x22
sbcs x19, x27, x23
sbcs x20, x28, x24
bcs Lbeeu_B_greater_than_A
// Else A > B =>
// A := A - B; Y := Y + X; goto beginning of the loop
subs x21, x21, x25
sbcs x22, x22, x26
sbcs x23, x23, x27
sbcs x24, x24, x28
adds x8, x8, x3
adcs x9, x9, x4
adcs x10, x10, x5
adcs x11, x11, x6
adc x12, x12, x7
b Lbeeu_loop
Lbeeu_B_greater_than_A:
// Continue with B > A =>
// B := B - A; X := X + Y; goto beginning of the loop
// T (the B - A computed above) is already in x14,x15,x19,x20.
mov x25, x14
mov x26, x15
mov x27, x19
mov x28, x20
adds x3, x3, x8
adcs x4, x4, x9
adcs x5, x5, x10
adcs x6, x6, x11
adc x7, x7, x12
b Lbeeu_loop
Lbeeu_loop_end:
// The Euclid's algorithm loop ends when A == gcd(a,n);
// this would be 1, when a and n are co-prime (i.e. do not have a common factor).
// Since (-1)*Y*a == A (mod |n|), Y>0
// then out = -Y mod n
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
// Is A-1 == 0?
// If not, fail.
sub x14, x21, #1
orr x14, x14, x22
orr x14, x14, x23
orr x14, x14, x24
cbnz x14, Lbeeu_err
// If Y>n ==> Y:=Y-n
Lbeeu_reduction_loop:
// x_i := y_i - n_i (X is no longer needed, use it as temp)
// (x14 = 0 from above)
subs x3, x8, x0
sbcs x4, x9, x1
sbcs x5, x10, x2
sbcs x6, x11, x30
sbcs x7, x12, x14
// If result is non-negative (i.e., cs = carry set = no borrow),
// y_i := x_i; goto reduce again
// else
// y_i := y_i; continue
csel x8, x3, x8, cs
csel x9, x4, x9, cs
csel x10, x5, x10, cs
csel x11, x6, x11, cs
csel x12, x7, x12, cs
bcs Lbeeu_reduction_loop
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
// out = -Y = n-Y
subs x8, x0, x8
sbcs x9, x1, x9
sbcs x10, x2, x10
sbcs x11, x30, x11
// Save Y in output (out (x0) was saved on the stack)
ldr x3, [sp,#96]
stp x8, x9, [x3]
stp x10, x11, [x3,#16]
// return 1 (success)
mov x0, #1
b Lbeeu_finish
Lbeeu_err:
// return 0 (error)
eor x0, x0, x0
Lbeeu_finish:
// Restore callee-saved registers, except x0, x2
add sp,x29,#0
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
// Restores the real x30 (LR) saved in the prologue, replacing the n3 copy.
ldp x29,x30,[sp],#112
AARCH64_VALIDATE_LINK_REGISTER
ret
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

View File

@@ -0,0 +1,39 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>
.arch armv8-a
.text
# int CRYPTO_rndr_multiple8(uint8_t *out, const size_t len)
//-----------------------------------------------------------------------------
// int CRYPTO_rndr_multiple8(uint8_t *out, const size_t len)
//
// Fills |out| with |len| bytes from the Armv8.5-A RNDR random-number
// system register, 8 bytes per read.
// In:    x0 = out, x1 = len (must be a non-zero multiple of 8)
// Out:   x0 = 1 on success, 0 on failure (len == 0 or RNDR read failed)
// Clobb: x1, x2, flags
//
// NOTE(review): if len is NOT a multiple of 8, the repeated
// 'sub x1, x1, #8' never reaches zero, x1 wraps around and the loop
// overruns |out| — the "multiple8" contract must be enforced by callers.
// NOTE(review): failure is detected by a zero value in x2; the
// architectural failure signal for RNDR is PSTATE.Z with a zero result,
// so a genuinely random all-zero word (p = 2^-64) is also treated as an
// error here — presumably intentional; confirm against the generator.
//-----------------------------------------------------------------------------
.globl CRYPTO_rndr_multiple8
.def CRYPTO_rndr_multiple8
.type 32
.endef
.align 4
CRYPTO_rndr_multiple8:
cbz x1, Lrndr_multiple8_error // len = 0 is not supported
Lrndr_multiple8_loop:
// s3_3_c2_c4_0 is the raw encoding of RNDR (named form needs newer tools).
mrs x2, s3_3_c2_c4_0 // rndr instruction https://developer.arm.com/documentation/ddi0601/2024-09/Index-by-Encoding
cbz x2, Lrndr_multiple8_error // Check if rndr failed
str x2, [x0], #8 // Copy 8 bytes to *out and increment pointer by 8
sub x1, x1, #8
cbz x1, Lrndr_multiple8_done // If multiple of 8 this will be 0 eventually
b Lrndr_multiple8_loop
Lrndr_multiple8_done:
mov x0, #1 // Return value success
ret
Lrndr_multiple8_error:
mov x0, #0 // Return value error
ret
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff