chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

File diff suppressed because it is too large


@@ -0,0 +1,855 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.section .rodata
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
.globl aes_hw_set_encrypt_key
.hidden aes_hw_set_encrypt_key
.type aes_hw_set_encrypt_key,%function
.align 5
aes_hw_set_encrypt_key:
.cfi_startproc
.Lenc_key:
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#3] // kFlag_aes_hw_set_encrypt_key
#endif
// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
mov x3,#-1
cmp x0,#0
b.eq .Lenc_key_abort
cmp x2,#0
b.eq .Lenc_key_abort
mov x3,#-2
cmp w1,#128
b.lt .Lenc_key_abort
cmp w1,#256
b.gt .Lenc_key_abort
tst w1,#0x3f
b.ne .Lenc_key_abort
adrp x3,.Lrcon
add x3,x3,:lo12:.Lrcon
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt .Loop128
b.eq .L192
b .L256
.align 4
.Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne .Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10
b .Ldone
.align 4
.L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
.Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne .Loop192
mov w12,#12
add x2,x2,#0x20
b .Ldone
.align 4
.L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14
st1 {v3.4s},[x2],#16
.Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq .Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b .Loop256
.Ldone:
str w12,[x2]
mov x3,#0
.Lenc_key_abort:
mov x0,x3 // return value
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
.globl aes_hw_set_decrypt_key
.hidden aes_hw_set_decrypt_key
.type aes_hw_set_decrypt_key,%function
.align 5
aes_hw_set_decrypt_key:
.cfi_startproc
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
bl .Lenc_key
cmp x0,#0
b.ne .Ldec_key_abort
sub x2,x2,#240 // restore original x2
mov x4,#-16
add x0,x2,x12,lsl#4 // end of key schedule
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
.Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi .Loop_imc
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
.Ldec_key_abort:
ldp x29,x30,[sp],#16
.cfi_restore x29
.cfi_restore x30
.cfi_def_cfa_offset 0
AARCH64_VALIDATE_LINK_REGISTER
ret
.cfi_endproc
.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
.globl aes_hw_encrypt
.hidden aes_hw_encrypt
.type aes_hw_encrypt,%function
.align 5
aes_hw_encrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_enc:
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_enc
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.size aes_hw_encrypt,.-aes_hw_encrypt
.globl aes_hw_decrypt
.hidden aes_hw_decrypt
.type aes_hw_decrypt,%function
.align 5
aes_hw_decrypt:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#1] // kFlag_aes_hw_encrypt
#endif
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
.Loop_dec:
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt .Loop_dec
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
.cfi_endproc
.size aes_hw_decrypt,.-aes_hw_decrypt
.globl aes_hw_cbc_encrypt
.hidden aes_hw_cbc_encrypt
.type aes_hw_cbc_encrypt,%function
.align 5
aes_hw_cbc_encrypt:
.cfi_startproc
// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
subs x2,x2,#16
mov x8,#16
b.lo .Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
b.eq .Lcbc_dec
cmp w5,#2
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq .Lcbc_enc128
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq .Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
.Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_enc128:
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b .Lenter_cbc_enc128
.Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs .Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b .Lcbc_done
.align 5
.Lcbc_dec:
ld1 {v18.16b},[x0],#16
subs x2,x2,#32 // bias
add w6,w5,#2
orr v3.16b,v0.16b,v0.16b
orr v1.16b,v0.16b,v0.16b
orr v19.16b,v18.16b,v18.16b
b.lo .Lcbc_dec_tail
orr v1.16b,v18.16b,v18.16b
ld1 {v18.16b},[x0],#16
orr v2.16b,v0.16b,v0.16b
orr v3.16b,v1.16b,v1.16b
orr v19.16b,v18.16b,v18.16b
.Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such a way that
// at exit from the loop v1.16b-v18.16b
// are loaded with the last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs .Loop3x_cbc_dec
cmn x2,#0x30
b.eq .Lcbc_done
nop
.Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Lcbc_dec_tail
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq .Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b .Lcbc_done
.Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
.Lcbc_done:
st1 {v6.16b},[x4]
.Lcbc_abort:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
.globl aes_hw_ctr32_encrypt_blocks
.hidden aes_hw_ctr32_encrypt_blocks
.type aes_hw_ctr32_encrypt_blocks,%function
.align 5
aes_hw_ctr32_encrypt_blocks:
.cfi_startproc
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9] // kFlag_aes_hw_ctr32_encrypt_blocks
#endif
// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
.cfi_def_cfa_offset 16
.cfi_offset x29, -16
.cfi_offset x30, -8
add x29,sp,#0
.cfi_def_cfa x29, 16
ldr w5,[x3,#240]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips this bug.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __AARCH64EB__
rev w8, w8
#endif
add w10, w8, #1
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls .Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
orr v18.16b,v6.16b,v6.16b
b .Loop3x_ctr32
.align 4
.Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt .Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs .Loop3x_ctr32
adds x2,x2,#3
b.eq .Lctr32_done
.Lctr32_tail:
cmp x2,#1
b.lt .Lctr32_done // if len = 0, go to done
mov x12,#16
csel x12,xzr,x12,eq
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt .Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
cbz x12,.Lctr32_done // if step = 0 (len = 1), go to done
st1 {v3.16b},[x1]
.Lctr32_done:
ldr x29,[sp],#16
.cfi_restore x29
.cfi_def_cfa_offset 0
ret
.cfi_endproc
.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
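
The aes_hw_ctr32_encrypt_blocks routine above keeps the 32-bit big-endian block
counter in the last lane of the counter block and, per the Cortex-A57/A72
errata comment, stages each updated counter value in v6 before copying it into
v0/v1/v18. For orientation only, here is a minimal C sketch of the counter
arithmetic itself: incrementing the 32-bit big-endian counter held in the last
four bytes of a 16-byte CTR block. The helper name and layout are illustrative,
not part of the generated code.

#include <stdint.h>

// Increment the 32-bit big-endian counter in bytes 12..15 of a 16-byte CTR
// block; the upper 96 bits (the nonce) are left untouched. Illustrative sketch
// of the per-block counter update performed by aes_hw_ctr32_encrypt_blocks.
static void ctr32_increment(uint8_t block[16]) {
    uint32_t ctr = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
                   ((uint32_t)block[14] << 8) | (uint32_t)block[15];
    ctr++;  // wraps modulo 2^32
    block[12] = (uint8_t)(ctr >> 24);
    block[13] = (uint8_t)(ctr >> 16);
    block[14] = (uint8_t)(ctr >> 8);
    block[15] = (uint8_t)ctr;
}

On little-endian AArch64 the assembly performs the same update with rev on the
lane value, which is why the counter lives in a single 32-bit lane of the
vector register.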

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,93 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.hidden bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Ladd_tail
.Ladd_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
adcs x4, x4, x6
adcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Ladd_loop
.Ladd_tail:
cbz x3, .Ladd_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
adcs x4, x4, x6
str x4, [x0], #8
.Ladd_exit:
cset x0, cs
ret
.cfi_endproc
.size bn_add_words,.-bn_add_words
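
As a reference for the carry handling above, the following is a minimal
portable C sketch of the contract bn_add_words implements: element-wise
addition with carry propagation, returning the final carry. It assumes
BN_ULONG is a 64-bit limb, as it is on AArch64; in the assembly the ADCS
instructions keep this carry in the processor's carry flag instead of a
variable.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t BN_ULONG;  // 64-bit limb, as on AArch64

// rp[i] = ap[i] + bp[i] + carry for i in [0, num); returns the final carry.
static BN_ULONG bn_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                                 const BN_ULONG *bp, size_t num) {
  BN_ULONG carry = 0;
  for (size_t i = 0; i < num; i++) {
    BN_ULONG t = ap[i] + carry;
    BN_ULONG c = t < carry;       // carry out of ap[i] + carry-in
    rp[i] = t + bp[i];
    carry = c | (rp[i] < t);      // carry out of the second addition
  }
  return carry;
}
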
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.hidden bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Lsub_tail
.Lsub_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
sbcs x4, x4, x6
sbcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Lsub_loop
.Lsub_tail:
cbz x3, .Lsub_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
sbcs x4, x4, x6
str x4, [x0], #8
.Lsub_exit:
cset x0, cc
ret
.cfi_endproc
.size bn_sub_words,.-bn_sub_words
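
The subtraction routine mirrors the addition above, except that Arm's carry
flag encodes "no borrow occurred", which is why the code sets C with
cmp xzr, xzr before the SBCS loop and returns the result of cset x0, cc. A
portable C sketch of the same contract, under the same 64-bit-limb assumption,
tracking the borrow directly:

#include <stddef.h>
#include <stdint.h>

typedef uint64_t BN_ULONG;  // 64-bit limb, as on AArch64

// rp[i] = ap[i] - bp[i] - borrow for i in [0, num); returns the final borrow.
static BN_ULONG bn_sub_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                                 const BN_ULONG *bp, size_t num) {
  BN_ULONG borrow = 0;
  for (size_t i = 0; i < num; i++) {
    BN_ULONG t = ap[i] - bp[i];
    BN_ULONG b = ap[i] < bp[i];   // borrow from the first subtraction
    rp[i] = t - borrow;
    borrow = b | (t < borrow);    // borrow from subtracting the carry-in
  }
  return borrow;
}
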
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)


@@ -0,0 +1,335 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.text
.globl gcm_init_neon
.hidden gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
shl v19.2d, v19.2d, #57 // 0xc2.0
ext v3.16b, v17.16b, v17.16b, #8
ushr v18.2d, v19.2d, #63
dup v17.4s, v17.s[1]
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
ushr v18.2d, v3.2d, #63
sshr v17.4s, v17.4s, #31 // broadcast carry bit
and v18.16b, v18.16b, v16.16b
shl v3.2d, v3.2d, #1
ext v18.16b, v18.16b, v18.16b, #8
and v16.16b, v16.16b, v17.16b
orr v3.16b, v3.16b, v18.16b // H<<<=1
eor v5.16b, v3.16b, v16.16b // twisted H
st1 {v5.2d}, [x0] // store Htable[0]
ret
.size gcm_init_neon,.-gcm_init_neon
.globl gcm_gmult_neon
.hidden gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, .Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v3.16b, v3.16b // byteswap Xi
ext v3.16b, v3.16b, v3.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
mov x3, #16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.globl gcm_ghash_neon
.hidden gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, .Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v0.16b, v0.16b // byteswap Xi
ext v0.16b, v0.16b, v0.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
.Loop_neon:
ld1 {v3.16b}, [x2], #16 // load inp
rev64 v3.16b, v3.16b // byteswap inp
ext v3.16b, v3.16b, v3.16b, #8
eor v3.16b, v3.16b, v0.16b // inp ^= Xi
.Lgmult_neon:
// Split the input into v3 and v4. (The upper halves are unused,
// so it is okay to leave them alone.)
ins v4.d[0], v3.d[1]
ext v16.8b, v5.8b, v5.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v0.8b, v3.8b, v3.8b, #1 // B1
pmull v0.8h, v5.8b, v0.8b // E = A*B1
ext v17.8b, v5.8b, v5.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v5.8b, v19.8b // G = A*B2
ext v18.8b, v5.8b, v5.8b, #3 // A3
eor v16.16b, v16.16b, v0.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v0.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v0.8h, v5.8b, v0.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v0.16b // N = I + J
pmull v19.8h, v5.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v0.8h, v5.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v0.16b, v0.16b, v16.16b
eor v0.16b, v0.16b, v18.16b
eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
ext v16.8b, v7.8b, v7.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v1.8b, v3.8b, v3.8b, #1 // B1
pmull v1.8h, v7.8b, v1.8b // E = A*B1
ext v17.8b, v7.8b, v7.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v7.8b, v19.8b // G = A*B2
ext v18.8b, v7.8b, v7.8b, #3 // A3
eor v16.16b, v16.16b, v1.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v1.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v1.8h, v7.8b, v1.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v1.16b // N = I + J
pmull v19.8h, v7.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v1.8h, v7.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v1.16b, v1.16b, v16.16b
eor v1.16b, v1.16b, v18.16b
ext v16.8b, v6.8b, v6.8b, #1 // A1
pmull v16.8h, v16.8b, v4.8b // F = A1*B
ext v2.8b, v4.8b, v4.8b, #1 // B1
pmull v2.8h, v6.8b, v2.8b // E = A*B1
ext v17.8b, v6.8b, v6.8b, #2 // A2
pmull v17.8h, v17.8b, v4.8b // H = A2*B
ext v19.8b, v4.8b, v4.8b, #2 // B2
pmull v19.8h, v6.8b, v19.8b // G = A*B2
ext v18.8b, v6.8b, v6.8b, #3 // A3
eor v16.16b, v16.16b, v2.16b // L = E + F
pmull v18.8h, v18.8b, v4.8b // J = A3*B
ext v2.8b, v4.8b, v4.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v2.8h, v6.8b, v2.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v4.8b, v4.8b, #4 // B4
eor v18.16b, v18.16b, v2.16b // N = I + J
pmull v19.8h, v6.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v2.8h, v6.8b, v4.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v2.16b, v2.16b, v16.16b
eor v2.16b, v2.16b, v18.16b
ext v16.16b, v0.16b, v2.16b, #8
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
eor v1.16b, v1.16b, v2.16b
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins v2.d[0], v1.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl v17.2d, v0.2d, #57 // 1st phase
shl v18.2d, v0.2d, #62
eor v18.16b, v18.16b, v17.16b //
shl v17.2d, v0.2d, #63
eor v18.16b, v18.16b, v17.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor v18.16b, v18.16b, v1.16b
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
ushr v18.2d, v0.2d, #1 // 2nd phase
eor v2.16b, v2.16b,v0.16b
eor v0.16b, v0.16b,v18.16b //
ushr v18.2d, v18.2d, #6
ushr v0.2d, v0.2d, #1 //
eor v0.16b, v0.16b, v2.16b //
eor v0.16b, v0.16b, v18.16b //
subs x3, x3, #16
bne .Loop_neon
rev64 v0.16b, v0.16b // byteswap Xi and write
ext v0.16b, v0.16b, v0.16b, #8
st1 {v0.16b}, [x0]
ret
.size gcm_ghash_neon,.-gcm_ghash_neon
.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
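
The routines above evaluate GHASH with 8-bit pmull products, the k48/k32/k16
masks from .Lmasks, and a Karatsuba-style recombination. As a correctness
reference only (this is not how the assembly computes it), here is a
bit-at-a-time C model of the underlying GF(2^128) multiplication from the GCM
specification, using the big-endian byte order GHASH operates on. The function
name is illustrative.

#include <stdint.h>

// One GF(2^128) multiplication as defined for GHASH (NIST SP 800-38D),
// on 16-byte big-endian blocks. Bit 0 is the most significant bit of the
// first byte; the reduction constant is R = 0xe1 followed by 120 zero bits.
static void gf128_mul(uint8_t out[16], const uint8_t x[16], const uint8_t y[16]) {
  uint64_t vh = 0, vl = 0, zh = 0, zl = 0;
  for (int i = 0; i < 8; i++) {      // load Y as a 128-bit value (vh:vl)
    vh = (vh << 8) | y[i];
    vl = (vl << 8) | y[8 + i];
  }
  for (int i = 0; i < 128; i++) {
    if ((x[i / 8] >> (7 - (i % 8))) & 1) {  // bit i of X, MSB first
      zh ^= vh;
      zl ^= vl;
    }
    uint64_t lsb = vl & 1;           // bit shifted out of V
    vl = (vl >> 1) | (vh << 63);     // V >>= 1 across 128 bits
    vh >>= 1;
    if (lsb) {
      vh ^= 0xe100000000000000ULL;   // conditional reduction by R
    }
  }
  for (int i = 0; i < 8; i++) {      // store Z back as big-endian bytes
    out[i] = (uint8_t)(zh >> (56 - 8 * i));
    out[8 + i] = (uint8_t)(zl >> (56 - 8 * i));
  }
}

gcm_gmult_neon corresponds to one such multiplication of Xi by the hash key H;
gcm_ghash_neon folds each input block into Xi and multiplies in the same way.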


@@ -0,0 +1,665 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.globl gcm_init_v8
.hidden gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x1] //load input H
movi v19.16b,#0xe1
shl v19.2d,v19.2d,#57 //0xc2.0
ext v3.16b,v17.16b,v17.16b,#8
ushr v18.2d,v19.2d,#63
dup v17.4s,v17.s[1]
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
ushr v18.2d,v3.2d,#63
sshr v17.4s,v17.4s,#31 //broadcast carry bit
and v18.16b,v18.16b,v16.16b
shl v3.2d,v3.2d,#1
ext v18.16b,v18.16b,v18.16b,#8
and v16.16b,v16.16b,v17.16b
orr v3.16b,v3.16b,v18.16b //H<<<=1
eor v20.16b,v3.16b,v16.16b //twisted H
ext v20.16b, v20.16b, v20.16b, #8
st1 {v20.2d},[x0],#16 //store Htable[0]
//calculate H^2
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
pmull2 v0.1q,v20.2d,v20.2d
eor v16.16b,v16.16b,v20.16b
pmull v2.1q,v20.1d,v20.1d
pmull v1.1q,v16.1d,v16.1d
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v17.16b,v0.16b,v18.16b
ext v22.16b,v17.16b,v17.16b,#8 //Karatsuba pre-processing
eor v17.16b,v17.16b,v22.16b
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v21.2d},[x0],#16 //store Htable[1..2]
st1 {v22.2d},[x0],#16 //store Htable[1..2]
//calculate H^3 and H^4
pmull2 v0.1q,v20.2d, v22.2d
pmull2 v5.1q,v22.2d,v22.2d
pmull v2.1q,v20.1d, v22.1d
pmull v7.1q,v22.1d,v22.1d
pmull v1.1q,v16.1d,v17.1d
pmull v6.1q,v17.1d,v17.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b, v0.16b,v18.16b //H^3
eor v17.16b, v5.16b,v4.16b //H^4
ext v23.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v25.16b,v17.16b,v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v23.16b
eor v17.16b,v17.16b,v25.16b
eor v18.16b,v18.16b,v22.16b
ext v24.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v23.2d,v24.2d,v25.2d},[x0],#48 //store Htable[3..5]
//calculate H^5 and H^6
pmull2 v0.1q,v22.2d, v23.2d
pmull2 v5.1q,v23.2d,v23.2d
pmull v2.1q,v22.1d, v23.1d
pmull v7.1q,v23.1d,v23.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v16.1d,v16.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^5
eor v17.16b,v5.16b,v4.16b //H^6
ext v26.16b, v16.16b, v16.16b,#8 //Karatsuba pre-processing
ext v28.16b, v17.16b, v17.16b,#8
ext v18.16b,v22.16b,v22.16b,#8
eor v16.16b,v16.16b,v26.16b
eor v17.16b,v17.16b,v28.16b
eor v18.16b,v18.16b,v22.16b
ext v27.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v26.2d,v27.2d,v28.2d},[x0],#48 //store Htable[6..8]
//calculate H^7 and H^8
pmull2 v0.1q,v22.2d,v26.2d
pmull2 v5.1q,v22.2d,v28.2d
pmull v2.1q,v22.1d,v26.1d
pmull v7.1q,v22.1d,v28.1d
pmull v1.1q,v16.1d,v18.1d
pmull v6.1q,v17.1d,v18.1d
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
ext v17.16b,v5.16b,v7.16b,#8
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v16.16b
eor v4.16b,v5.16b,v7.16b
eor v6.16b,v6.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase
eor v6.16b,v6.16b,v4.16b
pmull v4.1q,v5.1d,v19.1d
ins v2.d[0],v1.d[1]
ins v7.d[0],v6.d[1]
ins v1.d[1],v0.d[0]
ins v6.d[1],v5.d[0]
eor v0.16b,v1.16b,v18.16b
eor v5.16b,v6.16b,v4.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
ext v4.16b,v5.16b,v5.16b,#8
pmull v0.1q,v0.1d,v19.1d
pmull v5.1q,v5.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v4.16b,v4.16b,v7.16b
eor v16.16b,v0.16b,v18.16b //H^7
eor v17.16b,v5.16b,v4.16b //H^8
ext v29.16b,v16.16b,v16.16b,#8 //Karatsuba pre-processing
ext v31.16b,v17.16b,v17.16b,#8
eor v16.16b,v16.16b,v29.16b
eor v17.16b,v17.16b,v31.16b
ext v30.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
st1 {v29.2d,v30.2d,v31.2d},[x0] //store Htable[9..11]
ret
.size gcm_init_v8,.-gcm_init_v8
.globl gcm_gmult_v8
.hidden gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
ld1 {v17.2d},[x0] //load Xi
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
ext v20.16b,v20.16b,v20.16b,#8
shl v19.2d,v19.2d,#57
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
.globl gcm_ghash_v8
.hidden gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
cmp x3,#64
b.hs .Lgcm_ghash_v8_4x
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
movi v19.16b,#0xe1
ld1 {v22.2d},[x1]
ext v22.16b,v22.16b,v22.16b,#8
csel x12,xzr,x12,eq //is it time to zero x12?
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
pmull2 v6.1q,v20.2d,v7.2d
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
ext v18.16b,v3.16b,v3.16b,#8
subs x3,x3,#32 //is there more data?
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
csel x12,xzr,x12,lo //is it time to zero x12?
pmull v5.1q,v21.1d,v17.1d
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
eor v0.16b,v0.16b,v4.16b //accumulate
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
eor v2.16b,v2.16b,v6.16b
csel x12,xzr,x12,eq //is it time to zero x12?
eor v1.16b,v1.16b,v5.16b
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v7.16b,v17.16b,v17.16b,#8
ext v3.16b,v16.16b,v16.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v3.16b,v3.16b,v18.16b
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
eor v3.16b,v3.16b,v0.16b
pmull2 v6.1q,v20.2d,v7.2d
b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
eor v2.16b,v2.16b,v18.16b
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
adds x3,x3,#32 //re-construct x3
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
b.eq .Ldone_v8 //is x3 zero?
.Lodd_tail_v8:
ext v18.16b,v0.16b,v0.16b,#8
eor v3.16b,v3.16b,v0.16b //inp^=Xi
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
.Ldone_v8:
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
ld1 {v0.2d},[x0] //load [rotated] Xi
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
ext v20.16b,v20.16b,v20.16b,#8
ext v22.16b,v22.16b,v22.16b,#8
movi v19.16b,#0xe1
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
ext v26.16b,v26.16b,v26.16b,#8
ext v28.16b,v28.16b,v28.16b,#8
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
ext v25.16b,v7.16b,v7.16b,#8
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
pmull2 v31.1q,v20.2d,v25.2d
pmull v30.1q,v21.1d,v7.1d
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
rev64 v4.16b,v4.16b
#endif
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
ext v25.16b,v7.16b,v7.16b,#8
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
ext v24.16b,v6.16b,v6.16b,#8
eor v1.16b,v1.16b,v30.16b
ext v23.16b,v5.16b,v5.16b,#8
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
eor v7.16b,v7.16b,v25.16b
eor v1.16b,v1.16b,v17.16b
pmull2 v31.1q,v20.2d,v25.2d
eor v1.16b,v1.16b,v18.16b
pmull v30.1q,v21.1d,v7.1d
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
eor v6.16b,v6.16b,v24.16b
pmull2 v24.1q,v22.2d,v24.2d
eor v0.16b,v1.16b,v18.16b
pmull2 v6.1q,v21.2d,v6.2d
eor v29.16b,v29.16b,v16.16b
eor v31.16b,v31.16b,v24.16b
eor v30.16b,v30.16b,v6.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v18.16b,v18.16b,v2.16b
pmull2 v23.1q,v26.2d,v23.2d
pmull v5.1q,v27.1d,v5.1d
eor v0.16b,v0.16b,v18.16b
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v30.16b,v30.16b,v5.16b
subs x3,x3,#64
b.hs .Loop4x
.Ltail4x:
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v28.2d,v3.2d
pmull2 v1.1q,v27.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
adds x3,x3,#64
b.eq .Ldone4x
cmp x3,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v24.16b,v6.16b,v6.16b,#8
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
eor v6.16b,v6.16b,v24.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
pmull2 v31.1q,v20.2d,v24.2d
pmull v30.1q,v21.1d,v6.1d
eor v0.16b,v0.16b,v18.16b
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
eor v5.16b,v5.16b,v23.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull2 v23.1q,v22.2d,v23.2d
eor v16.16b,v4.16b,v0.16b
pmull2 v5.1q,v21.2d,v5.2d
ext v3.16b,v16.16b,v16.16b,#8
eor v29.16b,v29.16b,v7.16b
eor v31.16b,v31.16b,v23.16b
eor v30.16b,v30.16b,v5.16b
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v26.2d,v3.2d
pmull v1.1q,v27.1d,v16.1d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x
.align 4
.Ltwo:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
ext v23.16b,v5.16b,v5.16b,#8
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
eor v5.16b,v5.16b,v23.16b
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull2 v31.1q,v20.2d,v23.2d
pmull v30.1q,v21.1d,v5.1d
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v22.2d,v3.2d
pmull2 v1.1q,v21.2d,v16.2d
eor v0.16b,v0.16b,v29.16b
eor v2.16b,v2.16b,v31.16b
eor v1.16b,v1.16b,v30.16b
b .Ldone4x
.align 4
.Lone:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
rev64 v4.16b,v4.16b
#endif
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
eor v16.16b,v4.16b,v0.16b
ext v3.16b,v16.16b,v16.16b,#8
pmull v0.1q,v20.1d,v3.1d
eor v16.16b,v16.16b,v3.16b
pmull2 v2.1q,v20.2d,v3.2d
pmull v1.1q,v21.1d,v16.1d
.Ldone4x:
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
ins v2.d[0],v1.d[1]
ins v1.d[1],v0.d[0]
eor v0.16b,v1.16b,v18.16b
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
pmull v0.1q,v0.1d,v19.1d
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
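
The PMULL-based code above leans on a Karatsuba split throughout: each
128x128-bit carryless product is assembled from three 64x64-bit PMULL results
(low halves, high halves, and the product of the XORed halves), which is why
the Htable entries store "Karatsuba pre-processed" XORed values. A small C
model of that identity, with a plain shift-and-XOR clmul64 standing in for
PMULL; both helper names are illustrative, not part of the generated code.

#include <stdint.h>

// 64x64 -> 128-bit carryless multiply (software stand-in for PMULL).
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
  uint64_t h = 0, l = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      l ^= a << i;
      if (i != 0) h ^= a >> (64 - i);
    }
  }
  *hi = h;
  *lo = l;
}

// Karatsuba over GF(2): (Ah:Al) x (Bh:Bl) from three 64-bit products.
// The cross terms Ah*Bl ^ Al*Bh equal (Ah^Al)*(Bh^Bl) ^ Ah*Bh ^ Al*Bl.
// r[0..3] hold the 256-bit result in little-endian 64-bit limbs.
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
                               uint64_t r[4]) {
  uint64_t lo_h, lo_l, hi_h, hi_l, mid_h, mid_l;
  clmul64(a[0], b[0], &lo_h, &lo_l);                  // Al*Bl
  clmul64(a[1], b[1], &hi_h, &hi_l);                  // Ah*Bh
  clmul64(a[0] ^ a[1], b[0] ^ b[1], &mid_h, &mid_l);  // (Al^Ah)*(Bl^Bh)
  mid_h ^= lo_h ^ hi_h;
  mid_l ^= lo_l ^ hi_l;
  r[0] = lo_l;
  r[1] = lo_h ^ mid_l;   // fold the middle term in at bit offset 64
  r[2] = hi_l ^ mid_h;
  r[3] = hi_h;
}

The assembly then reduces the wide product modulo the GHASH polynomial in the
two "phase" steps that follow each multiplication.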


@@ -0,0 +1,247 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.text
.align 8 // strategic alignment and padding that allows using the
// address value as a loop termination condition...
.quad 0,0,0,0,0,0,0,0
.type iotas_hw,%object
iotas_hw:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas_hw,.-iotas_hw
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr x28,iotas_hw
stp x28,x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
.Loop:
////////////////////////////////////////// Theta
eor x26,x0,x5
stp x4,x9,[sp,#0] // offload pair...
eor x27,x1,x6
eor x28,x2,x7
eor x30,x3,x8
eor x4,x4,x9
eor x26,x26,x10
eor x27,x27,x11
eor x28,x28,x12
eor x30,x30,x13
eor x4,x4,x14
eor x26,x26,x15
eor x27,x27,x16
eor x28,x28,x17
eor x30,x30,x25
eor x4,x4,x19
eor x26,x26,x20
eor x28,x28,x22
eor x27,x27,x21
eor x30,x30,x23
eor x4,x4,x24
eor x9,x26,x28,ror#63
eor x1,x1,x9
eor x6,x6,x9
eor x11,x11,x9
eor x16,x16,x9
eor x21,x21,x9
eor x9,x27,x30,ror#63
eor x28,x28,x4,ror#63
eor x30,x30,x26,ror#63
eor x4,x4,x27,ror#63
eor x27, x2,x9 // mov x27,x2
eor x7,x7,x9
eor x12,x12,x9
eor x17,x17,x9
eor x22,x22,x9
eor x0,x0,x4
eor x5,x5,x4
eor x10,x10,x4
eor x15,x15,x4
eor x20,x20,x4
ldp x4,x9,[sp,#0] // re-load offloaded data
eor x26, x3,x28 // mov x26,x3
eor x8,x8,x28
eor x13,x13,x28
eor x25,x25,x28
eor x23,x23,x28
eor x28, x4,x30 // mov x28,x4
eor x9,x9,x30
eor x14,x14,x30
eor x19,x19,x30
eor x24,x24,x30
////////////////////////////////////////// Rho+Pi
mov x30,x1
ror x1,x6,#20
//mov x27,x2
ror x2,x12,#21
//mov x26,x3
ror x3,x25,#43
//mov x28,x4
ror x4,x24,#50
ror x6,x9,#44
ror x12,x13,#39
ror x25,x17,#49
ror x24,x21,#62
ror x9,x22,#3
ror x13,x19,#56
ror x17,x11,#54
ror x21,x8,#9
ror x22,x14,#25
ror x19,x23,#8
ror x11,x7,#58
ror x8,x16,#19
ror x14,x20,#46
ror x23,x15,#23
ror x7,x10,#61
ror x16,x5,#28
ror x5,x26,#36
ror x10,x30,#63
ror x15,x28,#37
ror x20,x27,#2
////////////////////////////////////////// Chi+Iota
bic x26,x2,x1
bic x27,x3,x2
bic x28,x0,x4
bic x30,x1,x0
eor x0,x0,x26
bic x26,x4,x3
eor x1,x1,x27
ldr x27,[sp,#16]
eor x3,x3,x28
eor x4,x4,x30
eor x2,x2,x26
ldr x30,[x27],#8 // Iota[i++]
bic x26,x7,x6
tst x27,#255 // are we done?
str x27,[sp,#16]
bic x27,x8,x7
bic x28,x5,x9
eor x0,x0,x30 // A[0][0] ^= Iota
bic x30,x6,x5
eor x5,x5,x26
bic x26,x9,x8
eor x6,x6,x27
eor x8,x8,x28
eor x9,x9,x30
eor x7,x7,x26
bic x26,x12,x11
bic x27,x13,x12
bic x28,x10,x14
bic x30,x11,x10
eor x10,x10,x26
bic x26,x14,x13
eor x11,x11,x27
eor x13,x13,x28
eor x14,x14,x30
eor x12,x12,x26
bic x26,x17,x16
bic x27,x25,x17
bic x28,x15,x19
bic x30,x16,x15
eor x15,x15,x26
bic x26,x19,x25
eor x16,x16,x27
eor x25,x25,x28
eor x19,x19,x30
eor x17,x17,x26
bic x26,x22,x21
bic x27,x23,x22
bic x28,x20,x24
bic x30,x21,x20
eor x20,x20,x26
bic x26,x24,x23
eor x21,x21,x27
eor x23,x23,x28
eor x24,x24,x30
eor x22,x22,x26
bne .Loop
ldr x30,[sp,#24]
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_int,.-KeccakF1600_int
.globl KeccakF1600_hw
.hidden KeccakF1600_hw
.type KeccakF1600_hw,%function
.align 5
KeccakF1600_hw:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48
str x0,[sp,#32] // offload argument
mov x26,x0
ldp x0,x1,[x0,#16*0]
ldp x2,x3,[x26,#16*1]
ldp x4,x5,[x26,#16*2]
ldp x6,x7,[x26,#16*3]
ldp x8,x9,[x26,#16*4]
ldp x10,x11,[x26,#16*5]
ldp x12,x13,[x26,#16*6]
ldp x14,x15,[x26,#16*7]
ldp x16,x17,[x26,#16*8]
ldp x25,x19,[x26,#16*9]
ldp x20,x21,[x26,#16*10]
ldp x22,x23,[x26,#16*11]
ldr x24,[x26,#16*12]
bl KeccakF1600_int
ldr x26,[sp,#32]
stp x0,x1,[x26,#16*0]
stp x2,x3,[x26,#16*1]
stp x4,x5,[x26,#16*2]
stp x6,x7,[x26,#16*3]
stp x8,x9,[x26,#16*4]
stp x10,x11,[x26,#16*5]
stp x12,x13,[x26,#16*6]
stp x14,x15,[x26,#16*7]
stp x16,x17,[x26,#16*8]
stp x25,x19,[x26,#16*9]
stp x20,x21,[x26,#16*10]
stp x22,x23,[x26,#16*11]
str x24,[x26,#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_hw,.-KeccakF1600_hw
.byte 75,101,99,99,97,107,45,49,54,48,48,32,112,101,114,109,117,116,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
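
The scalar Keccak code above labels its round phases Theta, Rho+Pi, and
Chi+Iota. For orientation, here is a compact C sketch of just the theta step
on the 5x5 lane state, the part computed by the first block of eor
instructions in .Loop; the flat A[x + 5*y] indexing and the helper name are
illustrative and do not mirror the register assignment used above.

#include <stdint.h>

#define ROTL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))

// Theta step of Keccak-f[1600]: each lane absorbs the parity of two columns.
static void keccak_theta(uint64_t A[25]) {
  uint64_t C[5], D[5];
  for (int x = 0; x < 5; x++) {
    C[x] = A[x] ^ A[x + 5] ^ A[x + 10] ^ A[x + 15] ^ A[x + 20];
  }
  for (int x = 0; x < 5; x++) {
    D[x] = C[(x + 4) % 5] ^ ROTL64(C[(x + 1) % 5], 1);
  }
  for (int i = 0; i < 25; i++) {
    A[i] ^= D[i % 5];
  }
}

In the assembly, the rho rotations and pi permutation appear as the block of
ror instructions with hard-coded rotate amounts, and chi as the bic/eor pairs,
with the round constant from iotas_hw folded into A[0][0].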


@@ -0,0 +1,687 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
.text
.globl md5_block_asm_data_order
.hidden md5_block_asm_data_order
.type md5_block_asm_data_order,@function
md5_block_asm_data_order:
.cfi_startproc
// Save all callee-saved registers
stp x19,x20,[sp,#-80]!
.cfi_def_cfa_offset 80
.cfi_offset x19, -80
.cfi_offset x20, -72
stp x21,x22,[sp,#16]
.cfi_offset x21, -64
.cfi_offset x22, -56
stp x23,x24,[sp,#32]
.cfi_offset x23, -48
.cfi_offset x24, -40
stp x25,x26,[sp,#48]
.cfi_offset x25, -32
.cfi_offset x26, -24
stp x27,x28,[sp,#64]
.cfi_offset x27, -16
.cfi_offset x28, -8
ldp w10, w11, [x0, #0] // .Load MD5 state->A and state->B
ldp w12, w13, [x0, #8] // .Load MD5 state->C and state->D
.align 5
.Lmd5_blocks_loop:
eor x17, x12, x13 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
and x16, x17, x11 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
ldp x15, x3, [x1] // .Load 4 words of input data0 M[0]/0
eor x14, x16, x13 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xa478 // .Load lower half of constant 0xd76aa478
movk x9, #0xd76a, lsl #16 // .Load upper half of constant 0xd76aa478
add w8, w10, w15 // Add dest value
add w7, w8, w9 // Add constant 0xd76aa478
add w6, w7, w14 // Add aux function result
ror w6, w6, #25 // Rotate left s=7 bits
eor x5, x11, x12 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w11, w6 // Add X parameter round 1 A=FF(A, B, C, D, 0xd76aa478, s=7, M[0])
and x8, x5, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x17, x8, x12 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xb756 // .Load lower half of constant 0xe8c7b756
movk x16, #0xe8c7, lsl #16 // .Load upper half of constant 0xe8c7b756
lsr x20, x15, #32 // Right shift high input value containing M[1]
add w9, w13, w20 // Add dest value
add w7, w9, w16 // Add constant 0xe8c7b756
add w14, w7, w17 // Add aux function result
ror w14, w14, #20 // Rotate left s=12 bits
eor x6, x4, x11 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w5, w4, w14 // Add X parameter round 1 D=FF(D, A, B, C, 0xe8c7b756, s=12, M[1])
and x8, x6, x5 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x8, x11 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0x70db // .Load lower half of constant 0x242070db
movk x16, #0x2420, lsl #16 // .Load upper half of constant 0x242070db
add w7, w12, w3 // Add dest value
add w17, w7, w16 // Add constant 0x242070db
add w14, w17, w9 // Add aux function result
ror w14, w14, #15 // Rotate left s=17 bits
eor x6, x5, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w5, w14 // Add X parameter round 1 C=FF(C, D, A, B, 0x242070db, s=17, M[2])
and x7, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x16, x7, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0xceee // .Load lower half of constant 0xc1bdceee
movk x9, #0xc1bd, lsl #16 // .Load upper half of constant 0xc1bdceee
lsr x21, x3, #32 // Right shift high input value containing M[3]
add w14, w11, w21 // Add dest value
add w6, w14, w9 // Add constant 0xc1bdceee
add w7, w6, w16 // Add aux function result
ror w7, w7, #10 // Rotate left s=22 bits
eor x17, x8, x5 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w7 // Add X parameter round 1 B=FF(B, C, D, A, 0xc1bdceee, s=22, M[3])
ldp x14, x7, [x1, #16] // .Load 4 words of input data0 M[4]/0w
and x16, x17, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x5 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x16, #0xfaf // .Load lower half of constant 0xf57c0faf
movk x16, #0xf57c, lsl #16 // .Load upper half of constant 0xf57c0faf
add w17, w4, w14 // Add dest value
add w16, w17, w16 // Add constant 0xf57c0faf
add w4, w16, w6 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x16, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0xf57c0faf, s=7, M[4])
and x16, x16, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x16, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0xc62a // .Load lower half of constant 0x4787c62a
movk x4, #0x4787, lsl #16 // .Load upper half of constant 0x4787c62a
lsr x22, x14, #32 // Right shift high input value containing M[5]
add w16, w5, w22 // Add dest value
add w16, w16, w4 // Add constant 0x4787c62a
add w5, w16, w6 // Add aux function result
ror w5, w5, #20 // Rotate left s=12 bits
eor x4, x17, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w19, w17, w5 // Add X parameter round 1 D=FF(D, A, B, C, 0x4787c62a, s=12, M[5])
and x6, x4, x19 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x5, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x4, #0x4613 // .Load lower half of constant 0xa8304613
movk x4, #0xa830, lsl #16 // .Load upper half of constant 0xa8304613
add w6, w8, w7 // Add dest value
add w8, w6, w4 // Add constant 0xa8304613
add w4, w8, w5 // Add aux function result
ror w4, w4, #15 // Rotate left s=17 bits
eor x6, x19, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w19, w4 // Add X parameter round 1 C=FF(C, D, A, B, 0xa8304613, s=17, M[6])
and x5, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x5, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x6, #0x9501 // .Load lower half of constant 0xfd469501
movk x6, #0xfd46, lsl #16 // .Load upper half of constant 0xfd469501
lsr x23, x7, #32 // Right shift high input value containing M[7]
add w9, w9, w23 // Add dest value
add w5, w9, w6 // Add constant 0xfd469501
add w9, w5, w4 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x6, x8, x19 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0xfd469501, s=22, M[7])
ldp x5, x16, [x1, #32] // .Load 4 words of input data: M[8]..M[11]
and x9, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x9, x19 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x9, #0x98d8 // .Load lower half of constant 0x698098d8
movk x9, #0x6980, lsl #16 // .Load upper half of constant 0x698098d8
add w17, w17, w5 // Add dest value
add w9, w17, w9 // Add constant 0x698098d8
add w17, w9, w6 // Add aux function result
ror w17, w17, #25 // Rotate left s=7 bits
eor x9, x4, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w6, w4, w17 // Add X parameter round 1 A=FF(A, B, C, D, 0x698098d8, s=7, M[8])
and x17, x9, x6 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x17, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x17, #0xf7af // .Load lower half of constant 0x8b44f7af
movk x17, #0x8b44, lsl #16 // .Load upper half of constant 0x8b44f7af
lsr x24, x5, #32 // Right shift high input value containing M[9]
add w19, w19, w24 // Add dest value
add w17, w19, w17 // Add constant 0x8b44f7af
add w19, w17, w9 // Add aux function result
ror w19, w19, #20 // Rotate left s=12 bits
eor x9, x6, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w6, w19 // Add X parameter round 1 D=FF(D, A, B, C, 0x8b44f7af, s=12, M[9])
and x9, x9, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0x5bb1 // .Load lower half of constant 0xffff5bb1
movk x11, #0xffff, lsl #16 // .Load upper half of constant 0xffff5bb1
add w8, w8, w16 // Add dest value
add w8, w8, w11 // Add constant 0xffff5bb1
add w8, w8, w9 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x9, x17, x6 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xffff5bb1, s=17, M[10])
and x9, x9, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x9, x9, x6 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x11, #0xd7be // .Load lower half of constant 0x895cd7be
movk x11, #0x895c, lsl #16 // .Load upper half of constant 0x895cd7be
lsr x25, x16, #32 // Right shift high input value containing M[11]
add w4, w4, w25 // Add dest value
add w4, w4, w11 // Add constant 0x895cd7be
add w9, w4, w9 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
eor x4, x8, x17 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x895cd7be, s=22, M[11])
ldp x11, x12, [x1, #48] // .Load 4 words of input data: M[12]..M[15]
and x4, x4, x9 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x4, x4, x17 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x1122 // .Load lower half of constant 0x6b901122
movk x19, #0x6b90, lsl #16 // .Load upper half of constant 0x6b901122
add w6, w6, w11 // Add dest value
add w6, w6, w19 // Add constant 0x6b901122
add w4, w6, w4 // Add aux function result
ror w4, w4, #25 // Rotate left s=7 bits
eor x6, x9, x8 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w4, w9, w4 // Add X parameter round 1 A=FF(A, B, C, D, 0x6b901122, s=7, M[12])
and x6, x6, x4 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x8 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x19, #0x7193 // .Load lower half of constant 0xfd987193
movk x19, #0xfd98, lsl #16 // .Load upper half of constant 0xfd987193
lsr x26, x11, #32 // Right shift high input value containing M[13]
add w17, w17, w26 // Add dest value
add w17, w17, w19 // Add constant 0xfd987193
add w17, w17, w6 // Add aux function result
ror w17, w17, #20 // Rotate left s=12 bits
eor x6, x4, x9 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w17, w4, w17 // Add X parameter round 1 D=FF(D, A, B, C, 0xfd987193, s=12, M[13])
and x6, x6, x17 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x9 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x438e // .Load lower half of constant 0xa679438e
movk x13, #0xa679, lsl #16 // .Load upper half of constant 0xa679438e
add w8, w8, w12 // Add dest value
add w8, w8, w13 // Add constant 0xa679438e
add w8, w8, w6 // Add aux function result
ror w8, w8, #15 // Rotate left s=17 bits
eor x6, x17, x4 // Begin aux function round 1 F(x,y,z)=(((y^z)&x)^z)
add w8, w17, w8 // Add X parameter round 1 C=FF(C, D, A, B, 0xa679438e, s=17, M[14])
and x6, x6, x8 // Continue aux function round 1 F(x,y,z)=(((y^z)&x)^z)
eor x6, x6, x4 // End aux function round 1 F(x,y,z)=(((y^z)&x)^z)
movz x13, #0x821 // .Load lower half of constant 0x49b40821
movk x13, #0x49b4, lsl #16 // .Load upper half of constant 0x49b40821
lsr x27, x12, #32 // Right shift high input value containing M[15]
add w9, w9, w27 // Add dest value
add w9, w9, w13 // Add constant 0x49b40821
add w9, w9, w6 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
movz x13, #0x2562 // .Load lower half of constant 0xf61e2562
movk x13, #0xf61e, lsl #16 // .Load upper half of constant 0xf61e2562
add w4, w4, w20 // Add dest value
add w4, w4, w13 // Add constant 0xf61e2562
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
movz x13, #0xb340 // .Load lower half of constant 0xc040b340
movk x13, #0xc040, lsl #16 // .Load upper half of constant 0xc040b340
add w17, w17, w7 // Add dest value
add w17, w17, w13 // Add constant 0xc040b340
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
movz x13, #0x5a51 // .Load lower half of constant 0x265e5a51
movk x13, #0x265e, lsl #16 // .Load upper half of constant 0x265e5a51
add w8, w8, w25 // Add dest value
add w8, w8, w13 // Add constant 0x265e5a51
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
movz x13, #0xc7aa // .Load lower half of constant 0xe9b6c7aa
movk x13, #0xe9b6, lsl #16 // .Load upper half of constant 0xe9b6c7aa
add w9, w9, w15 // Add dest value
add w9, w9, w13 // Add constant 0xe9b6c7aa
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
movz x13, #0x105d // .Load lower half of constant 0xd62f105d
movk x13, #0xd62f, lsl #16 // .Load upper half of constant 0xd62f105d
add w4, w4, w22 // Add dest value
add w4, w4, w13 // Add constant 0xd62f105d
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
movz x13, #0x1453 // .Load lower half of constant 0x2441453
movk x13, #0x244, lsl #16 // .Load upper half of constant 0x2441453
add w17, w17, w16 // Add dest value
add w17, w17, w13 // Add constant 0x2441453
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
movz x13, #0xe681 // .Load lower half of constant 0xd8a1e681
movk x13, #0xd8a1, lsl #16 // .Load upper half of constant 0xd8a1e681
add w8, w8, w27 // Add dest value
add w8, w8, w13 // Add constant 0xd8a1e681
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
movz x13, #0xfbc8 // .Load lower half of constant 0xe7d3fbc8
movk x13, #0xe7d3, lsl #16 // .Load upper half of constant 0xe7d3fbc8
add w9, w9, w14 // Add dest value
add w9, w9, w13 // Add constant 0xe7d3fbc8
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
movz x13, #0xcde6 // .Load lower half of constant 0x21e1cde6
movk x13, #0x21e1, lsl #16 // .Load upper half of constant 0x21e1cde6
add w4, w4, w24 // Add dest value
add w4, w4, w13 // Add constant 0x21e1cde6
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
movz x13, #0x7d6 // .Load lower half of constant 0xc33707d6
movk x13, #0xc337, lsl #16 // .Load upper half of constant 0xc33707d6
add w17, w17, w12 // Add dest value
add w17, w17, w13 // Add constant 0xc33707d6
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
movz x13, #0xd87 // .Load lower half of constant 0xf4d50d87
movk x13, #0xf4d5, lsl #16 // .Load upper half of constant 0xf4d50d87
add w8, w8, w21 // Add dest value
add w8, w8, w13 // Add constant 0xf4d50d87
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
movz x13, #0x14ed // .Load lower half of constant 0x455a14ed
movk x13, #0x455a, lsl #16 // .Load upper half of constant 0x455a14ed
add w9, w9, w5 // Add dest value
add w9, w9, w13 // Add constant 0x455a14ed
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
movz x13, #0xe905 // .Load lower half of constant 0xa9e3e905
movk x13, #0xa9e3, lsl #16 // .Load upper half of constant 0xa9e3e905
add w4, w4, w26 // Add dest value
add w4, w4, w13 // Add constant 0xa9e3e905
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
movz x13, #0xa3f8 // .Load lower half of constant 0xfcefa3f8
movk x13, #0xfcef, lsl #16 // .Load upper half of constant 0xfcefa3f8
add w17, w17, w3 // Add dest value
add w17, w17, w13 // Add constant 0xfcefa3f8
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
movz x13, #0x2d9 // .Load lower half of constant 0x676f02d9
movk x13, #0x676f, lsl #16 // .Load upper half of constant 0x676f02d9
add w8, w8, w23 // Add dest value
add w8, w8, w13 // Add constant 0x676f02d9
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
movz x13, #0x4c8a // .Load lower half of constant 0x8d2a4c8a
movk x13, #0x8d2a, lsl #16 // .Load upper half of constant 0x8d2a4c8a
add w9, w9, w11 // Add dest value
add w9, w9, w13 // Add constant 0x8d2a4c8a
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #12 // Rotate left s=20 bits
movz x10, #0x3942 // .Load lower half of constant 0xfffa3942
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x8d2a4c8a, s=20, M[12])
movk x10, #0xfffa, lsl #16 // .Load upper half of constant 0xfffa3942
add w4, w4, w22 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xfffa3942
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xf681 // .Load lower half of constant 0x8771f681
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xfffa3942, s=4, M[5])
movk x10, #0x8771, lsl #16 // .Load upper half of constant 0x8771f681
add w17, w17, w5 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x8771f681
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x6122 // .Load lower half of constant 0x6d9d6122
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x8771f681, s=11, M[8])
movk x13, #0x6d9d, lsl #16 // .Load upper half of constant 0x6d9d6122
add w8, w8, w25 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x6d9d6122
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x380c // .Load lower half of constant 0xfde5380c
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x6d9d6122, s=16, M[11])
movk x13, #0xfde5, lsl #16 // .Load upper half of constant 0xfde5380c
add w9, w9, w12 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xfde5380c
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xea44 // .Load lower half of constant 0xa4beea44
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xfde5380c, s=23, M[14])
movk x10, #0xa4be, lsl #16 // .Load upper half of constant 0xa4beea44
add w4, w4, w20 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xa4beea44
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0xcfa9 // .Load lower half of constant 0x4bdecfa9
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xa4beea44, s=4, M[1])
movk x10, #0x4bde, lsl #16 // .Load upper half of constant 0x4bdecfa9
add w17, w17, w14 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0x4bdecfa9
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x4b60 // .Load lower half of constant 0xf6bb4b60
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0x4bdecfa9, s=11, M[4])
movk x13, #0xf6bb, lsl #16 // .Load upper half of constant 0xf6bb4b60
add w8, w8, w23 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xf6bb4b60
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0xbc70 // .Load lower half of constant 0xbebfbc70
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xf6bb4b60, s=16, M[7])
movk x13, #0xbebf, lsl #16 // .Load upper half of constant 0xbebfbc70
add w9, w9, w16 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xbebfbc70
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0x7ec6 // .Load lower half of constant 0x289b7ec6
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xbebfbc70, s=23, M[10])
movk x10, #0x289b, lsl #16 // .Load upper half of constant 0x289b7ec6
add w4, w4, w26 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0x289b7ec6
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x27fa // .Load lower half of constant 0xeaa127fa
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0x289b7ec6, s=4, M[13])
movk x10, #0xeaa1, lsl #16 // .Load upper half of constant 0xeaa127fa
add w17, w17, w15 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xeaa127fa
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x3085 // .Load lower half of constant 0xd4ef3085
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xeaa127fa, s=11, M[0])
movk x13, #0xd4ef, lsl #16 // .Load upper half of constant 0xd4ef3085
add w8, w8, w21 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0xd4ef3085
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x1d05 // .Load lower half of constant 0x4881d05
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0xd4ef3085, s=16, M[3])
movk x13, #0x488, lsl #16 // .Load upper half of constant 0x4881d05
add w9, w9, w7 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0x4881d05
add w9, w9, w6 // Add aux function result
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #9 // Rotate left s=23 bits
movz x10, #0xd039 // .Load lower half of constant 0xd9d4d039
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0x4881d05, s=23, M[6])
movk x10, #0xd9d4, lsl #16 // .Load upper half of constant 0xd9d4d039
add w4, w4, w24 // Add dest value
eor x6, x6, x9 // End aux function round 3 H(x,y,z)=(x^y^z)
add w4, w4, w10 // Add constant 0xd9d4d039
add w4, w4, w6 // Add aux function result
ror w4, w4, #28 // Rotate left s=4 bits
eor x6, x9, x8 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x10, #0x99e5 // .Load lower half of constant 0xe6db99e5
add w4, w9, w4 // Add X parameter round 3 A=HH(A, B, C, D, 0xd9d4d039, s=4, M[9])
movk x10, #0xe6db, lsl #16 // .Load upper half of constant 0xe6db99e5
add w17, w17, w11 // Add dest value
eor x6, x6, x4 // End aux function round 3 H(x,y,z)=(x^y^z)
add w17, w17, w10 // Add constant 0xe6db99e5
add w17, w17, w6 // Add aux function result
eor x6, x4, x9 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w17, w17, #21 // Rotate left s=11 bits
movz x13, #0x7cf8 // .Load lower half of constant 0x1fa27cf8
add w17, w4, w17 // Add X parameter round 3 D=HH(D, A, B, C, 0xe6db99e5, s=11, M[12])
movk x13, #0x1fa2, lsl #16 // .Load upper half of constant 0x1fa27cf8
add w8, w8, w27 // Add dest value
eor x6, x6, x17 // End aux function round 3 H(x,y,z)=(x^y^z)
add w8, w8, w13 // Add constant 0x1fa27cf8
add w8, w8, w6 // Add aux function result
ror w8, w8, #16 // Rotate left s=16 bits
eor x6, x17, x4 // Begin aux function round 3 H(x,y,z)=(x^y^z)
movz x13, #0x5665 // .Load lower half of constant 0xc4ac5665
add w8, w17, w8 // Add X parameter round 3 C=HH(C, D, A, B, 0x1fa27cf8, s=16, M[15])
movk x13, #0xc4ac, lsl #16 // .Load upper half of constant 0xc4ac5665
add w9, w9, w3 // Add dest value
eor x6, x6, x8 // End aux function round 3 H(x,y,z)=(x^y^z)
add w9, w9, w13 // Add constant 0xc4ac5665
add w9, w9, w6 // Add aux function result
ror w9, w9, #9 // Rotate left s=23 bits
movz x6, #0x2244 // .Load lower half of constant 0xf4292244
movk x6, #0xf429, lsl #16 // .Load upper half of constant 0xf4292244
add w9, w8, w9 // Add X parameter round 3 B=HH(B, C, D, A, 0xc4ac5665, s=23, M[2])
add w4, w4, w15 // Add dest value
orn x13, x9, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add constant 0xf4292244
eor x6, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w4, w6 // Add aux function result
ror w4, w4, #26 // Rotate left s=6 bits
movz x6, #0xff97 // .Load lower half of constant 0x432aff97
movk x6, #0x432a, lsl #16 // .Load upper half of constant 0x432aff97
add w4, w9, w4 // Add X parameter round 4 A=II(A, B, C, D, 0xf4292244, s=6, M[0])
orn x10, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w23 // Add dest value
eor x10, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w17, w6 // Add constant 0x432aff97
add w6, w17, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x17, #0x23a7 // .Load lower half of constant 0xab9423a7
movk x17, #0xab94, lsl #16 // .Load upper half of constant 0xab9423a7
add w6, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x432aff97, s=10, M[7])
add w8, w8, w12 // Add dest value
orn x10, x6, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add constant 0xab9423a7
eor x17, x4, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w17 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x17, #0xa039 // .Load lower half of constant 0xfc93a039
movk x17, #0xfc93, lsl #16 // .Load upper half of constant 0xfc93a039
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xab9423a7, s=15, M[14])
orn x13, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w22 // Add dest value
eor x13, x6, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w17 // Add constant 0xfc93a039
add w17, w9, w13 // Add aux function result
ror w17, w17, #11 // Rotate left s=21 bits
movz x9, #0x59c3 // .Load lower half of constant 0x655b59c3
movk x9, #0x655b, lsl #16 // .Load upper half of constant 0x655b59c3
add w17, w8, w17 // Add X parameter round 4 B=II(B, C, D, A, 0xfc93a039, s=21, M[5])
add w4, w4, w11 // Add dest value
orn x13, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w4, w9 // Add constant 0x655b59c3
eor x4, x8, x13 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w9, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x4, #0xcc92 // .Load lower half of constant 0x8f0ccc92
movk x4, #0x8f0c, lsl #16 // .Load upper half of constant 0x8f0ccc92
add w9, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x655b59c3, s=6, M[12])
orn x10, x9, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w21 // Add dest value
eor x10, x17, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w6, w4 // Add constant 0x8f0ccc92
add w6, w4, w10 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x4, #0xf47d // .Load lower half of constant 0xffeff47d
movk x4, #0xffef, lsl #16 // .Load upper half of constant 0xffeff47d
add w6, w9, w6 // Add X parameter round 4 D=II(D, A, B, C, 0x8f0ccc92, s=10, M[3])
add w8, w8, w16 // Add dest value
orn x10, x6, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add constant 0xffeff47d
eor x4, x9, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w8, w4 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x4, #0x5dd1 // .Load lower half of constant 0x85845dd1
movk x4, #0x8584, lsl #16 // .Load upper half of constant 0x85845dd1
add w8, w6, w8 // Add X parameter round 4 C=II(C, D, A, B, 0xffeff47d, s=15, M[10])
orn x10, x8, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w17, w20 // Add dest value
eor x17, x6, x10 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w15, w4 // Add constant 0x85845dd1
add w4, w15, w17 // Add aux function result
ror w4, w4, #11 // Rotate left s=21 bits
movz x15, #0x7e4f // .Load lower half of constant 0x6fa87e4f
movk x15, #0x6fa8, lsl #16 // .Load upper half of constant 0x6fa87e4f
add w17, w8, w4 // Add X parameter round 4 B=II(B, C, D, A, 0x85845dd1, s=21, M[1])
add w4, w9, w5 // Add dest value
orn x9, x17, x6 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w15 // Add constant 0x6fa87e4f
eor x4, x8, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w15, w4 // Add aux function result
ror w9, w9, #26 // Rotate left s=6 bits
movz x15, #0xe6e0 // .Load lower half of constant 0xfe2ce6e0
movk x15, #0xfe2c, lsl #16 // .Load upper half of constant 0xfe2ce6e0
add w4, w17, w9 // Add X parameter round 4 A=II(A, B, C, D, 0x6fa87e4f, s=6, M[8])
orn x9, x4, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w6, w27 // Add dest value
eor x9, x17, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w6, w15 // Add constant 0xfe2ce6e0
add w6, w15, w9 // Add aux function result
ror w6, w6, #22 // Rotate left s=10 bits
movz x9, #0x4314 // .Load lower half of constant 0xa3014314
movk x9, #0xa301, lsl #16 // .Load upper half of constant 0xa3014314
add w15, w4, w6 // Add X parameter round 4 D=II(D, A, B, C, 0xfe2ce6e0, s=10, M[15])
add w6, w8, w7 // Add dest value
orn x7, x15, x17 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w6, w9 // Add constant 0xa3014314
eor x9, x4, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w8, w9 // Add aux function result
ror w6, w6, #17 // Rotate left s=15 bits
movz x7, #0x11a1 // .Load lower half of constant 0x4e0811a1
movk x7, #0x4e08, lsl #16 // .Load upper half of constant 0x4e0811a1
add w8, w15, w6 // Add X parameter round 4 C=II(C, D, A, B, 0xa3014314, s=15, M[6])
orn x9, x8, x4 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w6, w17, w26 // Add dest value
eor x17, x15, x9 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w9, w6, w7 // Add constant 0x4e0811a1
add w7, w9, w17 // Add aux function result
ror w7, w7, #11 // Rotate left s=21 bits
movz x6, #0x7e82 // .Load lower half of constant 0xf7537e82
movk x6, #0xf753, lsl #16 // .Load upper half of constant 0xf7537e82
add w9, w8, w7 // Add X parameter round 4 B=II(B, C, D, A, 0x4e0811a1, s=21, M[13])
add w17, w4, w14 // Add dest value
orn x7, x9, x15 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w14, w17, w6 // Add constant 0xf7537e82
eor x4, x8, x7 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w14, w4 // Add aux function result
ror w17, w17, #26 // Rotate left s=6 bits
movz x6, #0xf235 // .Load lower half of constant 0xbd3af235
movk x6, #0xbd3a, lsl #16 // .Load upper half of constant 0xbd3af235
add w7, w9, w17 // Add X parameter round 4 A=II(A, B, C, D, 0xf7537e82, s=6, M[4])
orn x14, x7, x8 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w4, w15, w25 // Add dest value
eor x17, x9, x14 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w4, w6 // Add constant 0xbd3af235
add w16, w15, w17 // Add aux function result
ror w16, w16, #22 // Rotate left s=10 bits
movz x14, #0xd2bb // .Load lower half of constant 0x2ad7d2bb
movk x14, #0x2ad7, lsl #16 // .Load upper half of constant 0x2ad7d2bb
add w4, w7, w16 // Add X parameter round 4 D=II(D, A, B, C, 0xbd3af235, s=10, M[11])
add w6, w8, w3 // Add dest value
orn x15, x4, x9 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w17, w6, w14 // Add constant 0x2ad7d2bb
eor x16, x7, x15 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w8, w17, w16 // Add aux function result
ror w8, w8, #17 // Rotate left s=15 bits
movz x3, #0xd391 // .Load lower half of constant 0xeb86d391
movk x3, #0xeb86, lsl #16 // .Load upper half of constant 0xeb86d391
add w14, w4, w8 // Add X parameter round 4 C=II(C, D, A, B, 0x2ad7d2bb, s=15, M[2])
orn x6, x14, x7 // Begin aux function round 4 I(x,y,z)=((~z|x)^y)
add w15, w9, w24 // Add dest value
eor x17, x4, x6 // End aux function round 4 I(x,y,z)=((~z|x)^y)
add w16, w15, w3 // Add constant 0xeb86d391
add w8, w16, w17 // Add aux function result
ror w8, w8, #11 // Rotate left s=21 bits
ldp w6, w15, [x0] // Reload MD5 state->A and state->B
ldp w5, w9, [x0, #8] // Reload MD5 state->C and state->D
add w3, w14, w8 // Add X parameter round 4 B=II(B, C, D, A, 0xeb86d391, s=21, M[9])
add w13, w4, w9 // Add result of MD5 rounds to state->D
add w12, w14, w5 // Add result of MD5 rounds to state->C
add w10, w7, w6 // Add result of MD5 rounds to state->A
add w11, w3, w15 // Add result of MD5 rounds to state->B
stp w12, w13, [x0, #8] // Store MD5 states C,D
stp w10, w11, [x0] // Store MD5 states A,B
add x1, x1, #64 // Increment data pointer
subs w2, w2, #1 // Decrement block counter
b.ne .Lmd5_blocks_loop
ldp x21,x22,[sp,#16]
.cfi_restore x21
.cfi_restore x22
ldp x23,x24,[sp,#32]
.cfi_restore x23
.cfi_restore x24
ldp x25,x26,[sp,#48]
.cfi_restore x25
.cfi_restore x26
ldp x27,x28,[sp,#64]
.cfi_restore x27
.cfi_restore x28
ldp x19,x20,[sp],#80
.cfi_restore x19
.cfi_restore x20
.cfi_def_cfa_offset 0
ret
.cfi_endproc
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
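For orientation, the unrolled rounds above compute the standard MD5 auxiliary functions and step formula. The following C sketch is illustrative only (the helper names are made up and it is not taken from this file or from any C source in the tree); it shows the four auxiliary functions in the same algebraic form the assembly uses, plus the per-step shape a = b + rotl(a + aux(b,c,d) + m + k, s). The assembly's "ror #(32-s)" performs the same rotation as a left rotate by s.

#include <stdint.h>

static uint32_t rotl32(uint32_t v, int s) { return (v << s) | (v >> (32 - s)); }

// Round 1: F(x,y,z) = ((y ^ z) & x) ^ z, the form used above; equal to (x & y) | (~x & z).
static uint32_t md5_f(uint32_t x, uint32_t y, uint32_t z) { return ((y ^ z) & x) ^ z; }
// Round 2: G(x,y,z) = (x & z) | (y & ~z); the assembly builds the two halves with and/bic.
static uint32_t md5_g(uint32_t x, uint32_t y, uint32_t z) { return (x & z) | (y & ~z); }
// Round 3: H(x,y,z) = x ^ y ^ z.
static uint32_t md5_h(uint32_t x, uint32_t y, uint32_t z) { return x ^ y ^ z; }
// Round 4: I(x,y,z) = y ^ (x | ~z); the assembly uses orn followed by eor.
static uint32_t md5_i(uint32_t x, uint32_t y, uint32_t z) { return y ^ (x | ~z); }

// One step (FF/GG/HH/II differ only in the auxiliary function passed in):
//   a = b + rotl(a + aux(b, c, d) + m + k, s)
static uint32_t md5_step(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                         uint32_t m, uint32_t k, int s,
                         uint32_t (*aux)(uint32_t, uint32_t, uint32_t)) {
  return b + rotl32(a + aux(b, c, d) + m + k, s);
}

For example, the first instruction group in the block above corresponds to A = md5_step(A, B, C, D, M[0], 0xd76aa478, 7, md5_f).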

File diff suppressed because it is too large


@@ -0,0 +1,309 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include "openssl/arm_arch.h"
.text
.globl beeu_mod_inverse_vartime
.hidden beeu_mod_inverse_vartime
.type beeu_mod_inverse_vartime, %function
.align 4
beeu_mod_inverse_vartime:
// Reserve enough space for 14 8-byte registers on the stack
// in the first stp call for x29, x30.
// Then store the remaining callee-saved registers.
//
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
//  ^                                                   ^
//  sp <------------------ 112 bytes ------------------> old sp
//  x29 (FP)
//
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-112]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x2,[sp,#96]
// B = b3..b0 := a
ldp x25,x26,[x1]
ldp x27,x28,[x1,#16]
// n3..n0 := n
// Note: the values of the input parameters are modified below.
ldp x0,x1,[x2]
ldp x2,x30,[x2,#16]
// A = a3..a0 := n
mov x21, x0
mov x22, x1
mov x23, x2
mov x24, x30
// X = x4..x0 := 1
mov x3, #1
eor x4, x4, x4
eor x5, x5, x5
eor x6, x6, x6
eor x7, x7, x7
// Y = y4..y0 := 0
eor x8, x8, x8
eor x9, x9, x9
eor x10, x10, x10
eor x11, x11, x11
eor x12, x12, x12
.Lbeeu_loop:
// if B == 0, jump to .Lbeeu_loop_end
orr x14, x25, x26
orr x14, x14, x27
// reverse the bit order of x25. This is needed for clz after this macro
rbit x15, x25
orr x14, x14, x28
cbz x14,.Lbeeu_loop_end
// 0 < B < |n|,
// 0 < A <= |n|,
// (1) X*a == B (mod |n|),
// (2) (-1)*Y*a == A (mod |n|)
// Now divide B by the maximum possible power of two in the
// integers, and divide X by the same value mod |n|.
// When we're done, (1) still holds.
// shift := number of trailing 0s in x25
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
clz x13, x15
// If there is no shift, goto shift_A_Y
cbz x13, .Lbeeu_shift_A_Y
// Shift B right by "x13" bits
neg x14, x13
lsr x25, x25, x13
lsl x15, x26, x14
lsr x26, x26, x13
lsl x19, x27, x14
orr x25, x25, x15
lsr x27, x27, x13
lsl x20, x28, x14
orr x26, x26, x19
lsr x28, x28, x13
orr x27, x27, x20
// Shift X right by "x13" bits, adding n whenever X becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_X:
tbz x3, #0, .Lshift1_0
adds x3, x3, x0
adcs x4, x4, x1
adcs x5, x5, x2
adcs x6, x6, x30
adc x7, x7, x14
.Lshift1_0:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x3, x4, x3, #1
extr x4, x5, x4, #1
extr x5, x6, x5, #1
extr x6, x7, x6, #1
lsr x7, x7, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_X
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
// with the following differences:
// - "x13" is set directly to the number of trailing 0s in B
// (using rbit and clz instructions)
// - The loop is only used to call SHIFT1(X)
// and x13 is decreased while executing the X loop.
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
.Lbeeu_shift_A_Y:
// Same for A and Y.
// Afterwards, (2) still holds.
// Reverse the bit order of x21
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
rbit x15, x21
clz x13, x15
// If there is no shift, goto |B-A|, X+Y update
cbz x13, .Lbeeu_update_B_X_or_A_Y
// Shift A right by "x13" bits
neg x14, x13
lsr x21, x21, x13
lsl x15, x22, x14
lsr x22, x22, x13
lsl x19, x23, x14
orr x21, x21, x15
lsr x23, x23, x13
lsl x20, x24, x14
orr x22, x22, x19
lsr x24, x24, x13
orr x23, x23, x20
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_Y:
tbz x8, #0, .Lshift1_1
adds x8, x8, x0
adcs x9, x9, x1
adcs x10, x10, x2
adcs x11, x11, x30
adc x12, x12, x14
.Lshift1_1:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x8, x9, x8, #1
extr x9, x10, x9, #1
extr x10, x11, x10, #1
extr x11, x12, x11, #1
lsr x12, x12, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_Y
.Lbeeu_update_B_X_or_A_Y:
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
// Note: this is unsigned arithmetic; T fits in 4 64-bit words without
// needing a sign bit. A cleared carry (i.e. a borrow) would indicate a
// negative result. See, for example,
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
subs x14, x25, x21
sbcs x15, x26, x22
sbcs x19, x27, x23
sbcs x20, x28, x24
bcs .Lbeeu_B_greater_than_A
// Else A > B =>
// A := A - B; Y := Y + X; goto beginning of the loop
subs x21, x21, x25
sbcs x22, x22, x26
sbcs x23, x23, x27
sbcs x24, x24, x28
adds x8, x8, x3
adcs x9, x9, x4
adcs x10, x10, x5
adcs x11, x11, x6
adc x12, x12, x7
b .Lbeeu_loop
.Lbeeu_B_greater_than_A:
// Continue with B > A =>
// B := B - A; X := X + Y; goto beginning of the loop
mov x25, x14
mov x26, x15
mov x27, x19
mov x28, x20
adds x3, x3, x8
adcs x4, x4, x9
adcs x5, x5, x10
adcs x6, x6, x11
adc x7, x7, x12
b .Lbeeu_loop
.Lbeeu_loop_end:
// The Euclidean-algorithm loop ends when A == gcd(a,n);
// this is 1 when a and n are co-prime (i.e. share no common factor).
// Since (-1)*Y*a == A (mod |n|) and Y > 0,
// the output is out = -Y mod n.
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
// Is A-1 == 0?
// If not, fail.
sub x14, x21, #1
orr x14, x14, x22
orr x14, x14, x23
orr x14, x14, x24
cbnz x14, .Lbeeu_err
// If Y>n ==> Y:=Y-n
.Lbeeu_reduction_loop:
// x_i := y_i - n_i (X is no longer needed, use it as temp)
// (x14 = 0 from above)
subs x3, x8, x0
sbcs x4, x9, x1
sbcs x5, x10, x2
sbcs x6, x11, x30
sbcs x7, x12, x14
// If result is non-negative (i.e., cs = carry set = no borrow),
// y_i := x_i; goto reduce again
// else
// y_i := y_i; continue
csel x8, x3, x8, cs
csel x9, x4, x9, cs
csel x10, x5, x10, cs
csel x11, x6, x11, cs
csel x12, x7, x12, cs
bcs .Lbeeu_reduction_loop
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
// out = -Y = n-Y
subs x8, x0, x8
sbcs x9, x1, x9
sbcs x10, x2, x10
sbcs x11, x30, x11
// Save Y in output (out (x0) was saved on the stack)
ldr x3, [sp,#96]
stp x8, x9, [x3]
stp x10, x11, [x3,#16]
// return 1 (success)
mov x0, #1
b .Lbeeu_finish
.Lbeeu_err:
// return 0 (error)
eor x0, x0, x0
.Lbeeu_finish:
// Restore callee-saved registers, except x0, x2
add sp,x29,#0
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldp x29,x30,[sp],#112
AARCH64_VALIDATE_LINK_REGISTER
ret
.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
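As a word-sized illustration of the control flow above (a sketch under stated assumptions, not the 256-bit routine itself): the code implements the binary extended Euclidean algorithm, maintaining X*a == B (mod n) and (-1)*Y*a == A (mod n), and returning -Y mod n once A has been reduced to 1. The sketch assumes n is odd and small enough that the intermediate sums fit in a uint64_t; the real routine instead carries a fifth 64-bit word for X and Y and reduces at the end with the loop shown above.

#include <stdint.h>

// Returns a^-1 mod n, or 0 if a is not invertible. Illustrative only.
static uint64_t beeu_inverse_sketch(uint64_t a, uint64_t n) {
  uint64_t A = n, B = a, X = 1, Y = 0;
  while (B != 0) {
    while ((B & 1) == 0) {                   // divide B by 2 while it is even ...
      B >>= 1;
      X = (X & 1) ? (X + n) >> 1 : X >> 1;   // ... halving X mod n alongside (n odd)
    }
    while ((A & 1) == 0) {                   // same for A and Y
      A >>= 1;
      Y = (Y & 1) ? (Y + n) >> 1 : Y >> 1;
    }
    if (B >= A) { B -= A; X += Y; }          // B := B - A, X := X + Y
    else        { A -= B; Y += X; }          // A := A - B, Y := Y + X
  }
  if (A != 1) return 0;                      // gcd(a, n) != 1: no inverse
  while (Y >= n) Y -= n;                     // reduce Y below n
  return (n - Y) % n;                        // out = -Y mod n
}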


@@ -0,0 +1,37 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.arch armv8-a
.text
// int CRYPTO_rndr_multiple8(uint8_t *out, const size_t len)
.globl CRYPTO_rndr_multiple8
.hidden CRYPTO_rndr_multiple8
.type CRYPTO_rndr_multiple8,%function
.align 4
CRYPTO_rndr_multiple8:
cbz x1, .Lrndr_multiple8_error // len = 0 is not supported
.Lrndr_multiple8_loop:
mrs x2, s3_3_c2_c4_0 // rndr instruction https://developer.arm.com/documentation/ddi0601/2024-09/Index-by-Encoding
cbz x2, .Lrndr_multiple8_error // Check if rndr failed
str x2, [x0], #8 // Copy 8 bytes to *out and increment pointer by 8
sub x1, x1, #8
cbz x1, .Lrndr_multiple8_done // If len is a multiple of 8, this reaches 0 eventually
b .Lrndr_multiple8_loop
.Lrndr_multiple8_done:
mov x0, #1 // Return value success
ret
.Lrndr_multiple8_error:
mov x0, #0 // Return value error
ret
.size CRYPTO_rndr_multiple8,.-CRYPTO_rndr_multiple8
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
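Caller-side usage sketch (the wrapper name is hypothetical; the prototype is the one quoted in the comment above): the length must be a nonzero multiple of 8 bytes, and the return value is 1 on success or 0 if RNDR is unavailable or fails.

#include <stddef.h>
#include <stdint.h>

// Declared here for the sketch; in the tree this would come from an internal header.
int CRYPTO_rndr_multiple8(uint8_t *out, size_t len);

// Hypothetical helper: fill a 32-byte buffer from the RNDR instruction.
static int fill_random_32(uint8_t out[32]) {
  return CRYPTO_rndr_multiple8(out, 32);  // 32 is a nonzero multiple of 8
}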

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,750 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>
.text
// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
// with |argv|, then saves the callee-saved registers into |state|. It returns
// the result of |func|. The |unwind| argument is unused.
// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
// const uint64_t *argv, size_t argc,
// uint64_t unwind);
.type abi_test_trampoline, %function
.globl abi_test_trampoline
.hidden abi_test_trampoline
.align 4
abi_test_trampoline:
.Labi_test_trampoline_begin:
AARCH64_SIGN_LINK_REGISTER
// Stack layout (low to high addresses)
// x29,x30 (16 bytes)
// d8-d15 (64 bytes)
// x19-x28 (80 bytes)
// x1 (8 bytes)
// padding (8 bytes)
stp x29, x30, [sp, #-176]!
mov x29, sp
// Saved callee-saved registers and |state|.
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
stp d14, d15, [sp, #64]
stp x19, x20, [sp, #80]
stp x21, x22, [sp, #96]
stp x23, x24, [sp, #112]
stp x25, x26, [sp, #128]
stp x27, x28, [sp, #144]
str x1, [sp, #160]
// Load registers from |state|, with the exception of x29. x29 is the
// frame pointer and also callee-saved, but AAPCS64 allows platforms to
// mandate that x29 always point to a frame. iOS64 does so, which means
// we cannot fill x29 with entropy without violating ABI rules
// ourselves. x29 is tested separately below.
ldp d8, d9, [x1], #16
ldp d10, d11, [x1], #16
ldp d12, d13, [x1], #16
ldp d14, d15, [x1], #16
ldp x19, x20, [x1], #16
ldp x21, x22, [x1], #16
ldp x23, x24, [x1], #16
ldp x25, x26, [x1], #16
ldp x27, x28, [x1], #16
// Move parameters into temporary registers.
mov x9, x0
mov x10, x2
mov x11, x3
// Load parameters into registers.
cbz x11, .Largs_done
ldr x0, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x1, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x2, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x3, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x4, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x5, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x6, [x10], #8
subs x11, x11, #1
b.eq .Largs_done
ldr x7, [x10], #8
.Largs_done:
blr x9
// Reload |state| and store registers.
ldr x1, [sp, #160]
stp d8, d9, [x1], #16
stp d10, d11, [x1], #16
stp d12, d13, [x1], #16
stp d14, d15, [x1], #16
stp x19, x20, [x1], #16
stp x21, x22, [x1], #16
stp x23, x24, [x1], #16
stp x25, x26, [x1], #16
stp x27, x28, [x1], #16
// |func| is required to preserve x29, the frame pointer. We cannot load
// random values into x29 (see comment above), so compare it against the
// expected value and zero the field of |state| if corrupted.
mov x9, sp
cmp x29, x9
b.eq .Lx29_ok
str xzr, [x1]
.Lx29_ok:
// Restore callee-saved registers.
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
ldp x19, x20, [sp, #80]
ldp x21, x22, [sp, #96]
ldp x23, x24, [sp, #112]
ldp x25, x26, [sp, #128]
ldp x27, x28, [sp, #144]
ldp x29, x30, [sp], #176
AARCH64_VALIDATE_LINK_REGISTER
ret
.size abi_test_trampoline,.-abi_test_trampoline
.type abi_test_clobber_x0, %function
.globl abi_test_clobber_x0
.hidden abi_test_clobber_x0
.align 4
abi_test_clobber_x0:
AARCH64_VALID_CALL_TARGET
mov x0, xzr
ret
.size abi_test_clobber_x0,.-abi_test_clobber_x0
.type abi_test_clobber_x1, %function
.globl abi_test_clobber_x1
.hidden abi_test_clobber_x1
.align 4
abi_test_clobber_x1:
AARCH64_VALID_CALL_TARGET
mov x1, xzr
ret
.size abi_test_clobber_x1,.-abi_test_clobber_x1
.type abi_test_clobber_x2, %function
.globl abi_test_clobber_x2
.hidden abi_test_clobber_x2
.align 4
abi_test_clobber_x2:
AARCH64_VALID_CALL_TARGET
mov x2, xzr
ret
.size abi_test_clobber_x2,.-abi_test_clobber_x2
.type abi_test_clobber_x3, %function
.globl abi_test_clobber_x3
.hidden abi_test_clobber_x3
.align 4
abi_test_clobber_x3:
AARCH64_VALID_CALL_TARGET
mov x3, xzr
ret
.size abi_test_clobber_x3,.-abi_test_clobber_x3
.type abi_test_clobber_x4, %function
.globl abi_test_clobber_x4
.hidden abi_test_clobber_x4
.align 4
abi_test_clobber_x4:
AARCH64_VALID_CALL_TARGET
mov x4, xzr
ret
.size abi_test_clobber_x4,.-abi_test_clobber_x4
.type abi_test_clobber_x5, %function
.globl abi_test_clobber_x5
.hidden abi_test_clobber_x5
.align 4
abi_test_clobber_x5:
AARCH64_VALID_CALL_TARGET
mov x5, xzr
ret
.size abi_test_clobber_x5,.-abi_test_clobber_x5
.type abi_test_clobber_x6, %function
.globl abi_test_clobber_x6
.hidden abi_test_clobber_x6
.align 4
abi_test_clobber_x6:
AARCH64_VALID_CALL_TARGET
mov x6, xzr
ret
.size abi_test_clobber_x6,.-abi_test_clobber_x6
.type abi_test_clobber_x7, %function
.globl abi_test_clobber_x7
.hidden abi_test_clobber_x7
.align 4
abi_test_clobber_x7:
AARCH64_VALID_CALL_TARGET
mov x7, xzr
ret
.size abi_test_clobber_x7,.-abi_test_clobber_x7
.type abi_test_clobber_x8, %function
.globl abi_test_clobber_x8
.hidden abi_test_clobber_x8
.align 4
abi_test_clobber_x8:
AARCH64_VALID_CALL_TARGET
mov x8, xzr
ret
.size abi_test_clobber_x8,.-abi_test_clobber_x8
.type abi_test_clobber_x9, %function
.globl abi_test_clobber_x9
.hidden abi_test_clobber_x9
.align 4
abi_test_clobber_x9:
AARCH64_VALID_CALL_TARGET
mov x9, xzr
ret
.size abi_test_clobber_x9,.-abi_test_clobber_x9
.type abi_test_clobber_x10, %function
.globl abi_test_clobber_x10
.hidden abi_test_clobber_x10
.align 4
abi_test_clobber_x10:
AARCH64_VALID_CALL_TARGET
mov x10, xzr
ret
.size abi_test_clobber_x10,.-abi_test_clobber_x10
.type abi_test_clobber_x11, %function
.globl abi_test_clobber_x11
.hidden abi_test_clobber_x11
.align 4
abi_test_clobber_x11:
AARCH64_VALID_CALL_TARGET
mov x11, xzr
ret
.size abi_test_clobber_x11,.-abi_test_clobber_x11
.type abi_test_clobber_x12, %function
.globl abi_test_clobber_x12
.hidden abi_test_clobber_x12
.align 4
abi_test_clobber_x12:
AARCH64_VALID_CALL_TARGET
mov x12, xzr
ret
.size abi_test_clobber_x12,.-abi_test_clobber_x12
.type abi_test_clobber_x13, %function
.globl abi_test_clobber_x13
.hidden abi_test_clobber_x13
.align 4
abi_test_clobber_x13:
AARCH64_VALID_CALL_TARGET
mov x13, xzr
ret
.size abi_test_clobber_x13,.-abi_test_clobber_x13
.type abi_test_clobber_x14, %function
.globl abi_test_clobber_x14
.hidden abi_test_clobber_x14
.align 4
abi_test_clobber_x14:
AARCH64_VALID_CALL_TARGET
mov x14, xzr
ret
.size abi_test_clobber_x14,.-abi_test_clobber_x14
.type abi_test_clobber_x15, %function
.globl abi_test_clobber_x15
.hidden abi_test_clobber_x15
.align 4
abi_test_clobber_x15:
AARCH64_VALID_CALL_TARGET
mov x15, xzr
ret
.size abi_test_clobber_x15,.-abi_test_clobber_x15
.type abi_test_clobber_x16, %function
.globl abi_test_clobber_x16
.hidden abi_test_clobber_x16
.align 4
abi_test_clobber_x16:
AARCH64_VALID_CALL_TARGET
mov x16, xzr
ret
.size abi_test_clobber_x16,.-abi_test_clobber_x16
.type abi_test_clobber_x17, %function
.globl abi_test_clobber_x17
.hidden abi_test_clobber_x17
.align 4
abi_test_clobber_x17:
AARCH64_VALID_CALL_TARGET
mov x17, xzr
ret
.size abi_test_clobber_x17,.-abi_test_clobber_x17
.type abi_test_clobber_x19, %function
.globl abi_test_clobber_x19
.hidden abi_test_clobber_x19
.align 4
abi_test_clobber_x19:
AARCH64_VALID_CALL_TARGET
mov x19, xzr
ret
.size abi_test_clobber_x19,.-abi_test_clobber_x19
.type abi_test_clobber_x20, %function
.globl abi_test_clobber_x20
.hidden abi_test_clobber_x20
.align 4
abi_test_clobber_x20:
AARCH64_VALID_CALL_TARGET
mov x20, xzr
ret
.size abi_test_clobber_x20,.-abi_test_clobber_x20
.type abi_test_clobber_x21, %function
.globl abi_test_clobber_x21
.hidden abi_test_clobber_x21
.align 4
abi_test_clobber_x21:
AARCH64_VALID_CALL_TARGET
mov x21, xzr
ret
.size abi_test_clobber_x21,.-abi_test_clobber_x21
.type abi_test_clobber_x22, %function
.globl abi_test_clobber_x22
.hidden abi_test_clobber_x22
.align 4
abi_test_clobber_x22:
AARCH64_VALID_CALL_TARGET
mov x22, xzr
ret
.size abi_test_clobber_x22,.-abi_test_clobber_x22
.type abi_test_clobber_x23, %function
.globl abi_test_clobber_x23
.hidden abi_test_clobber_x23
.align 4
abi_test_clobber_x23:
AARCH64_VALID_CALL_TARGET
mov x23, xzr
ret
.size abi_test_clobber_x23,.-abi_test_clobber_x23
.type abi_test_clobber_x24, %function
.globl abi_test_clobber_x24
.hidden abi_test_clobber_x24
.align 4
abi_test_clobber_x24:
AARCH64_VALID_CALL_TARGET
mov x24, xzr
ret
.size abi_test_clobber_x24,.-abi_test_clobber_x24
.type abi_test_clobber_x25, %function
.globl abi_test_clobber_x25
.hidden abi_test_clobber_x25
.align 4
abi_test_clobber_x25:
AARCH64_VALID_CALL_TARGET
mov x25, xzr
ret
.size abi_test_clobber_x25,.-abi_test_clobber_x25
.type abi_test_clobber_x26, %function
.globl abi_test_clobber_x26
.hidden abi_test_clobber_x26
.align 4
abi_test_clobber_x26:
AARCH64_VALID_CALL_TARGET
mov x26, xzr
ret
.size abi_test_clobber_x26,.-abi_test_clobber_x26
.type abi_test_clobber_x27, %function
.globl abi_test_clobber_x27
.hidden abi_test_clobber_x27
.align 4
abi_test_clobber_x27:
AARCH64_VALID_CALL_TARGET
mov x27, xzr
ret
.size abi_test_clobber_x27,.-abi_test_clobber_x27
.type abi_test_clobber_x28, %function
.globl abi_test_clobber_x28
.hidden abi_test_clobber_x28
.align 4
abi_test_clobber_x28:
AARCH64_VALID_CALL_TARGET
mov x28, xzr
ret
.size abi_test_clobber_x28,.-abi_test_clobber_x28
.type abi_test_clobber_x29, %function
.globl abi_test_clobber_x29
.hidden abi_test_clobber_x29
.align 4
abi_test_clobber_x29:
AARCH64_VALID_CALL_TARGET
mov x29, xzr
ret
.size abi_test_clobber_x29,.-abi_test_clobber_x29
.type abi_test_clobber_d0, %function
.globl abi_test_clobber_d0
.hidden abi_test_clobber_d0
.align 4
abi_test_clobber_d0:
AARCH64_VALID_CALL_TARGET
fmov d0, xzr
ret
.size abi_test_clobber_d0,.-abi_test_clobber_d0
.type abi_test_clobber_d1, %function
.globl abi_test_clobber_d1
.hidden abi_test_clobber_d1
.align 4
abi_test_clobber_d1:
AARCH64_VALID_CALL_TARGET
fmov d1, xzr
ret
.size abi_test_clobber_d1,.-abi_test_clobber_d1
.type abi_test_clobber_d2, %function
.globl abi_test_clobber_d2
.hidden abi_test_clobber_d2
.align 4
abi_test_clobber_d2:
AARCH64_VALID_CALL_TARGET
fmov d2, xzr
ret
.size abi_test_clobber_d2,.-abi_test_clobber_d2
.type abi_test_clobber_d3, %function
.globl abi_test_clobber_d3
.hidden abi_test_clobber_d3
.align 4
abi_test_clobber_d3:
AARCH64_VALID_CALL_TARGET
fmov d3, xzr
ret
.size abi_test_clobber_d3,.-abi_test_clobber_d3
.type abi_test_clobber_d4, %function
.globl abi_test_clobber_d4
.hidden abi_test_clobber_d4
.align 4
abi_test_clobber_d4:
AARCH64_VALID_CALL_TARGET
fmov d4, xzr
ret
.size abi_test_clobber_d4,.-abi_test_clobber_d4
.type abi_test_clobber_d5, %function
.globl abi_test_clobber_d5
.hidden abi_test_clobber_d5
.align 4
abi_test_clobber_d5:
AARCH64_VALID_CALL_TARGET
fmov d5, xzr
ret
.size abi_test_clobber_d5,.-abi_test_clobber_d5
.type abi_test_clobber_d6, %function
.globl abi_test_clobber_d6
.hidden abi_test_clobber_d6
.align 4
abi_test_clobber_d6:
AARCH64_VALID_CALL_TARGET
fmov d6, xzr
ret
.size abi_test_clobber_d6,.-abi_test_clobber_d6
.type abi_test_clobber_d7, %function
.globl abi_test_clobber_d7
.hidden abi_test_clobber_d7
.align 4
abi_test_clobber_d7:
AARCH64_VALID_CALL_TARGET
fmov d7, xzr
ret
.size abi_test_clobber_d7,.-abi_test_clobber_d7
.type abi_test_clobber_d8, %function
.globl abi_test_clobber_d8
.hidden abi_test_clobber_d8
.align 4
abi_test_clobber_d8:
AARCH64_VALID_CALL_TARGET
fmov d8, xzr
ret
.size abi_test_clobber_d8,.-abi_test_clobber_d8
.type abi_test_clobber_d9, %function
.globl abi_test_clobber_d9
.hidden abi_test_clobber_d9
.align 4
abi_test_clobber_d9:
AARCH64_VALID_CALL_TARGET
fmov d9, xzr
ret
.size abi_test_clobber_d9,.-abi_test_clobber_d9
.type abi_test_clobber_d10, %function
.globl abi_test_clobber_d10
.hidden abi_test_clobber_d10
.align 4
abi_test_clobber_d10:
AARCH64_VALID_CALL_TARGET
fmov d10, xzr
ret
.size abi_test_clobber_d10,.-abi_test_clobber_d10
.type abi_test_clobber_d11, %function
.globl abi_test_clobber_d11
.hidden abi_test_clobber_d11
.align 4
abi_test_clobber_d11:
AARCH64_VALID_CALL_TARGET
fmov d11, xzr
ret
.size abi_test_clobber_d11,.-abi_test_clobber_d11
.type abi_test_clobber_d12, %function
.globl abi_test_clobber_d12
.hidden abi_test_clobber_d12
.align 4
abi_test_clobber_d12:
AARCH64_VALID_CALL_TARGET
fmov d12, xzr
ret
.size abi_test_clobber_d12,.-abi_test_clobber_d12
.type abi_test_clobber_d13, %function
.globl abi_test_clobber_d13
.hidden abi_test_clobber_d13
.align 4
abi_test_clobber_d13:
AARCH64_VALID_CALL_TARGET
fmov d13, xzr
ret
.size abi_test_clobber_d13,.-abi_test_clobber_d13
.type abi_test_clobber_d14, %function
.globl abi_test_clobber_d14
.hidden abi_test_clobber_d14
.align 4
abi_test_clobber_d14:
AARCH64_VALID_CALL_TARGET
fmov d14, xzr
ret
.size abi_test_clobber_d14,.-abi_test_clobber_d14
.type abi_test_clobber_d15, %function
.globl abi_test_clobber_d15
.hidden abi_test_clobber_d15
.align 4
abi_test_clobber_d15:
AARCH64_VALID_CALL_TARGET
fmov d15, xzr
ret
.size abi_test_clobber_d15,.-abi_test_clobber_d15
.type abi_test_clobber_d16, %function
.globl abi_test_clobber_d16
.hidden abi_test_clobber_d16
.align 4
abi_test_clobber_d16:
AARCH64_VALID_CALL_TARGET
fmov d16, xzr
ret
.size abi_test_clobber_d16,.-abi_test_clobber_d16
.type abi_test_clobber_d17, %function
.globl abi_test_clobber_d17
.hidden abi_test_clobber_d17
.align 4
abi_test_clobber_d17:
AARCH64_VALID_CALL_TARGET
fmov d17, xzr
ret
.size abi_test_clobber_d17,.-abi_test_clobber_d17
.type abi_test_clobber_d18, %function
.globl abi_test_clobber_d18
.hidden abi_test_clobber_d18
.align 4
abi_test_clobber_d18:
AARCH64_VALID_CALL_TARGET
fmov d18, xzr
ret
.size abi_test_clobber_d18,.-abi_test_clobber_d18
.type abi_test_clobber_d19, %function
.globl abi_test_clobber_d19
.hidden abi_test_clobber_d19
.align 4
abi_test_clobber_d19:
AARCH64_VALID_CALL_TARGET
fmov d19, xzr
ret
.size abi_test_clobber_d19,.-abi_test_clobber_d19
.type abi_test_clobber_d20, %function
.globl abi_test_clobber_d20
.hidden abi_test_clobber_d20
.align 4
abi_test_clobber_d20:
AARCH64_VALID_CALL_TARGET
fmov d20, xzr
ret
.size abi_test_clobber_d20,.-abi_test_clobber_d20
.type abi_test_clobber_d21, %function
.globl abi_test_clobber_d21
.hidden abi_test_clobber_d21
.align 4
abi_test_clobber_d21:
AARCH64_VALID_CALL_TARGET
fmov d21, xzr
ret
.size abi_test_clobber_d21,.-abi_test_clobber_d21
.type abi_test_clobber_d22, %function
.globl abi_test_clobber_d22
.hidden abi_test_clobber_d22
.align 4
abi_test_clobber_d22:
AARCH64_VALID_CALL_TARGET
fmov d22, xzr
ret
.size abi_test_clobber_d22,.-abi_test_clobber_d22
.type abi_test_clobber_d23, %function
.globl abi_test_clobber_d23
.hidden abi_test_clobber_d23
.align 4
abi_test_clobber_d23:
AARCH64_VALID_CALL_TARGET
fmov d23, xzr
ret
.size abi_test_clobber_d23,.-abi_test_clobber_d23
.type abi_test_clobber_d24, %function
.globl abi_test_clobber_d24
.hidden abi_test_clobber_d24
.align 4
abi_test_clobber_d24:
AARCH64_VALID_CALL_TARGET
fmov d24, xzr
ret
.size abi_test_clobber_d24,.-abi_test_clobber_d24
.type abi_test_clobber_d25, %function
.globl abi_test_clobber_d25
.hidden abi_test_clobber_d25
.align 4
abi_test_clobber_d25:
AARCH64_VALID_CALL_TARGET
fmov d25, xzr
ret
.size abi_test_clobber_d25,.-abi_test_clobber_d25
.type abi_test_clobber_d26, %function
.globl abi_test_clobber_d26
.hidden abi_test_clobber_d26
.align 4
abi_test_clobber_d26:
AARCH64_VALID_CALL_TARGET
fmov d26, xzr
ret
.size abi_test_clobber_d26,.-abi_test_clobber_d26
.type abi_test_clobber_d27, %function
.globl abi_test_clobber_d27
.hidden abi_test_clobber_d27
.align 4
abi_test_clobber_d27:
AARCH64_VALID_CALL_TARGET
fmov d27, xzr
ret
.size abi_test_clobber_d27,.-abi_test_clobber_d27
.type abi_test_clobber_d28, %function
.globl abi_test_clobber_d28
.hidden abi_test_clobber_d28
.align 4
abi_test_clobber_d28:
AARCH64_VALID_CALL_TARGET
fmov d28, xzr
ret
.size abi_test_clobber_d28,.-abi_test_clobber_d28
.type abi_test_clobber_d29, %function
.globl abi_test_clobber_d29
.hidden abi_test_clobber_d29
.align 4
abi_test_clobber_d29:
AARCH64_VALID_CALL_TARGET
fmov d29, xzr
ret
.size abi_test_clobber_d29,.-abi_test_clobber_d29
.type abi_test_clobber_d30, %function
.globl abi_test_clobber_d30
.hidden abi_test_clobber_d30
.align 4
abi_test_clobber_d30:
AARCH64_VALID_CALL_TARGET
fmov d30, xzr
ret
.size abi_test_clobber_d30,.-abi_test_clobber_d30
.type abi_test_clobber_d31, %function
.globl abi_test_clobber_d31
.hidden abi_test_clobber_d31
.align 4
abi_test_clobber_d31:
AARCH64_VALID_CALL_TARGET
fmov d31, xzr
ret
.size abi_test_clobber_d31,.-abi_test_clobber_d31
.type abi_test_clobber_v8_upper, %function
.globl abi_test_clobber_v8_upper
.hidden abi_test_clobber_v8_upper
.align 4
abi_test_clobber_v8_upper:
AARCH64_VALID_CALL_TARGET
fmov v8.d[1], xzr
ret
.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper
.type abi_test_clobber_v9_upper, %function
.globl abi_test_clobber_v9_upper
.hidden abi_test_clobber_v9_upper
.align 4
abi_test_clobber_v9_upper:
AARCH64_VALID_CALL_TARGET
fmov v9.d[1], xzr
ret
.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper
.type abi_test_clobber_v10_upper, %function
.globl abi_test_clobber_v10_upper
.hidden abi_test_clobber_v10_upper
.align 4
abi_test_clobber_v10_upper:
AARCH64_VALID_CALL_TARGET
fmov v10.d[1], xzr
ret
.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper
.type abi_test_clobber_v11_upper, %function
.globl abi_test_clobber_v11_upper
.hidden abi_test_clobber_v11_upper
.align 4
abi_test_clobber_v11_upper:
AARCH64_VALID_CALL_TARGET
fmov v11.d[1], xzr
ret
.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper
.type abi_test_clobber_v12_upper, %function
.globl abi_test_clobber_v12_upper
.hidden abi_test_clobber_v12_upper
.align 4
abi_test_clobber_v12_upper:
AARCH64_VALID_CALL_TARGET
fmov v12.d[1], xzr
ret
.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper
.type abi_test_clobber_v13_upper, %function
.globl abi_test_clobber_v13_upper
.hidden abi_test_clobber_v13_upper
.align 4
abi_test_clobber_v13_upper:
AARCH64_VALID_CALL_TARGET
fmov v13.d[1], xzr
ret
.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper
.type abi_test_clobber_v14_upper, %function
.globl abi_test_clobber_v14_upper
.hidden abi_test_clobber_v14_upper
.align 4
abi_test_clobber_v14_upper:
AARCH64_VALID_CALL_TARGET
fmov v14.d[1], xzr
ret
.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper
.type abi_test_clobber_v15_upper, %function
.globl abi_test_clobber_v15_upper
.hidden abi_test_clobber_v15_upper
.align 4
abi_test_clobber_v15_upper:
AARCH64_VALID_CALL_TARGET
fmov v15.d[1], xzr
ret
.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
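Sketch of how the trampoline and a clobber function fit together (illustrative only: the structure name and member order below are assumptions inferred from the save/restore order in abi_test_trampoline, d8-d15 followed by x19-x28; the real test harness defines its own CallerState type).

#include <stddef.h>
#include <stdint.h>

// Assumed layout mirroring the order the trampoline writes registers back into |state|.
typedef struct {
  uint64_t d[8];   // d8..d15, stored as 64-bit bit patterns
  uint64_t x[10];  // x19..x28
} caller_state_sketch_t;

// Prototypes corresponding to the functions defined above; the function-pointer
// parameter is typed loosely for this sketch.
uint64_t abi_test_trampoline(void (*func)(void), caller_state_sketch_t *state,
                             const uint64_t *argv, size_t argc, uint64_t unwind);
void abi_test_clobber_x19(void);

// Seed x19 with a sentinel, run a function that deliberately zeroes x19, and
// observe the clobbered value written back into |state| (so this returns 0).
static int x19_survived(void) {
  caller_state_sketch_t state = {0};
  state.x[0] = 0x1122334455667788ULL;  // x19 slot, arbitrary sentinel
  abi_test_trampoline(abi_test_clobber_x19, &state, NULL, 0, 0);
  return state.x[0] == 0x1122334455667788ULL;
}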