8364 lines
280 KiB
ArmAsm
8364 lines
280 KiB
ArmAsm
// This file is generated from a similarly-named Perl script in the BoringSSL
|
|
// source tree. Do not edit by hand.
|
|
|
|
#include <openssl/asm_base.h>
|
|
|
|
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
|
|
#include "openssl/arm_arch.h"
|
|
|
|
#if __ARM_MAX_ARCH__>=8
|
|
.text
|
|
.arch armv8.2-a+crypto
|
|
.globl aesv8_gcm_8x_enc_128
|
|
.hidden aesv8_gcm_8x_enc_128
|
|
.type aesv8_gcm_8x_enc_128,%function
|
|
.align 4
|
|
aesv8_gcm_8x_enc_128:
|
|
#ifdef BORINGSSL_DISPATCH_TEST
|
|
|
|
adrp x9,BORINGSSL_function_hit
|
|
add x9, x9, :lo12:BORINGSSL_function_hit
|
|
mov w10, #1
|
|
strb w10, [x9,#7] // kFlag_aesv8_gcm_8x_enc_128
|
|
#endif
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L128_enc_ret
|
|
stp d8, d9, [sp, #-80]!
|
|
lsr x9, x1, #3
|
|
mov x16, x4
|
|
mov x11, x5
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64]
|
|
add x10, sp, #64
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15
|
|
mov x5, x9
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
|
|
ld1 { v19.16b}, [x3]
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
ldr q27, [x11, #160] //load rk10
|
|
|
|
aese v3.16b, v26.16b //AES block 8k+11 - round 9
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
aese v2.16b, v26.16b //AES block 8k+10 - round 9
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v6.16b, v26.16b //AES block 8k+14 - round 9
|
|
|
|
aese v4.16b, v26.16b //AES block 8k+12 - round 9
|
|
add x5, x5, x0
|
|
aese v0.16b, v26.16b //AES block 8k+8 - round 9
|
|
|
|
aese v7.16b, v26.16b //AES block 8k+15 - round 9
|
|
aese v5.16b, v26.16b //AES block 8k+13 - round 9
|
|
aese v1.16b, v26.16b //AES block 8k+9 - round 9
|
|
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
b.ge .L128_enc_tail //handle tail
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
|
|
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
|
|
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
|
|
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
|
|
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
|
|
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
|
|
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
|
|
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
|
|
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result
|
|
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
|
|
|
|
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
b.ge .L128_enc_prepretail //do prepretail
|
|
|
|
.L128_enc_main_loop: //main loop start
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ldr q27, [x11, #160] //load rk10
|
|
|
|
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
|
|
aese v2.16b, v26.16b //AES block 8k+10 - round 9
|
|
aese v4.16b, v26.16b //AES block 8k+12 - round 9
|
|
aese v1.16b, v26.16b //AES block 8k+9 - round 9
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
|
|
aese v7.16b, v26.16b //AES block 8k+15 - round 9
|
|
|
|
aese v6.16b, v26.16b //AES block 8k+14 - round 9
|
|
aese v3.16b, v26.16b //AES block 8k+11 - round 9
|
|
|
|
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
|
|
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
aese v0.16b, v26.16b //AES block 8k+8 - round 9
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
|
|
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result
|
|
aese v5.16b, v26.16b //AES block 8k+13 - round 9
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
|
|
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
|
|
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
|
|
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
|
|
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
|
|
|
|
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
|
|
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
|
|
b.lt .L128_enc_main_loop
|
|
|
|
.L128_enc_prepretail: //PREPRETAIL
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
|
|
ldr q27, [x11, #160] //load rk10
|
|
aese v6.16b, v26.16b //AES block 8k+14 - round 9
|
|
aese v2.16b, v26.16b //AES block 8k+10 - round 9
|
|
|
|
aese v0.16b, v26.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v26.16b //AES block 8k+9 - round 9
|
|
|
|
aese v3.16b, v26.16b //AES block 8k+11 - round 9
|
|
aese v5.16b, v26.16b //AES block 8k+13 - round 9
|
|
|
|
aese v4.16b, v26.16b //AES block 8k+12 - round 9
|
|
aese v7.16b, v26.16b //AES block 8k+15 - round 9
|
|
// Tail of aesv8_gcm_8x_enc_128: encrypts and hashes the blocks that the 8-way
// main loop leaves behind (the loop bound is chosen so that at least one byte
// always remains for the tail).
//
// The first remaining block is encrypted here with the keystream in v0, then
// the code dispatches on the number of bytes left (x5) to one of the
// .L128_enc_blocks_more_than_N entry points. Each entry point expects the
// keystream for the block it encrypts in a fixed register (v1 for
// more_than_7, v2 for more_than_6, ... v7 for more_than_1). Whenever a
// comparison falls through, the keystream registers are shifted up by one so
// that the keystream of the second remaining block (always still in v1) ends
// up in the register the eventual entry point uses, and the counter in v30 is
// stepped back by one increment (v31) because one of the eight
// pre-incremented counter blocks will not be consumed.
.L128_enc_tail: //TAIL
|
|
|
|
// NOTE(review): x4 is presumably the end-of-input pointer set up earlier in
// this function (aesv8_gcm_8x_dec_128 computes it as x0 + (x1 >> 3)) - confirm.
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
|
|
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
|
|
|
|
// Keep the final round key in v29: v27 (rk10 on the fall-through path) is
// reused as GHASH scratch by the per-block code below.
mov v29.16b, v27.16b
|
|
ldp q20, q21, [x6, #96] //load h5l | h5h (q20) and h6k | h5k (q21)
|
|
|
|
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
|
|
ldp q22, q23, [x6, #128] //load h6l | h6h (q22) and h7l | h7h (q23)
|
|
|
|
ldp q24, q25, [x6, #160] //load h8k | h7k (q24) and h8l | h8h (q25)
|
|
cmp x5, #112 //more than 7 blocks (112 bytes) left?
|
|
b.gt .L128_enc_blocks_more_than_7
|
|
|
|
// At most 7 blocks left: shift keystream v1..v6 into v2..v7.
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
// The GHASH accumulators (v17 high, v19 low, v18 mid) are only initialised by
// the more_than_7 code, so they are cleared here for every shorter tail.
movi v17.8b, #0
|
|
|
|
cmp x5, #96 //more than 6 blocks left?
|
|
sub v30.4s, v30.4s, v31.4s //one counter block unused - step the counter back
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v2.16b
|
|
mov v2.16b, v1.16b
|
|
|
|
movi v19.8b, #0
|
|
movi v18.8b, #0
|
|
b.gt .L128_enc_blocks_more_than_6
|
|
|
|
// At most 6 blocks left: shift again, next keystream (v1) goes to v3.
mov v7.16b, v6.16b
|
|
cmp x5, #80 //more than 5 blocks left?
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block unused
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v1.16b
|
|
b.gt .L128_enc_blocks_more_than_5
|
|
|
|
// At most 5 blocks left: next keystream (v1) goes to v4.
cmp x5, #64 //more than 4 blocks left?
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block unused
|
|
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v1.16b
|
|
b.gt .L128_enc_blocks_more_than_4
|
|
|
|
// At most 4 blocks left: next keystream (v1) goes to v5.
mov v7.16b, v6.16b
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block unused
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v1.16b
|
|
cmp x5, #48 //more than 3 blocks left?
|
|
b.gt .L128_enc_blocks_more_than_3
|
|
|
|
// At most 3 blocks left: next keystream (v1) goes to v6.
sub v30.4s, v30.4s, v31.4s //one more counter block unused
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v1.16b
|
|
|
|
cmp x5, #32 //more than 2 blocks left?
|
|
// more_than_2 multiplies by v24 without loading it (only more_than_3 does),
// so it has to be loaded here before branching.
ldr q24, [x6, #64] //load h4k | h3k
|
|
b.gt .L128_enc_blocks_more_than_2
|
|
|
|
// At most 2 blocks left: next keystream (v1) goes to v7.
cmp x5, #16 //more than 1 block left?
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block unused
|
|
mov v7.16b, v1.16b
|
|
b.gt .L128_enc_blocks_more_than_1
|
|
|
|
// Exactly one (possibly partial) block left. The final-block code multiplies
// by v21 without loading it, so load it here.
ldr q21, [x6, #16] //load h2k | h1k
|
|
sub v30.4s, v30.4s, v31.4s //seventh unused counter block
|
|
b .L128_enc_blocks_less_than_1
|
|
// Eight blocks remain in the tail. Stores the ciphertext block in v9, feeds it
// into GHASH, and encrypts the next plaintext block with the keystream in v1.
// This is the first GHASH step of the tail, so the accumulators v17 (high),
// v18 (mid) and v19 (low) are written directly rather than XOR-accumulated;
// the running tag enters through v16. Falls through to more_than_6.
.L128_enc_blocks_more_than_7: //blocks left > 7
|
|
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
ldr q9, [x0], #16 //AES final-6 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
|
|
// Bring the upper half of v24 (h8k) down to lane 0 so the plain pmull below
// can use it as the Karatsuba middle multiplier.
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
// At least seven blocks remain. Stores the ciphertext block in v9, multiplies
// it by the hash-key power in v23 (Karatsuba middle term uses the low half of
// v24) and XORs the three partial products into v17/v18/v19. Meanwhile the
// next plaintext block is loaded and encrypted with the keystream in v2.
// v16 carries the running tag on direct entry and is zeroed after use so it is
// folded in exactly once. Falls through to more_than_5.
.L128_enc_blocks_more_than_6: //blocks left > 6
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
ldr q9, [x0], #16 //AES final-5 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
|
|
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
// At least six blocks remain. Stores the ciphertext block in v9, multiplies it
// by the hash-key power in v22 and accumulates into v17/v18/v19. The Karatsuba
// middle multiplier sits in the upper half of v21, so the folded operand is
// duplicated into v27.d[1] and pmull2 is used. The next plaintext block is
// encrypted with the keystream in v3. Falls through to more_than_4.
.L128_enc_blocks_more_than_5: //blocks left > 5
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
ldr q9, [x0], #16 //AES final-4 block - load plaintext
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
|
|
|
|
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
// At least five blocks remain. Stores the ciphertext block in v9, multiplies
// it by the hash-key power in v20 (middle term uses the low half of v21) and
// accumulates into v17/v18/v19. The next plaintext block is encrypted with the
// keystream in v4. Falls through to more_than_3.
.L128_enc_blocks_more_than_4: //blocks left > 4
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
ldr q9, [x0], #16 //AES final-3 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
|
|
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
// At least four blocks remain. Stores the ciphertext block in v9, loads the
// hash-key power for this position into v25 and its Karatsuba multiplier into
// v24 (upper half used, hence the lane duplication and pmull2), and
// accumulates the products into v17/v18/v19. The next plaintext block is
// encrypted with the keystream in v5. Falls through to more_than_2.
.L128_enc_blocks_more_than_3: //blocks left > 3
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
|
|
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
|
|
ldr q9, [x0], #16 //AES final-2 block - load plaintext
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
|
|
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
// At least three blocks remain. Stores the ciphertext block in v9, loads the
// hash-key power for this position into v23 and accumulates the products into
// v17/v18/v19. The Karatsuba middle multiplier is the low half of v24, which
// this section does not load itself: it relies on the preceding section or on
// the tail dispatch having loaded it. The next plaintext block is encrypted
// with the keystream in v6. Falls through to more_than_1.
.L128_enc_blocks_more_than_2: //blocks left > 2
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ldr q9, [x0], #16 //AES final-1 block - load plaintext
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
|
|
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
// At least two blocks remain. Stores the ciphertext block in v9, loads the
// hash-key power for this position into v22 and the Karatsuba multipliers into
// v21 (upper half used here via pmull2; the low half is used by the final
// block), and accumulates the products into v17/v18/v19. The last plaintext
// block is loaded as a full 16 bytes and encrypted with the keystream in v7;
// masking of a partial block happens in the section this falls through to.
.L128_enc_blocks_more_than_1: //blocks left > 1
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
|
|
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
ldr q9, [x0], #16 //AES final block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
// Final (possibly partial) block, tag finalisation and function epilogue.
//
// On entry v9 holds the encrypted last block, v16 the pending partial tag (or
// zero), v17/v18/v19 the GHASH high/mid/low accumulators, v21 the h2k | h1k
// multipliers and x1 the input length in bits. The code:
//   1. writes the byte-swapped counter back through x16;
//   2. builds a byte mask covering only the valid bytes of the last block;
//   3. zeroes the invalid bytes for GHASH, and for the store merges them with
//      the bytes already present in the output buffer (note that a full
//      16 bytes at x2 are both read and written even for a partial block);
//   4. folds the last block into GHASH, reduces modulo the GCM polynomial and
//      stores the tag through x3;
//   5. returns the byte length saved in x9 and restores d8-d15.
.L128_enc_blocks_less_than_1: //blocks left <= 1
|
|
|
|
rev32 v30.16b, v30.16b
|
|
str q30, [x16] //store the updated counter
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
// After this x1 is the number of bits to discard, in [0,127]
// (0 when the last block is a full 16 bytes).
and x1, x1, #127 //bit_length %= 128
|
|
|
|
// lsr by register uses the shift amount modulo 64, so x7 is the right mask
// for whichever 64-bit half contains the message boundary.
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
cmp x1, #64
|
|
|
|
// Fewer than 64 bits discarded: low half fully valid, high half partial.
// Otherwise: low half partial, high half entirely discarded.
csel x13, x8, x7, lt
|
|
csel x14, x7, xzr, lt
|
|
|
|
mov v0.d[1], x14
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block
|
|
|
|
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
|
|
st1 { v9.16b}, [x2] //store all 16B
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
|
|
ldr q20, [x6] //load h1l | h1h
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
// x10 points at the 0xc200000000000000 constant the prologue stored on the stack.
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
// Swap the 64-bit halves and byte-reverse each one (the inverse of the
// transformation applied when the tag is loaded), then store the tag.
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
st1 { v19.16b }, [x3]
|
|
mov x0, x9 //return value: byte length computed at entry (x1 >> 3)
|
|
|
|
// Restore callee-saved d8-d15 and release the 80-byte frame.
ldp d10, d11, [sp, #16]
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80
|
|
ret
|
|
|
|
// Early exit of aesv8_gcm_8x_enc_128 for a zero bit length: the entry code
// branches here (cbz x1) before the stack frame is allocated or any
// callee-saved register is touched, so nothing needs restoring.
.L128_enc_ret:
|
|
mov w0, #0x0 //return 0 (a write to w0 also clears the upper half of x0)
|
|
ret
|
|
.size aesv8_gcm_8x_enc_128,.-aesv8_gcm_8x_enc_128
|
|
.globl aesv8_gcm_8x_dec_128
|
|
.hidden aesv8_gcm_8x_dec_128
|
|
.type aesv8_gcm_8x_dec_128,%function
|
|
.align 4
|
|
aesv8_gcm_8x_dec_128:
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L128_dec_ret
|
|
stp d8, d9, [sp, #-80]!
|
|
lsr x9, x1, #3
|
|
mov x16, x4
|
|
mov x11, x5
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64]
|
|
add x10, sp, #64
|
|
|
|
mov x5, x9
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15
|
|
ld1 { v19.16b}, [x3]
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
|
|
add x5, x5, x0
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 8
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 8
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 8
|
|
|
|
aese v0.16b, v26.16b //AES block 0 - round 9
|
|
aese v1.16b, v26.16b //AES block 1 - round 9
|
|
aese v6.16b, v26.16b //AES block 6 - round 9
|
|
|
|
ldr q27, [x11, #160] //load rk10
|
|
aese v4.16b, v26.16b //AES block 4 - round 9
|
|
aese v3.16b, v26.16b //AES block 3 - round 9
|
|
|
|
aese v2.16b, v26.16b //AES block 2 - round 9
|
|
aese v5.16b, v26.16b //AES block 5 - round 9
|
|
aese v7.16b, v26.16b //AES block 7 - round 9
|
|
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
b.ge .L128_dec_tail //handle tail
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
|
|
|
|
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
|
|
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
|
|
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
|
|
|
|
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
|
|
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
|
|
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
|
|
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
|
|
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
|
|
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
|
|
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
|
|
|
|
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result
|
|
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
b.ge .L128_dec_prepretail //do prepretail
|
|
|
|
.L128_dec_main_loop: //main loop start
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
|
|
aese v0.16b, v26.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v26.16b //AES block 8k+9 - round 9
|
|
ldr q27, [x11, #160] //load rk10
|
|
|
|
aese v6.16b, v26.16b //AES block 8k+14 - round 9
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v2.16b, v26.16b //AES block 8k+10 - round 9
|
|
|
|
aese v7.16b, v26.16b //AES block 8k+15 - round 9
|
|
aese v4.16b, v26.16b //AES block 8k+12 - round 9
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
|
|
aese v3.16b, v26.16b //AES block 8k+11 - round 9
|
|
aese v5.16b, v26.16b //AES block 8k+13 - round 9
|
|
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
|
|
|
|
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
|
|
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
|
|
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
|
|
|
|
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
|
|
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
|
|
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
|
|
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
|
|
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
|
|
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
|
|
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
b.lt .L128_dec_main_loop
|
|
|
|
.L128_dec_prepretail: //PREPRETAIL
// Reached by fall-through from the main loop once x0 >= x5 (other entry
// points, if any, are outside this view).
// GHASHes the eight ciphertext blocks already held in v8-v15, folding in the
// running accumulator v19, and performs the polynomial reduction so that v19
// again holds the reduced accumulator. Interleaved with that, AES rounds 0-9
// are run on the counter blocks v0-v7 (v5-v7 are derived from v30 here).
// No input is loaded from [x0] and nothing is stored to [x2] in this section;
// the last round key (rk10) is left in v27 for the TAIL to XOR in per block.
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 - swap the 64-bit halves of the GHASH accumulator
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1 - fold the accumulator into block 8k
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
ldr q27, [x11, #160] //load rk10 (last round key; the TAIL copies it to v29 and XORs it in per block)
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
|
|
aese v6.16b, v26.16b //AES block 8k+14 - round 9
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
aese v2.16b, v26.16b //AES block 8k+10 - round 9
|
|
|
|
aese v3.16b, v26.16b //AES block 8k+11 - round 9
|
|
aese v5.16b, v26.16b //AES block 8k+13 - round 9
|
|
aese v0.16b, v26.16b //AES block 8k+8 - round 9
|
|
|
|
aese v4.16b, v26.16b //AES block 8k+12 - round 9
|
|
aese v1.16b, v26.16b //AES block 8k+9 - round 9
|
|
aese v7.16b, v26.16b //AES block 8k+15 - round 9
|
|
|
|
.L128_dec_tail: //TAIL
// Dispatch for the remaining input. x5 is set to the number of bytes left
// (x4 - x0), the first remaining ciphertext block is loaded and decrypted
// into v12 using v0, and the cmp/b.gt ladder enters the
// .L128_dec_blocks_more_than_N chain at the point matching the byte count.
// For every block that will NOT be processed, the pending AES states are
// moved one register towards v7 (so the chain finds them in the registers it
// expects) and the counter v30 is stepped back by one increment (v31).
// When the chain is entered below its top, the GHASH accumulators
// v17/v18/v19 are cleared first; the running tag is carried in v16.
|
|
|
|
mov v29.16b, v27.16b //v29 = last round key (v27 is reused as GHASH scratch below)
|
|
sub x5, x4, x0 //x5 = x4 - x0: number of bytes left to process (x4 presumably holds the end-of-input pointer; it is set outside this view)
|
|
|
|
cmp x5, #112
|
|
|
|
ldp q24, q25, [x6, #160] //load h8k | h7k (q24) and h8l | h8h (q25)
|
|
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
|
|
|
|
ldp q20, q21, [x6, #96] //load h5l | h5h (q20) and h6k | h5k (q21)
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
|
|
|
|
ldp q22, q23, [x6, #128] //load h6l | h6h (q22) and h7l | h7h (q23)
|
|
|
|
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
b.gt .L128_dec_blocks_more_than_7
|
|
|
|
cmp x5, #96
|
|
mov v7.16b, v6.16b //shift pending AES states up by one register
|
|
movi v19.8b, #0 //clear GHASH low accumulator (tag is now in v16)
|
|
|
|
movi v17.8b, #0 //clear GHASH high accumulator
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v2.16b
|
|
mov v2.16b, v1.16b
|
|
|
|
movi v18.8b, #0 //clear GHASH mid accumulator
|
|
sub v30.4s, v30.4s, v31.4s //one counter block will go unused
|
|
b.gt .L128_dec_blocks_more_than_6
|
|
|
|
cmp x5, #80
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block will go unused
|
|
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v1.16b
|
|
b.gt .L128_dec_blocks_more_than_5
|
|
|
|
cmp x5, #64
|
|
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block will go unused
|
|
b.gt .L128_dec_blocks_more_than_4
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block will go unused
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v1.16b
|
|
cmp x5, #48
|
|
b.gt .L128_dec_blocks_more_than_3
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block will go unused
|
|
mov v7.16b, v6.16b
|
|
cmp x5, #32
|
|
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
mov v6.16b, v1.16b
|
|
b.gt .L128_dec_blocks_more_than_2
|
|
|
|
cmp x5, #16
|
|
|
|
mov v7.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s //one more counter block will go unused
|
|
b.gt .L128_dec_blocks_more_than_1
|
|
|
|
sub v30.4s, v30.4s, v31.4s //only one (possibly partial) block remains
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
b .L128_dec_blocks_less_than_1
|
|
.L128_dec_blocks_more_than_7: //blocks left > 7
// GHASH the ciphertext block in v9 (with the running tag v16 folded in),
// initialising the high/mid/low accumulators v17/v18/v19, store its
// plaintext (v12), then load the next ciphertext block and decrypt it with v1.
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
|
|
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
|
|
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
.L128_dec_blocks_more_than_6: //blocks left > 6
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards), store its plaintext
// (v12), then load the next ciphertext block and decrypt it with v2.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
.L128_dec_blocks_more_than_5: //blocks left > 5
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards), store its plaintext
// (v12), then load the next ciphertext block and decrypt it with v3.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
|
|
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
|
|
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
.L128_dec_blocks_more_than_4: //blocks left > 4
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards), store its plaintext
// (v12), then load the next ciphertext block and decrypt it with v4.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
|
|
|
|
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
.L128_dec_blocks_more_than_3: //blocks left > 3
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards), store its plaintext
// (v12), then load the next ciphertext block and decrypt it with v5.
// q24/q25 are reloaded here with the h4k|h3k and h4l|h4h table entries.
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
.L128_dec_blocks_more_than_2: //blocks left > 2
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards), store its plaintext
// (v12), then load the next ciphertext block and decrypt it with v6.
// Uses v24 (h4k | h3k), which every path into this label has loaded.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
|
|
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
.L128_dec_blocks_more_than_1: //blocks left > 1
// GHASH the ciphertext block in v9 into the accumulators v17/v18/v19 (v16 is
// the running tag on first entry, zero afterwards) and store its plaintext
// (v12). Then load the final ciphertext block and decrypt it with v7 into
// v12; that result is masked and stored by the code that follows.
// v21 (h2k | h1k) loaded here is also used for the final block's mid product.
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final block - load ciphertext
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
|
|
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
.L128_dec_blocks_less_than_1: //blocks left <= 1
// Final, possibly partial, block. On entry v9 holds the last ciphertext block
// as loaded (16 bytes) and v12 its decryption.
//  1. Build a 128-bit mask in v0 from the bit length in x1 that keeps only
//     the valid bytes of the last block.
//  2. Mask the ciphertext, GHASH it, and reduce the accumulators to the tag.
//  3. Merge the plaintext with the bytes already at [x2] beyond the valid
//     length, and store all 16 bytes.
//  4. Write the tag to [x3] and the counter to [x16], restore d8-d15, and
//     return x9 in x0.
|
|
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
|
|
cmp x1, #64
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
|
|
csel x13, x8, x7, lt //mask for bytes 0-7: all ones if fewer than 64 bits are masked off, else the shifted mask
|
|
csel x14, x7, xzr, lt //mask for bytes 8-15: the shifted mask, or zero if 64 or more bits are masked off
|
|
|
|
mov v0.d[1], x14
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
|
|
|
|
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
st1 { v12.16b}, [x2] //store all 16B
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
|
|
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
|
|
|
|
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
|
|
ext v19.16b, v19.16b, v19.16b, #8 //swap the 64-bit halves of the reduced tag
|
|
rev64 v19.16b, v19.16b //byte-reverse each half back to memory order
|
|
st1 { v19.16b }, [x3] //write the updated tag to [x3]
|
|
rev32 v30.16b, v30.16b //byte-reverse each 32-bit counter word back to memory order
|
|
|
|
str q30, [x16] //store the updated counter
|
|
|
|
mov x0, x9 //return value = x9 (set outside this view; presumably the byte length, bit_length >> 3, as in the sibling functions)
|
|
|
|
ldp d10, d11, [sp, #16] //restore callee-saved d10-d15
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80 //restore d8/d9 and release the 80-byte frame
|
|
ret
|
|
.L128_dec_ret:
// Early-exit path: returns 0 without touching the stack frame. The branch to
// this label is outside this view; presumably it is the zero-length check at
// function entry, as in the sibling functions (cbz x1, ...).
|
|
mov w0, #0x0 //return 0 (a write to w0 also clears the upper half of x0)
|
|
ret
|
|
.size aesv8_gcm_8x_dec_128,.-aesv8_gcm_8x_dec_128
|
|
.globl aesv8_gcm_8x_enc_192
|
|
.hidden aesv8_gcm_8x_enc_192
|
|
.type aesv8_gcm_8x_enc_192,%function
|
|
.align 4
|
|
aesv8_gcm_8x_enc_192:
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L192_enc_ret
|
|
stp d8, d9, [sp, #-80]!
|
|
lsr x9, x1, #3
|
|
mov x16, x4
|
|
mov x11, x5
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64]
|
|
add x10, sp, #64
|
|
|
|
mov x5, x9
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
add x5, x5, x0
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 8
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 8
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 8
|
|
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 9
|
|
|
|
ld1 { v19.16b}, [x3]
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 9
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 9
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 9
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 9
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 9
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 10
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 9
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 10
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 10
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 10
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 10
|
|
|
|
aese v6.16b, v28.16b //AES block 6 - round 11
|
|
aese v3.16b, v28.16b //AES block 3 - round 11
|
|
|
|
aese v4.16b, v28.16b //AES block 4 - round 11
|
|
aese v7.16b, v28.16b //AES block 7 - round 11
|
|
ldr q26, [x11, #192] //load rk12
|
|
|
|
aese v1.16b, v28.16b //AES block 1 - round 11
|
|
aese v5.16b, v28.16b //AES block 5 - round 11
|
|
|
|
aese v2.16b, v28.16b //AES block 2 - round 11
|
|
aese v0.16b, v28.16b //AES block 0 - round 11
|
|
b.ge .L192_enc_tail //handle tail
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
|
|
|
|
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
|
|
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
|
|
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
|
|
|
|
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
|
|
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
|
|
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
|
|
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
|
|
|
|
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
|
|
b.ge .L192_enc_prepretail //do prepretail
|
|
|
|
// Steady-state loop: encrypts the next 8 counter blocks while hashing the
// 8 output blocks produced by the previous pass. As visible in the body:
//  - v0-v4 enter holding byte-reversed counter blocks; v5-v7 are derived
//    from v30 at the top of the loop. v30 is advanced by v31 for every new
//    counter block, and the next pass's v0-v4 are staged via v20/v22/v23/v25.
//  - v0-v7 go through AES rounds 0-11 with round keys reloaded from [x11];
//    rk12 (v26) is applied by the closing eor3 together with the plaintext.
//  - v8-v15 enter holding the previous 8 result blocks and v19 the running
//    hash. They are rev64'd and pmull'd against the hash-key values loaded
//    from [x6], accumulating into v17 (high), v18 (mid) and v19 (low), then
//    reduced using the constant loaded from [x10].
//  - 128 bytes are loaded from [x0] and 128 bytes stored to [x2], both
//    post-incremented; v8-v15 leave holding the new result blocks.
//  - Repeats while x0 < x5 (cmp + b.lt, i.e. a signed comparison).
.L192_enc_main_loop: //main loop start
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
ldr q26, [x11, #192] //load rk12
|
|
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
|
|
|
|
aese v4.16b, v28.16b //AES block 8k+12 - round 11
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
|
|
aese v2.16b, v28.16b //AES block 8k+10 - round 11
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
aese v5.16b, v28.16b //AES block 8k+13 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
|
|
aese v7.16b, v28.16b //AES block 8k+15 - round 11
|
|
aese v0.16b, v28.16b //AES block 8k+8 - round 11
|
|
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
|
|
|
|
aese v6.16b, v28.16b //AES block 8k+14 - round 11
|
|
aese v3.16b, v28.16b //AES block 8k+11 - round 11
|
|
aese v1.16b, v28.16b //AES block 8k+9 - round 11
|
|
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
|
|
|
|
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
|
|
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
|
|
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
|
|
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
|
|
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
|
|
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
|
|
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
|
|
b.lt .L192_enc_main_loop
|
|
|
|
.L192_enc_prepretail: //PREPRETAIL
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ldr q26, [x11, #192] //load rk12
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
|
|
aese v1.16b, v28.16b //AES block 8k+9 - round 11
|
|
aese v7.16b, v28.16b //AES block 8k+15 - round 11
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v3.16b, v28.16b //AES block 8k+11 - round 11
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
aese v2.16b, v28.16b //AES block 8k+10 - round 11
|
|
aese v0.16b, v28.16b //AES block 8k+8 - round 11
|
|
|
|
aese v6.16b, v28.16b //AES block 8k+14 - round 11
|
|
aese v4.16b, v28.16b //AES block 8k+12 - round 11
|
|
aese v5.16b, v28.16b //AES block 8k+13 - round 11
|
|
|
|
//Tail of the encrypt path: processes the blocks left over after the 8-block loop.
//x5 is recomputed as the number of input bytes still to process (x4 - x0) and the
//compare chain below branches to the handler for that many remaining blocks.
//The first remaining block is always encrypted here (v9 = plaintext ^ v0 ^ v29).
//When fewer than 8 blocks remain, the pending AES blocks are shifted up one
//register per missing block so that the fixed registers used by the handlers
//(v2..v7, with the final block always in v7) line up, and the counter in v30 is
//stepped back by one increment (v31) per block that will not be consumed.
.L192_enc_tail: //TAIL
|
|
|
|
ldp q20, q21, [x6, #96] //load h5l | h5h and h6k | h5k
|
|
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
|
|
|
|
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
|
|
|
|
ldp q24, q25, [x6, #160] //load h8k | h7k and h8l | h8h
|
|
|
|
mov v29.16b, v26.16b //keep the last round key in v29 (rk12 on the fall-through path); v26 is reused as a GHASH temporary below
|
|
|
|
ldp q22, q23, [x6, #128] //load h6l | h6h and h7l | h7h
|
|
cmp x5, #112 //more than 7 blocks left?
|
|
|
|
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
|
|
b.gt .L192_enc_blocks_more_than_7
|
|
|
|
//At most 7 blocks left: clear the GHASH accumulators (v17 high, v19 low, v18 mid;
//the running tag was saved in v16 above) and shift v1..v6 up into v2..v7.
//The vector instructions between each cmp and its b.gt do not write the flags.
cmp x5, #96 //more than 6 blocks left?
|
|
mov v7.16b, v6.16b
|
|
movi v17.8b, #0 //zero GHASH high accumulator
|
|
|
|
mov v6.16b, v5.16b
|
|
movi v19.8b, #0 //zero GHASH low accumulator
|
|
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v2.16b
|
|
|
|
mov v2.16b, v1.16b
|
|
movi v18.8b, #0 //zero GHASH mid accumulator
|
|
b.gt .L192_enc_blocks_more_than_6
|
|
|
|
//At most 6 blocks left: shift again; v1 now feeds v3.
mov v7.16b, v6.16b
|
|
cmp x5, #80 //more than 5 blocks left?
|
|
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
|
|
mov v3.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
b.gt .L192_enc_blocks_more_than_5
|
|
|
|
//At most 5 blocks left: v1 now feeds v4.
cmp x5, #64 //more than 4 blocks left?
|
|
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v1.16b
|
|
b.gt .L192_enc_blocks_more_than_4
|
|
|
|
//At most 4 blocks left: v1 now feeds v5.
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v1.16b
|
|
|
|
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
cmp x5, #48 //more than 3 blocks left?
|
|
b.gt .L192_enc_blocks_more_than_3
|
|
|
|
//At most 3 blocks left: v1 now feeds v6.
mov v7.16b, v6.16b
|
|
mov v6.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
cmp x5, #32 //more than 2 blocks left?
|
|
b.gt .L192_enc_blocks_more_than_2
|
|
|
|
//At most 2 blocks left: v1 feeds v7 directly.
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
|
|
cmp x5, #16 //more than 1 block left?
|
|
mov v7.16b, v1.16b
|
|
b.gt .L192_enc_blocks_more_than_1
|
|
|
|
//Only the (possibly partial) final block is left; it is already in v9.
sub v30.4s, v30.4s, v31.4s //step the counter back for one unused block
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
b .L192_enc_blocks_less_than_1
|
|
//More than 7 blocks left: store the "final-7" ciphertext block held in v9, start the
//GHASH accumulators from it (v19 low, v17 high, v18 mid are written directly here,
//not accumulated), and encrypt the next plaintext block into v9.
//Falls through into the next handler.
.L192_enc_blocks_more_than_7: //blocks left > 7
|
|
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final-6 block - load plaintext
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
//More than 6 blocks left: store the "final-6" ciphertext block held in v9, fold it
//into the GHASH accumulators (v19 low, v17 high, v18 mid) using v23 and the low half
//of v24, and encrypt the next plaintext block into v9 (AES block in v2, last round
//key in v29). Falls through into the next handler.
.L192_enc_blocks_more_than_6: //blocks left > 6
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
|
|
ldr q9, [x0], #16 //AES final-5 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
//More than 5 blocks left: store the "final-5" ciphertext block held in v9, fold it
//into the GHASH accumulators (v19 low, v17 high, v18 mid) using v22 and the high half
//of v21, and encrypt the next plaintext block into v9 (AES block in v3, last round
//key in v29). Falls through into the next handler.
.L192_enc_blocks_more_than_5: //blocks left > 5
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final-4 block - load plaintext
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid (copy to the high lane for the pmull2 below)
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
|
|
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
//More than 4 blocks left: store the "final-4" ciphertext block held in v9, fold it
//into the GHASH accumulators (v19 low, v17 high, v18 mid) using v20 and the low half
//of v21, and encrypt the next plaintext block into v9 (AES block in v4, last round
//key in v29). Falls through into the next handler.
.L192_enc_blocks_more_than_4: //blocks left > 4
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ldr q9, [x0], #16 //AES final-3 block - load plaintext
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
//More than 3 blocks left: reload v24/v25 with the h4k | h3k and h4l | h4h table
//entries, store the "final-3" ciphertext block held in v9, fold it into the GHASH
//accumulators (v19 low, v17 high, v18 mid), and encrypt the next plaintext block
//into v9 (AES block in v5, last round key in v29). Falls through into the next handler.
.L192_enc_blocks_more_than_3: //blocks left > 3
|
|
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ldr q9, [x0], #16 //AES final-2 block - load plaintext
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
|
|
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid (copy to the high lane for the pmull2 below)
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
//More than 2 blocks left: store the "final-2" ciphertext block held in v9, fold it
//into the GHASH accumulators (v19 low, v17 high, v18 mid) using v23 (loaded here)
//and the low half of v24, and encrypt the next plaintext block into v9 (AES block
//in v6, last round key in v29). Falls through into the next handler.
.L192_enc_blocks_more_than_2: //blocks left > 2
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ldr q9, [x0], #16 //AES final-1 block - load plaintext
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
//More than 1 block left: store the "final-1" ciphertext block held in v9, fold it
//into the GHASH accumulators (v19 low, v17 high, v18 mid) using v22 and the high half
//of v21 (both loaded here), and encrypt the final plaintext block into v9 (AES block
//in v7, last round key in v29). Falls through into the final-block handler.
.L192_enc_blocks_more_than_1: //blocks left > 1
|
|
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final block - load plaintext
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid (copy to the high lane for the pmull2 below)
|
|
|
|
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
//Final (possibly partial) block and function exit.
//Builds a 128-bit mask in v0 from the bit length in x1, zeroes the unused part of
//the last result block in v9, merges in the bytes already present at [x2] so that
//a full 16 bytes can be stored, folds the masked block into GHASH, reduces the
//accumulators (v17 high, v18 mid, v19 low) using the constant at [x10], writes the
//updated counter to [x16] and the tag state to [x3], restores d8-d15 and returns x9.
.L192_enc_blocks_less_than_1: //blocks left <= 1
|
|
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
|
|
cmp x1, #64
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
|
|
csel x13, x8, x7, lt //low 64b mask: all ones if fewer than 64 bits are unused, else the shifted mask
|
|
csel x14, x7, xzr, lt //high 64b mask: the shifted mask if fewer than 64 bits are unused, else zero
|
|
|
|
mov v0.d[1], x14
|
|
ldr q20, [x6] //load h1l | h1h
|
|
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block
|
|
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
|
|
|
|
st1 { v9.16b}, [x2] //store all 16B
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
rev32 v30.16b, v30.16b //byte-swap each 32-bit counter word back before storing
|
|
|
|
str q30, [x16] //store the updated counter
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
ext v19.16b, v19.16b, v19.16b, #8 //swap the two 64-bit halves of the reduced tag
|
|
rev64 v19.16b, v19.16b //byte-reverse each 64-bit half
|
|
st1 { v19.16b }, [x3] //write the tag state back to [x3]
|
|
|
|
mov x0, x9 //return sizes
|
|
|
|
ldp d10, d11, [sp, #16] //restore callee-saved d10-d15
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80 //restore d8/d9 and release the 80-byte frame
|
|
ret
|
|
|
|
//Early-exit path of aesv8_gcm_8x_enc_192: returns 0 without touching the stack.
//NOTE(review): the branch that reaches this label is outside this excerpt; presumably
//it is the zero-length check at function entry, as in aesv8_gcm_8x_dec_192 - confirm.
.L192_enc_ret:
|
|
mov w0, #0x0 //return value 0
|
|
ret
|
|
.size aesv8_gcm_8x_enc_192,.-aesv8_gcm_8x_enc_192
|
|
//aesv8_gcm_8x_dec_192: AES-GCM decryption, 8 blocks at a time, 13 round keys (rk0..rk12).
//This part is the function entry: frame setup, counter setup for blocks 0..7, the AES
//rounds for those 8 counter blocks, and (when the input is long enough) decryption of
//the first 8 ciphertext blocks before entering the main loop.
//
//Register use as visible in this code:
//  x0  in:  input (ciphertext) pointer, post-incremented as blocks are loaded
//  x1  in:  length in bits (x9 = x1 >> 3 is the byte length); zero => early return
//  x2  in:  output pointer, post-incremented as blocks are stored
//  x3  in:  pointer to the 16-byte GHASH tag state, loaded into v19
//  x4  in:  pointer to the counter block (copied to x16); then reused as end_input_ptr
//  x5  in:  pointer to the round keys (copied to x11); then reused as main-loop end pointer
//  x6  in:  pointer to the table of H powers used by GHASH (read in the main loop)
//  v0-v7    AES state for 8 counter blocks; v8-v15 ciphertext blocks (callee-saved low
//           halves d8-d15 are spilled to the stack); v26-v28 rotating round keys;
//           v30 byte-swapped running counter; v31 counter increment
//Stack: 80-byte frame; d8-d15 at [sp,#0..#63], and 0xc200000000000000 followed by a
//zero doubleword at [sp,#64] with x10 pointing to it (presumably the GHASH reduction
//constant, as read by "ldr d16, [x10]" in the encrypt path).
.globl aesv8_gcm_8x_dec_192
|
|
.hidden aesv8_gcm_8x_dec_192
|
|
.type aesv8_gcm_8x_dec_192,%function
|
|
.align 4
|
|
aesv8_gcm_8x_dec_192:
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L192_dec_ret //nothing to do for a zero bit length
|
|
stp d8, d9, [sp, #-80]! //allocate the 80-byte frame and save d8/d9
|
|
lsr x9, x1, #3 //byte_len = bit_len / 8
|
|
mov x16, x4 //x16 = counter block pointer
|
|
mov x11, x5 //x11 = round key pointer
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64] //place the constant (and a zero doubleword) on the stack
|
|
add x10, sp, #64 //x10 = address of the constant
|
|
|
|
|
|
mov x5, x9 //x5 = byte_len
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
//NOTE(review): v19 is loaded from [x3] again further down, before any instruction reads it; this first load looks redundant
ld1 { v19.16b}, [x3]
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15 //v31 = 1 in the last 32-bit lane, 0 elsewhere
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 8
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 8
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 8
|
|
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 9
|
|
|
|
ld1 { v19.16b}, [x3] //load the current tag state
|
|
ext v19.16b, v19.16b, v19.16b, #8 //swap the two 64-bit halves
|
|
rev64 v19.16b, v19.16b //byte-reverse each 64-bit half
|
|
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 9
|
|
add x5, x5, x0 //x5 = input address at which the main loop stops
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 9
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 9
|
|
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 9
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 9
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 9
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 10
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 10
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 10
|
|
ldr q26, [x11, #192] //load rk12
|
|
|
|
aese v0.16b, v28.16b //AES block 0 - round 11
|
|
aese v1.16b, v28.16b //AES block 1 - round 11
|
|
aese v4.16b, v28.16b //AES block 4 - round 11
|
|
|
|
aese v6.16b, v28.16b //AES block 6 - round 11
|
|
aese v5.16b, v28.16b //AES block 5 - round 11
|
|
aese v7.16b, v28.16b //AES block 7 - round 11
|
|
|
|
aese v2.16b, v28.16b //AES block 2 - round 11
|
|
aese v3.16b, v28.16b //AES block 3 - round 11
|
|
//the flags tested here come from the "cmp x0, x5" above; the AES and load instructions in between do not write them
b.ge .L192_dec_tail //handle tail
|
|
|
|
//First 8 blocks: result = ciphertext ^ AES block ^ rk12 (v26). The ciphertext stays in
//v8-v15 for the GHASH done in the main loop, and v0-v4 are refilled with the next counters.
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
|
|
|
|
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
|
|
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
|
|
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
|
|
|
|
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
|
|
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
|
|
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
|
|
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
|
|
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
|
|
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
|
|
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
|
|
b.ge .L192_dec_prepretail //do prepretail
|
|
|
|
.L192_dec_main_loop: //main loop start
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
|
|
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
|
|
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
ldr q26, [x11, #192] //load rk12
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
|
|
aese v0.16b, v28.16b //AES block 8k+8 - round 11
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
aese v1.16b, v28.16b //AES block 8k+9 - round 11
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
aese v6.16b, v28.16b //AES block 8k+14 - round 11
|
|
aese v3.16b, v28.16b //AES block 8k+11 - round 11
|
|
|
|
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
|
|
aese v4.16b, v28.16b //AES block 8k+12 - round 11
|
|
aese v2.16b, v28.16b //AES block 8k+10 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
|
|
aese v7.16b, v28.16b //AES block 8k+15 - round 11
|
|
aese v5.16b, v28.16b //AES block 8k+13 - round 11
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
|
|
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
|
|
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
|
|
|
|
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
|
|
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
|
|
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
|
|
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
|
|
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
|
|
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
|
|
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
|
|
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
b.lt .L192_dec_main_loop
|
|
|
|
.L192_dec_prepretail: //PREPRETAIL
// Reached by fall-through when the `b.lt .L192_dec_main_loop` just above is not taken
// (it may also be branched to from code outside this view).
// Three instruction streams are interleaved here; the order is hand-scheduled, so only
// comments should be edited:
//  * CTR   - v5, v6, v7 receive the next byte-reversed counter values derived from v30;
//            v30 is advanced by v31 after each one. On the fall-through path v0-v4 were
//            already filled at the bottom of the main loop.
//  * AES   - rounds 0..11 are applied to v0-v7 with round keys streamed from [x11]
//            through v26/v27/v28. Round 11 is a bare `aese` (no `aesmc`). rk12 is loaded
//            into v26 and is not consumed in this section; the tail XORs it in.
//  * GHASH - the eight blocks held in v8-v15 are byte-reversed (rev64), multiplied by the
//            hash-key values loaded from [x6] (h1..h8 and the hNk "mid" values), and
//            accumulated into v17 (high), v18 (mid) and v19 (low). The previous
//            accumulator in v19 is rotated and XORed into the first block (PRE 0/PRE 1).
//  * MODULO - v17 and v18 are folded down into v19 using the constant read from [x10].
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ldr q26, [x11, #192] //load rk12
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
|
|
aese v0.16b, v28.16b //AES block 8k+8 - round 11
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
aese v5.16b, v28.16b //AES block 8k+13 - round 11
|
|
|
|
aese v2.16b, v28.16b //AES block 8k+10 - round 11
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
|
|
aese v6.16b, v28.16b //AES block 8k+14 - round 11
|
|
aese v4.16b, v28.16b //AES block 8k+12 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
aese v3.16b, v28.16b //AES block 8k+11 - round 11
|
|
aese v1.16b, v28.16b //AES block 8k+9 - round 11
|
|
aese v7.16b, v28.16b //AES block 8k+15 - round 11
|
|
|
|
.L192_dec_tail: //TAIL
// Tail dispatcher: decides how many 16-byte blocks (1..8, the last possibly partial)
// are still to be processed and jumps into the matching step of the unrolled chain
// .L192_dec_blocks_more_than_7 ... .L192_dec_blocks_less_than_1.
//
// State prepared here for the chain:
//   v29 = copy of v26 (loaded as rk12 by the preceding code), XORed into every result
//   v16 = v19 with its 64-bit halves swapped - the running tag fed into the first block
//   v12 = first output block (ciphertext in v9 ^ keystream v0 ^ v29)
// Every not-taken branch below means one fewer block remains, so the keystream
// registers are moved up by one (v1 always ends up in the slot for the second
// block) and v30 is stepped back by v31 once for the counter value that will not be used.
// v17/v18/v19 are cleared on that path because .L192_dec_blocks_more_than_7, the only
// step that writes them without reading them first, is skipped.
|
|
|
|
sub x5, x4, x0 //x5 = x4 - x0 = number of bytes left to process (x4 is set outside this view; presumably end of input)
|
|
|
|
ldp q20, q21, [x6, #96] //load h5l | h5h (q20) and h6k | h5k (q21)
|
|
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
|
|
|
|
ldp q24, q25, [x6, #160] //load h8k | h7k (q24) and h8l | h8h (q25)
|
|
|
|
mov v29.16b, v26.16b
|
|
|
|
ldp q22, q23, [x6, #128] //load h6l | h6h (q22) and h7l | h7h (q23)
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
|
|
|
|
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
cmp x5, #112
|
|
b.gt .L192_dec_blocks_more_than_7
|
|
|
|
// <= 112 bytes left: at most 7 blocks
mov v7.16b, v6.16b
|
|
movi v17.8b, #0
|
|
sub v30.4s, v30.4s, v31.4s
|
|
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
|
|
cmp x5, #96
|
|
movi v19.8b, #0
|
|
mov v3.16b, v2.16b
|
|
|
|
mov v2.16b, v1.16b
|
|
movi v18.8b, #0
|
|
b.gt .L192_dec_blocks_more_than_6
|
|
|
|
// <= 96 bytes left: at most 6 blocks
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v1.16b
|
|
|
|
sub v30.4s, v30.4s, v31.4s
|
|
cmp x5, #80
|
|
b.gt .L192_dec_blocks_more_than_5
|
|
|
|
// <= 80 bytes left: at most 5 blocks
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v1.16b
|
|
cmp x5, #64
|
|
|
|
sub v30.4s, v30.4s, v31.4s
|
|
b.gt .L192_dec_blocks_more_than_4
|
|
|
|
// <= 64 bytes left: at most 4 blocks
sub v30.4s, v30.4s, v31.4s
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v1.16b
|
|
cmp x5, #48
|
|
b.gt .L192_dec_blocks_more_than_3
|
|
|
|
// <= 48 bytes left: at most 3 blocks
sub v30.4s, v30.4s, v31.4s
|
|
mov v7.16b, v6.16b
|
|
cmp x5, #32
|
|
|
|
mov v6.16b, v1.16b
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
b.gt .L192_dec_blocks_more_than_2
|
|
|
|
// <= 32 bytes left: at most 2 blocks
sub v30.4s, v30.4s, v31.4s
|
|
|
|
mov v7.16b, v1.16b
|
|
cmp x5, #16
|
|
b.gt .L192_dec_blocks_more_than_1
|
|
|
|
// <= 16 bytes left: a single (possibly partial) block
sub v30.4s, v30.4s, v31.4s
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
b .L192_dec_blocks_less_than_1
|
|
.L192_dec_blocks_more_than_7: //blocks left > 7
// Tail step "final-7": only entered when more than 112 bytes remain.
//  * GHASH: the ciphertext block in v9 is byte-reversed, XORed with the running tag
//    in v16 and multiplied by v25 (loaded from [x6, #176]; h8l | h8h per the load
//    comments elsewhere in this function). The products INITIALISE v17 (high),
//    v19 (low) and v18 (mid) rather than accumulating into them.
//    The mid product uses v24.d[1] (h8k half of "h8k | h7k").
//  * AES: stores the output block already in v12, then loads the next ciphertext
//    block and forms the next output as v9 ^ v1 ^ v29.
// Falls through into .L192_dec_blocks_more_than_6.
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
|
|
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
|
|
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
|
|
|
|
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
.L192_dec_blocks_more_than_6: //blocks left > 6
// Tail step "final-6".
//  * GHASH: v9 (byte-reversed, XORed with v16, which is zero unless this is the
//    first tail step executed) is multiplied by v23 for the high/low products and
//    by the low half of v24 for the mid product; results are XOR-accumulated into
//    v17 (high), v18 (mid), v19 (low).
//  * AES: stores v12, loads the next ciphertext block, next output = v9 ^ v2 ^ v29.
// Falls through into .L192_dec_blocks_more_than_5.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
|
|
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
.L192_dec_blocks_more_than_5: //blocks left > 5
// Tail step "final-5".
//  * GHASH: v9 (byte-reversed, XORed with v16) is multiplied by v22 for the
//    high/low products. The mid term (high half ^ low half of the block) is
//    duplicated into v27.d[1] so that pmull2 can multiply it by the HIGH half of
//    v21. Results are XOR-accumulated into v17/v18/v19.
//  * AES: stores v12, loads the next ciphertext block, next output = v9 ^ v3 ^ v29.
// Falls through into .L192_dec_blocks_more_than_4.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
|
|
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
.L192_dec_blocks_more_than_4: //blocks left > 4
// Tail step "final-4".
//  * GHASH: v9 (byte-reversed, XORed with v16) is multiplied by v20 for the
//    high/low products and by the LOW half of v21 for the mid product; results
//    are XOR-accumulated into v17/v18/v19.
//  * AES: stores v12, loads the next ciphertext block, next output = v9 ^ v4 ^ v29.
// Falls through into .L192_dec_blocks_more_than_3.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
|
|
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
.L192_dec_blocks_more_than_3: //blocks left > 3
// Tail step "final-3".
//  * Reloads v25 from [x6, #80] and v24 from [x6, #64]; from here on the tail
//    replaces the higher hash-key values that were live in these registers.
//  * GHASH: v9 (byte-reversed, XORed with v16) is multiplied by v25 for the
//    high/low products; the mid term is duplicated into v27.d[1] and multiplied by
//    the HIGH half of v24 via pmull2. Results are XOR-accumulated into v17/v18/v19.
//  * AES: stores v12, loads the next ciphertext block, next output = v9 ^ v5 ^ v29.
// Falls through into .L192_dec_blocks_more_than_2.
|
|
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
|
|
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
|
|
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
|
|
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
.L192_dec_blocks_more_than_2: //blocks left > 2
// Tail step "final-2".
//  * Loads v23 from [x6, #48]. v24 must already hold the value at [x6, #64]: it is
//    loaded either by the previous step or by the dispatcher right before it
//    branches here.
//  * GHASH: v9 (byte-reversed, XORed with v16) is multiplied by v23 for the
//    high/low products and by the LOW half of v24 for the mid product; results
//    are XOR-accumulated into v17/v18/v19.
//  * AES: stores v12, loads the next ciphertext block, next output = v9 ^ v6 ^ v29.
// Falls through into .L192_dec_blocks_more_than_1.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
|
|
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
.L192_dec_blocks_more_than_1: //blocks left > 1
// Tail step "final-1" (the last block that is guaranteed to be a full 16 bytes).
//  * Loads v22 from [x6, #32] and v21 from [x6, #16].
//  * GHASH: v9 (byte-reversed, XORed with v16) is multiplied by v22 for the
//    high/low products; the mid term is duplicated into v27.d[1] and multiplied by
//    the HIGH half of v21 via pmull2. Results are XOR-accumulated into v17/v18/v19.
//  * AES: stores v12, loads the final ciphertext block, and forms the final output
//    as v9 ^ v7 ^ v29. That result is NOT stored here; the next section masks and
//    stores it.
// Falls through into .L192_dec_blocks_less_than_1.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
ldr q9, [x0], #16 //AES final block - load ciphertext
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
|
|
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
|
|
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
.L192_dec_blocks_less_than_1: //blocks left <= 1
// Final (possibly partial) block, tag finalisation and function epilogue.
//
// 1. Counter write-back: v30 is byte-swapped back per 32-bit lane and stored to [x16].
// 2. Byte mask for the last block. After the arithmetic below
//        x1 = (128 - (x1 % 128)) % 128
//    i.e. the number of UNUSED bits in the last block (0 when it is full), and
//    x7 = ~0 >> x1 (a register-specified LSR uses the shift amount modulo 64).
//      x1 <  64 : mask.d[0] = all ones, mask.d[1] = x7
//      x1 >= 64 : mask.d[0] = x7,       mask.d[1] = 0
//    The mask is assembled in v0.
// 3. The ciphertext in v9 is ANDed with the mask before it is hashed. The output
//    block v12 is merged (bif) with the 16 bytes already present at [x2], so bytes
//    beyond the valid length are rewritten with their existing values by the
//    full 16-byte store.
// 4. GHASH of the final block with v20 (loaded from [x6]) and the LOW half of v21,
//    accumulated into v17/v18/v19, followed by the Karatsuba fix-up and the
//    two-step reduction with the constant read from [x10].
// 5. The reduced value in v19 is half-swapped and byte-reversed, then stored to [x3].
// 6. Returns x9 in x0 (x9 is set outside this view; presumably the byte length)
//    and restores d8-d15 from the 80-byte frame.
|
|
|
|
rev32 v30.16b, v30.16b
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
str q30, [x16] //store the updated counter
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
|
|
cmp x1, #64
|
|
|
|
csel x13, x8, x7, lt
|
|
csel x14, x7, xzr, lt
|
|
ldr q20, [x6] //load h1l | h1h
|
|
|
|
mov v0.d[1], x14
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block
|
|
|
|
st1 { v12.16b}, [x2] //store all 16B
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
|
|
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
|
|
|
|
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
st1 { v19.16b }, [x3]
|
|
|
|
mov x0, x9
|
|
|
|
ldp d10, d11, [sp, #16]
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80
|
|
ret
|
|
|
|
.L192_dec_ret:
|
|
mov w0, #0x0
|
|
ret
|
|
.size aesv8_gcm_8x_dec_192,.-aesv8_gcm_8x_dec_192
|
|
.globl aesv8_gcm_8x_enc_256
|
|
.hidden aesv8_gcm_8x_enc_256
|
|
.type aesv8_gcm_8x_enc_256,%function
|
|
.align 4
|
|
aesv8_gcm_8x_enc_256:
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L256_enc_ret
|
|
stp d8, d9, [sp, #-80]!
|
|
lsr x9, x1, #3
|
|
mov x16, x4
|
|
mov x11, x5
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64]
|
|
add x10, sp, #64
|
|
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
|
|
mov x5, x9
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
|
|
add x5, x5, x0
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 8
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 8
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 8
|
|
|
|
ld1 { v19.16b}, [x3]
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 9
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 9
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 9
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 9
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 9
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 9
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 10
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 10
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 9
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 10
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 10
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 11
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 11
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 11
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 11
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 11
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 11
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 11
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 11
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
ldr q28, [x11, #224] //load rk14
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 12
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 12
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 12
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 12
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 12
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 12
|
|
|
|
aese v2.16b, v27.16b //AES block 2 - round 13
|
|
aese v1.16b, v27.16b //AES block 1 - round 13
|
|
aese v4.16b, v27.16b //AES block 4 - round 13
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 12
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 12
|
|
|
|
aese v0.16b, v27.16b //AES block 0 - round 13
|
|
aese v5.16b, v27.16b //AES block 5 - round 13
|
|
|
|
aese v6.16b, v27.16b //AES block 6 - round 13
|
|
aese v7.16b, v27.16b //AES block 7 - round 13
|
|
aese v3.16b, v27.16b //AES block 3 - round 13
|
|
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
b.ge .L256_enc_tail //handle tail
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
|
|
|
|
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
|
|
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
|
|
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
|
|
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
|
|
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
|
|
|
|
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
|
|
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
|
|
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
|
|
|
|
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
|
|
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
b.ge .L256_enc_prepretail //do prepretail
|
|
|
|
.L256_enc_main_loop: //main loop start
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
|
|
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
|
|
ldr q28, [x11, #224] //load rk14
|
|
aese v7.16b, v27.16b //AES block 8k+15 - round 13
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
|
|
aese v2.16b, v27.16b //AES block 8k+10 - round 13
|
|
aese v4.16b, v27.16b //AES block 8k+12 - round 13
|
|
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
aese v5.16b, v27.16b //AES block 8k+13 - round 13
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
|
|
aese v3.16b, v27.16b //AES block 8k+11 - round 13
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
|
|
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
|
|
aese v0.16b, v27.16b //AES block 8k+8 - round 13
|
|
aese v6.16b, v27.16b //AES block 8k+14 - round 13
|
|
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
|
|
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v1.16b, v27.16b //AES block 8k+9 - round 13
|
|
|
|
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
|
|
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
|
|
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
|
|
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
|
|
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
|
|
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
|
|
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
|
|
|
|
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
b.lt .L256_enc_main_loop
|
|
|
|
.L256_enc_prepretail: //PREPRETAIL
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
|
|
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
|
|
ldr q28, [x11, #224] //load rk14
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
|
|
aese v0.16b, v27.16b //AES block 8k+8 - round 13
|
|
|
|
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
|
|
aese v5.16b, v27.16b //AES block 8k+13 - round 13
|
|
aese v1.16b, v27.16b //AES block 8k+9 - round 13
|
|
|
|
aese v3.16b, v27.16b //AES block 8k+11 - round 13
|
|
aese v4.16b, v27.16b //AES block 8k+12 - round 13
|
|
aese v7.16b, v27.16b //AES block 8k+15 - round 13
|
|
|
|
aese v2.16b, v27.16b //AES block 8k+10 - round 13
|
|
aese v6.16b, v27.16b //AES block 8k+14 - round 13
|
|
.L256_enc_tail: //TAIL
// Tail dispatch. Computes the number of input bytes still to process
// (x5 = x4 - x0), produces the first tail ciphertext block in v9, and then
// jumps into the .L256_enc_blocks_more_than_N chain at the entry point that
// matches how many 16-byte blocks remain.
//
// On the fall-through path visible just above, v0-v7 hold keystream blocks
// after the round-13 aese and v28 holds the key loaded from [x11, #224];
// the final key XOR is folded into each eor3 through v29 (a copy of v28).
// NOTE(review): other branches into this label are outside this excerpt;
// presumably they establish the same register state - confirm.
//
// When fewer than 8 blocks remain, each skipped entry point costs one
// "shift" of the keystream registers (v1 is moved up so that the register
// the chosen entry point consumes next holds the v1 keystream) and one
// decrement of the counter in v30 by v31.
|
|
|
|
ldp q24, q25, [x6, #160] //q24 <- [x6,#160], q25 <- [x6,#176] (h8l | h8h)
|
|
sub x5, x4, x0 //x5 (main_end_input_ptr) now holds the number of bytes left to process
|
|
|
|
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
|
|
|
|
ldp q20, q21, [x6, #96] //q20 <- [x6,#96] (h5l | h5h), q21 <- [x6,#112]
|
|
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag (v19 with its 64-bit halves swapped)
|
|
ldp q22, q23, [x6, #128] //q22 <- [x6,#128] (h6l | h6h), q23 <- [x6,#144]
|
|
mov v29.16b, v28.16b //v29 = last round key, used by every eor3 in the tail
|
|
|
|
cmp x5, #112
|
|
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
b.gt .L256_enc_blocks_more_than_7 //more than 112 bytes left: take the full chain
|
|
|
|
// 112 bytes or fewer: the more_than_7 section (which initialises v17/v18/v19)
// is skipped, so clear the three GHASH accumulators here instead.
movi v19.8b, #0
|
|
mov v7.16b, v6.16b
|
|
movi v17.8b, #0
|
|
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
|
|
mov v3.16b, v2.16b
|
|
sub v30.4s, v30.4s, v31.4s //one keystream block unused: step the counter back
|
|
mov v2.16b, v1.16b
|
|
|
|
movi v18.8b, #0
|
|
cmp x5, #96
|
|
b.gt .L256_enc_blocks_more_than_6
|
|
|
|
// 96 bytes or fewer: shift again so that v3 holds the v1 keystream.
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
cmp x5, #80
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v1.16b
|
|
|
|
sub v30.4s, v30.4s, v31.4s
|
|
b.gt .L256_enc_blocks_more_than_5 //uses the flags from cmp x5, #80 (vector ops leave flags alone)
|
|
|
|
// 80 bytes or fewer: v4 takes the v1 keystream.
mov v7.16b, v6.16b
|
|
sub v30.4s, v30.4s, v31.4s
|
|
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v4.16b
|
|
|
|
cmp x5, #64
|
|
mov v4.16b, v1.16b
|
|
b.gt .L256_enc_blocks_more_than_4
|
|
|
|
// 64 bytes or fewer: v5 takes the v1 keystream.
cmp x5, #48
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s
|
|
b.gt .L256_enc_blocks_more_than_3
|
|
|
|
// 48 bytes or fewer: v6 takes the v1 keystream. q24 is reloaded here because
// the more_than_3 section, which would otherwise load it, is being skipped.
cmp x5, #32
|
|
mov v7.16b, v6.16b
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
mov v6.16b, v1.16b
|
|
sub v30.4s, v30.4s, v31.4s
|
|
b.gt .L256_enc_blocks_more_than_2
|
|
|
|
// 32 bytes or fewer: v7 takes the v1 keystream.
mov v7.16b, v1.16b
|
|
|
|
sub v30.4s, v30.4s, v31.4s
|
|
cmp x5, #16
|
|
b.gt .L256_enc_blocks_more_than_1
|
|
|
|
// 16 bytes or fewer: only the block already in v9 remains. q21 is loaded here
// because the more_than_1 section, which would otherwise load it, is skipped.
sub v30.4s, v30.4s, v31.4s
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
b .L256_enc_blocks_less_than_1
|
|
.L256_enc_blocks_more_than_7: //blocks left > 7
// First link of the tail chain (reached only from the b.gt in .L256_enc_tail).
// Stores the ciphertext block in v9, starts the GHASH accumulators from it
// (v17 = high, v18 = mid, v19 = low are written here, not XOR-accumulated),
// and prepares the next ciphertext block in v9 from keystream v1.
// Multipliers: v25 for high/low; the upper 64 bits of v24 for the mid term.
// Falls through into .L256_enc_blocks_more_than_6.
|
|
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ldr q9, [x0], #16 //AES final-6 block - load plaintext
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
.L256_enc_blocks_more_than_6: //blocks left > 6
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the next
// ciphertext block in v9 from keystream v2.
// Multipliers: v23 for high/low; the lower 64 bits of v24 for the mid term.
// Falls through into .L256_enc_blocks_more_than_5.
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
|
|
ldr q9, [x0], #16 //AES final-5 block - load plaintext
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
.L256_enc_blocks_more_than_5: //blocks left > 5
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the next
// ciphertext block in v9 from keystream v3.
// Multipliers: v22 for high/low; the upper 64 bits of v21 for the mid term
// (the folded value is duplicated into v27.d[1] so that pmull2 can be used).
// Falls through into .L256_enc_blocks_more_than_4.
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final-4 block - load plaintext
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
.L256_enc_blocks_more_than_4: //blocks left > 4
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the next
// ciphertext block in v9 from keystream v4.
// Multipliers: v20 for high/low; the lower 64 bits of v21 for the mid term.
// Falls through into .L256_enc_blocks_more_than_3.
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
ldr q9, [x0], #16 //AES final-3 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
|
|
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
.L256_enc_blocks_more_than_3: //blocks left > 3
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the next
// ciphertext block in v9 from keystream v5.
// Reloads v25 from [x6, #80] and v24 from [x6, #64], replacing the values
// the earlier tail sections used. Multipliers: v25 for high/low; the upper
// 64 bits of v24 for the mid term.
// Falls through into .L256_enc_blocks_more_than_2.
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
|
|
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
|
|
ldr q9, [x0], #16 //AES final-2 block - load plaintext
|
|
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
|
|
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
.L256_enc_blocks_more_than_2: //blocks left > 2
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the next
// ciphertext block in v9 from keystream v6.
// Reloads v23 from [x6, #48]. Multipliers: v23 for high/low; the lower
// 64 bits of v24 for the mid term (v24 is loaded from [x6, #64] either by
// the previous section or by the dispatch code before branching here).
// Falls through into .L256_enc_blocks_more_than_1.
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
ldr q9, [x0], #16 //AES final-1 block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
.L256_enc_blocks_more_than_1: //blocks left > 1
// Entered by fall-through or from the dispatch in .L256_enc_tail.
// Stores the ciphertext block in v9, XOR-accumulates its GHASH partial
// products into v17 (high), v18 (mid) and v19 (low), and prepares the final
// (possibly partial) ciphertext block in v9 from keystream v7.
// Reloads v22 from [x6, #32] and v21 from [x6, #16]. Multipliers: v22 for
// high/low; the upper 64 bits of v21 for the mid term.
// Falls through into .L256_enc_blocks_less_than_1, which consumes the lower
// 64 bits of the v21 loaded here.
|
|
|
|
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
|
|
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
ldr q9, [x0], #16 //AES final block - load plaintext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
|
|
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid (v27.d[0] = v8.d[1] ^ v8.d[0])
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
.L256_enc_blocks_less_than_1: //blocks left <= 1
// Final, possibly partial, block plus function epilogue.
//  1. Build a 128-bit mask in v0 from the bit length in x1.
//  2. Mask the last ciphertext block (v9), feed the masked block to GHASH,
//     then merge the bytes already present at [x2] back in before storing,
//     so bytes past the end of the message keep their existing value.
//  3. Write the byte-swapped counter (v30) back to [x16].
//  4. Reduce the three GHASH accumulators (v17 high, v18 mid, v19 low) using
//     the constant at [x10] and store the result to [x3].
//  5. Return x9 in x0, restore d8-d15 and pop the 80-byte frame.
|
|
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
and x1, x1, #127 //bit_length %= 128 (128 wraps to 0, i.e. a full block is not masked)
|
|
|
|
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block (register shift uses x1 mod 64)
|
|
cmp x1, #64
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
|
|
// x1 < 64 : mask = { d[0] = all ones, d[1] = x7 }
// x1 >= 64: mask = { d[0] = x7,       d[1] = 0  }
csel x14, x7, xzr, lt
|
|
csel x13, x8, x7, lt
|
|
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
ldr q20, [x6] //load h1l | h1h
|
|
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
mov v0.d[1], x14
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block (taken from the masked block, before the bif merge below)
|
|
|
|
rev32 v30.16b, v30.16b //byte-swap each 32-bit word of the counter before writing it back
|
|
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
|
|
str q30, [x16] //store the updated counter
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
st1 { v9.16b}, [x2] //store all 16B
|
|
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid (v16.d[0] = v8.d[1] ^ v8.d[0])
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
|
|
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
|
|
// ext #8 followed by rev64 reverses all 16 bytes of v19. The same pair is
// applied when aesv8_gcm_8x_dec_256 loads v19 from [x3]; the pair is its own
// inverse, so this converts back to the in-memory layout.
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
st1 { v19.16b }, [x3] //write the reduced GHASH state back to [x3]
|
|
// NOTE(review): x9 is set outside this excerpt; the sibling
// aesv8_gcm_8x_dec_256 prologue sets x9 = x1 >> 3 - presumably the same here.
mov x0, x9 //return sizes
|
|
|
|
// Epilogue: reload d8-d15 from the frame and release its 80 bytes. The
// offsets match the frame layout used by the aesv8_gcm_8x_dec_256 prologue.
ldp d10, d11, [sp, #16]
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80
|
|
ret
|
|
|
|
.L256_enc_ret:
// Early-exit path: returns 0 without touching the stack or any vector state.
// NOTE(review): the branch to this label is outside this excerpt; presumably
// it is the zero-length check at function entry (the sibling dec_256 uses
// "cbz x1, .L256_dec_ret" before its frame is pushed) - confirm.
|
|
|
|
mov w0, #0x0 //writing w0 also clears the upper 32 bits of x0
|
|
|
|
ret
|
|
.size aesv8_gcm_8x_enc_256,.-aesv8_gcm_8x_enc_256
|
|
.globl aesv8_gcm_8x_dec_256
|
|
.hidden aesv8_gcm_8x_dec_256
|
|
.type aesv8_gcm_8x_dec_256,%function
|
|
.align 4
|
|
aesv8_gcm_8x_dec_256:
|
|
AARCH64_VALID_CALL_TARGET
|
|
cbz x1, .L256_dec_ret
|
|
stp d8, d9, [sp, #-80]!
|
|
lsr x9, x1, #3
|
|
mov x16, x4
|
|
mov x11, x5
|
|
stp d10, d11, [sp, #16]
|
|
stp d12, d13, [sp, #32]
|
|
stp d14, d15, [sp, #48]
|
|
mov x5, #0xc200000000000000
|
|
stp x5, xzr, [sp, #64]
|
|
add x10, sp, #64
|
|
|
|
ld1 { v0.16b}, [x16] //CTR block 0
|
|
|
|
mov x15, #0x100000000 //set up counter increment
|
|
movi v31.16b, #0x0
|
|
mov v31.d[1], x15
|
|
mov x5, x9
|
|
|
|
sub x5, x5, #1 //byte_len - 1
|
|
|
|
rev32 v30.16b, v0.16b //set up reversed counter
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 0
|
|
|
|
rev32 v1.16b, v30.16b //CTR block 1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 1
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 2
|
|
add v30.4s, v30.4s, v31.4s //CTR block 2
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 3
|
|
add v30.4s, v30.4s, v31.4s //CTR block 3
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 4
|
|
add v30.4s, v30.4s, v31.4s //CTR block 4
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 0
|
|
|
|
rev32 v5.16b, v30.16b //CTR block 5
|
|
add v30.4s, v30.4s, v31.4s //CTR block 5
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 0
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 6
|
|
add v30.4s, v30.4s, v31.4s //CTR block 6
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 7
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 0
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 0
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 1
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 1
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 1
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 1
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 1
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 1
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 1
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 2
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 2
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 2
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 2
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 2
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 3
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 3
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 3
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 3
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 3
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 3
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 3
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 3
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 4
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 4
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 4
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 4
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 4
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 4
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 5
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 5
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 5
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 5
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 5
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 6
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 6
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 6
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 7
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 7
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 7
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 7
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 7
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 7
|
|
|
|
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 8
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 8
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 8
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 8
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 8
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 8
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 9
|
|
|
|
ld1 { v19.16b}, [x3]
|
|
ext v19.16b, v19.16b, v19.16b, #8
|
|
rev64 v19.16b, v19.16b
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
add x4, x0, x1, lsr #3 //end_input_ptr
|
|
add x5, x5, x0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 9
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 9
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 9
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 9
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 9
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 9
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 10
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 10
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 10
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 7
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 11
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 11
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 11
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 11
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 11
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 11
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 11
|
|
ldr q28, [x11, #224] //load rk14
|
|
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 1 - round 12
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 4 - round 12
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 5 - round 12
|
|
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 3 - round 12
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 2 - round 12
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 6 - round 12
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 0 - round 12
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 7 - round 12
|
|
|
|
aese v5.16b, v27.16b //AES block 5 - round 13
|
|
aese v1.16b, v27.16b //AES block 1 - round 13
|
|
aese v2.16b, v27.16b //AES block 2 - round 13
|
|
|
|
aese v0.16b, v27.16b //AES block 0 - round 13
|
|
aese v4.16b, v27.16b //AES block 4 - round 13
|
|
aese v6.16b, v27.16b //AES block 6 - round 13
|
|
|
|
aese v3.16b, v27.16b //AES block 3 - round 13
|
|
aese v7.16b, v27.16b //AES block 7 - round 13
|
|
b.ge .L256_dec_tail //handle tail
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
|
|
cmp x0, x5 //check if we have <= 8 blocks
|
|
|
|
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
|
|
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
|
|
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
|
|
|
|
rev32 v0.16b, v30.16b //CTR block 8
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8
|
|
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
|
|
|
|
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
|
|
|
|
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
|
|
rev32 v1.16b, v30.16b //CTR block 9
|
|
add v30.4s, v30.4s, v31.4s //CTR block 9
|
|
|
|
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
|
|
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
|
|
|
|
rev32 v2.16b, v30.16b //CTR block 10
|
|
add v30.4s, v30.4s, v31.4s //CTR block 10
|
|
|
|
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
|
|
|
|
rev32 v3.16b, v30.16b //CTR block 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 11
|
|
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
|
|
|
|
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
|
|
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
|
|
|
|
rev32 v4.16b, v30.16b //CTR block 12
|
|
add v30.4s, v30.4s, v31.4s //CTR block 12
|
|
b.ge .L256_dec_prepretail //do prepretail
|
|
|
|
// Main loop of the 8-block-wide AES-256-GCM decrypt path. Each pass:
//   * completes the counter set: v0-v4 already hold CTR blocks on entry,
//     v5-v7 are produced here from the running counter v30 (rev32 of v30,
//     then v30 += v31);
//   * runs AES rounds 0-12 (aese+aesmc) and round 13 (aese only) on v0-v7,
//     streaming rk0..rk14 from [x11, #0..#224] through v26/v27/v28;
//   * interleaves GHASH of the eight ciphertext blocks already held in
//     v8-v15 (loaded before the loop or by the previous pass): the running
//     hash v19 is folded into block 8k (PRE 0 / PRE 1), the byte-reversed
//     blocks are multiplied by the h8..h1 entries read from x6, partial
//     products collect in v17 (high) / v18 (mid) / v19 (low), and the
//     MODULO steps reduce them back into v19 using the constant at [x10];
//   * loads the next 128 bytes of ciphertext from x0 into v8-v15, combines
//     each with its keystream block and rk14 (eor3) and stores 128 bytes
//     of result to x2;
//   * prepares CTR blocks 8k+16..8k+20 in v0-v4 for the next pass, using
//     v20/v22/v23/v25 as staging registers for the first four.
// The loop repeats while the cmp x0, x5 / b.lt pair at the bottom holds;
// otherwise control falls through to .L256_dec_prepretail.
// NOTE(review): x5 is set up before this point; presumably it is the
// main-loop end pointer for the input - confirm against the prologue.
.L256_dec_main_loop: //main loop start
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
|
|
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
rev32 v20.16b, v30.16b //CTR block 8k+16
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
|
|
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
rev32 v22.16b, v30.16b //CTR block 8k+17
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
|
|
|
|
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
|
|
|
|
rev32 v23.16b, v30.16b //CTR block 8k+18
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
|
|
|
|
ldr q28, [x11, #224] //load rk14
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
|
|
|
|
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
|
|
aese v1.16b, v27.16b //AES block 8k+9 - round 13
|
|
aese v2.16b, v27.16b //AES block 8k+10 - round 13
|
|
|
|
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
|
|
aese v0.16b, v27.16b //AES block 8k+8 - round 13
|
|
aese v5.16b, v27.16b //AES block 8k+13 - round 13
|
|
|
|
rev32 v25.16b, v30.16b //CTR block 8k+19
|
|
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
|
|
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
|
|
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
aese v7.16b, v27.16b //AES block 8k+15 - round 13
|
|
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v4.16b, v27.16b //AES block 8k+12 - round 13
|
|
|
|
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result
|
|
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
|
|
aese v3.16b, v27.16b //AES block 8k+11 - round 13
|
|
|
|
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
|
|
mov v0.16b, v20.16b //CTR block 8k+16
|
|
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result
|
|
|
|
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
|
|
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
|
|
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
|
|
|
|
mov v3.16b, v25.16b //CTR block 8k+19
|
|
mov v2.16b, v23.16b //CTR block 8k+18
|
|
aese v6.16b, v27.16b //AES block 8k+14 - round 13
|
|
|
|
mov v1.16b, v22.16b //CTR block 8k+17
|
|
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
|
|
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result
|
|
|
|
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result
|
|
rev32 v4.16b, v30.16b //CTR block 8k+20
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
|
|
|
|
cmp x0, x5 //.LOOP CONTROL
|
|
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
|
|
b.lt .L256_dec_main_loop
|
|
|
|
// Pre-pre-tail: reached by fall-through when the main loop's b.lt is not
// taken, or directly via the b.ge issued after the first eight blocks have
// been stored. It performs the AES and GHASH work of one main-loop pass
// but consumes no new input:
//   * v5-v7 receive their CTR blocks from v30, and v0-v7 go through
//     rounds 0-12 (aese+aesmc) and round 13 (aese only) with rk0..rk13
//     read from x11; rk14 is loaded into v28 but the final XOR with
//     ciphertext is NOT done here;
//   * the eight pending ciphertext blocks in v8-v15 are byte-reversed,
//     multiplied by the h8..h1 entries read from x6 and reduced with the
//     constant at [x10], leaving the updated GHASH state in v19;
//   * nothing is loaded from x0 and nothing is stored to x2.
// Control then falls through into .L256_dec_tail, which begins by
// combining v0 with rk14 and the next ciphertext block.
.L256_dec_prepretail: //PREPRETAIL
|
|
ldp q26, q27, [x11, #0] //load rk0, rk1
|
|
rev32 v5.16b, v30.16b //CTR block 8k+13
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
|
|
|
|
rev64 v12.16b, v12.16b //GHASH block 8k+4
|
|
ldr q21, [x6, #112] //load h6k | h5k
|
|
ldr q24, [x6, #160] //load h8k | h7k
|
|
|
|
rev32 v6.16b, v30.16b //CTR block 8k+14
|
|
rev64 v8.16b, v8.16b //GHASH block 8k
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
|
|
|
|
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
|
|
ldr q23, [x6, #144] //load h7l | h7h
|
|
ldr q25, [x6, #176] //load h8l | h8h
|
|
rev64 v9.16b, v9.16b //GHASH block 8k+1
|
|
|
|
rev32 v7.16b, v30.16b //CTR block 8k+15
|
|
rev64 v10.16b, v10.16b //GHASH block 8k+2
|
|
ldr q20, [x6, #96] //load h5l | h5h
|
|
ldr q22, [x6, #128] //load h6l | h6h
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
|
|
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
|
|
|
|
ldp q28, q26, [x11, #32] //load rk2, rk3
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
|
|
eor v8.16b, v8.16b, v19.16b //PRE 1
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
|
|
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
|
|
|
|
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
|
|
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
|
|
|
|
rev64 v11.16b, v11.16b //GHASH block 8k+3
|
|
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
|
|
rev64 v14.16b, v14.16b //GHASH block 8k+6
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
|
|
|
|
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
|
|
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
|
|
|
|
ldp q27, q28, [x11, #64] //load rk4, rk5
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
|
|
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
|
|
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
|
|
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
|
|
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
|
|
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
|
|
|
|
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
|
|
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
|
|
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
|
|
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
|
|
|
|
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
|
|
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
|
|
ldr q20, [x6] //load h1l | h1h
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
|
|
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
|
|
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
|
|
|
|
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
|
|
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
|
|
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
|
|
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
|
|
ldp q26, q27, [x11, #96] //load rk6, rk7
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
rev64 v15.16b, v15.16b //GHASH block 8k+7
|
|
rev64 v13.16b, v13.16b //GHASH block 8k+5
|
|
|
|
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
|
|
|
|
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
|
|
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
|
|
|
|
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
|
|
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
|
|
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
|
|
|
|
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
|
|
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
|
|
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
|
|
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
|
|
|
|
ldp q28, q26, [x11, #128] //load rk8, rk9
|
|
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
|
|
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
|
|
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
|
|
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
|
|
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
|
|
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
|
|
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
|
|
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
|
|
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
|
|
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
|
|
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
|
|
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
|
|
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
|
|
|
|
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
|
|
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
|
|
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
|
|
|
|
ldp q27, q28, [x11, #160] //load rk10, rk11
|
|
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
|
|
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
|
|
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
|
|
|
|
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
|
|
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
|
|
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
|
|
|
|
aese v4.16b, v27.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
|
|
aese v6.16b, v27.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
|
|
aese v5.16b, v27.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
|
|
|
|
aese v0.16b, v27.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
|
|
aese v2.16b, v27.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
|
|
aese v3.16b, v27.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
|
|
|
|
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
aese v7.16b, v27.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
|
|
aese v1.16b, v27.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
|
|
ldp q26, q27, [x11, #192] //load rk12, rk13
|
|
|
|
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
|
|
aese v2.16b, v28.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
|
|
aese v1.16b, v28.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
|
|
aese v0.16b, v28.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
|
|
|
|
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
aese v3.16b, v28.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
|
|
|
|
aese v7.16b, v28.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
|
|
aese v6.16b, v28.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
|
|
aese v4.16b, v28.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
|
|
|
|
aese v5.16b, v28.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
|
|
aese v3.16b, v26.16b
|
|
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
|
|
|
|
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
|
|
|
|
aese v3.16b, v27.16b //AES block 8k+11 - round 13
|
|
aese v2.16b, v26.16b
|
|
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
|
|
aese v6.16b, v26.16b
|
|
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
aese v4.16b, v26.16b
|
|
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
|
|
aese v7.16b, v26.16b
|
|
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
|
|
|
|
aese v0.16b, v26.16b
|
|
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
|
|
ldr q28, [x11, #224] //load rk14
|
|
aese v1.16b, v26.16b
|
|
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
|
|
|
|
aese v4.16b, v27.16b //AES block 8k+12 - round 13
|
|
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
aese v5.16b, v26.16b
|
|
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
|
|
|
|
aese v6.16b, v27.16b //AES block 8k+14 - round 13
|
|
aese v2.16b, v27.16b //AES block 8k+10 - round 13
|
|
aese v1.16b, v27.16b //AES block 8k+9 - round 13
|
|
|
|
aese v5.16b, v27.16b //AES block 8k+13 - round 13
|
|
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
|
|
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
|
|
|
|
aese v7.16b, v27.16b //AES block 8k+15 - round 13
|
|
aese v0.16b, v27.16b //AES block 8k+8 - round 13
|
|
.L256_dec_tail: //TAIL
// Tail dispatch: decrypts the first remaining block, then branches on the
// number of bytes left so that exactly the remaining blocks are processed.
// Register roles as used by the code below:
//   x0  = ciphertext read pointer (post-incremented by 16 per block)
//   x4  = end of input, x5 = bytes left (x4 - x0)
//   x6  = base of the hash-key table (offsets named in the load comments)
//   v0-v7 = per-block AES state that is XORed with v29 and the ciphertext
//   v29 = copy of v28 (NOTE(review): expected to be the last round key on
//         every path into this label - only the fall-through path is visible)
//   v30/v31 = counter block and its increment
//   v17/v18/v19 = GHASH high/mid/low accumulators, v16 = partial tag
|
|
|
|
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag (swap 64-bit halves of the low accumulator)
|
|
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
|
|
cmp x5, #112 //more than 7 blocks left? (flags consumed by b.gt below; the vector ops in between do not touch flags)
|
|
|
|
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
|
|
|
|
ldp q24, q25, [x6, #160] //load h8k | h7k
|
|
mov v29.16b, v28.16b //keep the final round key in v29; v28 is reused as a GHASH temporary below
|
|
|
|
ldp q20, q21, [x6, #96] //load h5l | h5h
|
|
|
|
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
|
|
ldp q22, q23, [x6, #128] //load h6l | h6h
|
|
b.gt .L256_dec_blocks_more_than_7
|
|
|
|
// At most 7 blocks left. Each step of the ladder below (a) slides the
// unused AES states up one register so the entry label finds the state for
// the second remaining block in the register it reads, and (b) rolls the
// counter back by one increment for the block that will not be consumed.
mov v7.16b, v6.16b
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v3.16b
|
|
movi v19.8b, #0 //clear GHASH low accumulator (only the >7 path initialises it with a direct pmull write)
|
|
|
|
movi v17.8b, #0 //clear GHASH high accumulator
|
|
movi v18.8b, #0 //clear GHASH mid accumulator
|
|
mov v3.16b, v2.16b
|
|
|
|
cmp x5, #96 //more than 6 blocks left?
|
|
mov v2.16b, v1.16b //state for the second remaining block now in v2
|
|
b.gt .L256_dec_blocks_more_than_6
|
|
|
|
mov v7.16b, v6.16b
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v4.16b
|
|
cmp x5, #80 //more than 5 blocks left?
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
|
|
mov v4.16b, v3.16b
|
|
mov v3.16b, v1.16b //state for the second remaining block now in v3
|
|
b.gt .L256_dec_blocks_more_than_5
|
|
|
|
cmp x5, #64 //more than 4 blocks left?
|
|
mov v7.16b, v6.16b
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
|
|
mov v6.16b, v5.16b
|
|
|
|
mov v5.16b, v4.16b
|
|
mov v4.16b, v1.16b //state for the second remaining block now in v4
|
|
b.gt .L256_dec_blocks_more_than_4
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
mov v7.16b, v6.16b
|
|
cmp x5, #48 //more than 3 blocks left?
|
|
|
|
mov v6.16b, v5.16b
|
|
mov v5.16b, v1.16b //state for the second remaining block now in v5
|
|
b.gt .L256_dec_blocks_more_than_3
|
|
|
|
ldr q24, [x6, #64] //load h4k | h3k (the >2 section multiplies by v24 but does not load it itself)
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
mov v7.16b, v6.16b
|
|
|
|
cmp x5, #32 //more than 2 blocks left?
|
|
mov v6.16b, v1.16b //state for the second remaining block now in v6
|
|
b.gt .L256_dec_blocks_more_than_2
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
|
|
mov v7.16b, v1.16b //state for the second remaining block now in v7
|
|
cmp x5, #16 //more than 1 block left?
|
|
b.gt .L256_dec_blocks_more_than_1
|
|
|
|
sub v30.4s, v30.4s, v31.4s //one fewer counter block consumed
|
|
ldr q21, [x6, #16] //load h2k | h1k (needed by the final-block mid multiply)
|
|
b .L256_dec_blocks_less_than_1
|
|
.L256_dec_blocks_more_than_7: //blocks left > 7
// GHASH the ciphertext block in v9 (final-7) using v25 for high/low and the
// upper half of v24 for mid, store the plaintext already in v12, then load
// and decrypt the next block with the AES state in v1.
// The accumulators v17/v18/v19 are written directly here (not XORed into).
|
|
rev64 v8.16b, v9.16b //GHASH final-7 block
|
|
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
|
|
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
|
|
|
|
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid (upper half of v24 moved to the low lane for pmull)
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
|
|
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
|
|
|
|
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid (high half XOR low half of the block)
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
|
|
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
|
|
.L256_dec_blocks_more_than_6: //blocks left > 6
// GHASH the ciphertext block in v9 (final-6) using v23 for high/low and the
// lower half of v24 for mid, XOR the products into v17/v18/v19, store the
// plaintext in v12, then load and decrypt the next block with the state in v2.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-6 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
|
|
|
|
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid (high half XOR low half of the block)
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
|
|
.L256_dec_blocks_more_than_5: //blocks left > 5
// GHASH the ciphertext block in v9 (final-5) using v22 for high/low and the
// upper half of v21 for mid, XOR the products into v17/v18/v19, store the
// plaintext in v12, then load and decrypt the next block with the state in v3.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-5 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
|
|
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
|
|
|
|
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid (high half XOR low half of the block)
|
|
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
|
|
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
|
|
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid (duplicate into the upper lane for pmull2)
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
|
|
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
.L256_dec_blocks_more_than_4: //blocks left > 4
// GHASH the ciphertext block in v9 (final-4) using v20 for high/low and the
// lower half of v21 for mid, XOR the products into v17/v18/v19, store the
// plaintext in v12, then load and decrypt the next block with the state in v4.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-4 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
|
|
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid (high half XOR low half of the block)
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
|
|
|
|
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
|
|
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
|
|
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
|
|
.L256_dec_blocks_more_than_3: //blocks left > 3
// GHASH the ciphertext block in v9 (final-3). v25 and v24 are reloaded here
// because earlier sections used them for other hash-key entries. High/low
// use v25, mid uses the upper half of v24. Then store the plaintext in v12
// and load and decrypt the next block with the state in v5.
|
|
|
|
ldr q25, [x6, #80] //load h4l | h4h
|
|
rev64 v8.16b, v9.16b //GHASH final-3 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
|
|
ldr q24, [x6, #64] //load h4k | h3k
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
|
|
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
|
|
|
|
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid (high half XOR low half of the block)
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid (duplicate into the upper lane for pmull2)
|
|
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
|
|
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
|
|
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
|
|
.L256_dec_blocks_more_than_2: //blocks left > 2
// GHASH the ciphertext block in v9 (final-2) using v23 (loaded here) for
// high/low and the lower half of v24 for mid. v24 is not loaded in this
// section: it must already hold the entry at [x6, #64] on entry.
// Then store the plaintext in v12 and load and decrypt the next block with
// the state in v6.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-2 block
|
|
|
|
ldr q23, [x6, #48] //load h3l | h3h
|
|
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
|
|
|
|
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
|
|
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
|
|
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid (high half XOR low half of the block)
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
|
|
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
|
|
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
|
|
.L256_dec_blocks_more_than_1: //blocks left > 1
// GHASH the ciphertext block in v9 (final-1) using v22 for high/low and the
// upper half of v21 for mid (both loaded here), store the plaintext in v12,
// then load the final (possibly partial) block and decrypt it with the state
// in v7. The final result stays in v12 for the masking code that follows.
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final-1 block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
|
|
ldr q22, [x6, #32] //load h2l | h2h
|
|
|
|
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid (high half XOR low half of the block)
|
|
ldr q9, [x0], #16 //AES final block - load ciphertext
|
|
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
|
|
|
|
ldr q21, [x6, #16] //load h2k | h1k
|
|
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
|
|
|
|
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid (duplicate into the upper lane for pmull2)
|
|
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
|
|
|
|
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
|
|
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
|
|
|
|
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
|
|
|
|
movi v16.8b, #0 //suppress further partial tag feed in
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
|
|
|
|
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
|
|
.L256_dec_blocks_less_than_1: //blocks left <= 1
// Final block, which may be partial. On entry v9 holds the last ciphertext
// block as loaded (16 bytes) and v12 the corresponding decrypted block.
// Steps: build a byte mask from the bit length in x1, zero the ciphertext
// bytes past the end of the message before hashing, merge the decrypted
// bytes with the bytes already present at the output, finish GHASH, reduce,
// store the tag to [x3] and the updated counter to [x16], then return x9.
|
|
|
|
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
|
|
mvn x7, xzr //temp0_x = 0xffffffffffffffff
|
|
and x1, x1, #127 //bit_length %= 128
|
|
|
|
sub x1, x1, #128 //bit_length -= 128
|
|
rev32 v30.16b, v30.16b //byte-swap each 32-bit counter word back to stored order
|
|
str q30, [x16] //store the updated counter
|
|
|
|
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
|
|
|
|
and x1, x1, #127 //bit_length %= 128, i.e. number of unused bits in the last block (0 when it is full)
|
|
|
|
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block (register shift amount is taken modulo 64)
|
|
cmp x1, #64
|
|
mvn x8, xzr //temp1_x = 0xffffffffffffffff
|
|
|
|
csel x14, x7, xzr, lt //upper mask: shifted mask if fewer than 64 bits unused, else zero
|
|
csel x13, x8, x7, lt //lower mask: all ones if fewer than 64 bits unused, else shifted mask
|
|
|
|
mov v0.d[0], x13 //ctr0b is mask for last block
|
|
mov v0.d[1], x14
|
|
|
|
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
|
|
ldr q20, [x6] //load h1l | h1h
|
|
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing (bits where the mask is 0 come from v26)
|
|
|
|
rev64 v8.16b, v9.16b //GHASH final block
|
|
|
|
eor v8.16b, v8.16b, v16.16b //feed in partial tag
|
|
|
|
ins v16.d[0], v8.d[1] //GHASH final block - mid
|
|
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
|
|
|
|
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid (high half XOR low half of the block)
|
|
|
|
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
|
|
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
|
|
|
|
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
|
|
|
|
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
|
|
ldr d16, [x10] //MODULO - load modulo constant
|
|
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
|
|
|
|
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
|
|
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
|
|
|
|
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
|
|
st1 { v12.16b}, [x2] //store all 16B
|
|
|
|
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
|
|
|
|
eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid
|
|
eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid
|
|
|
|
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
|
|
|
|
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
|
|
eor v19.16b, v19.16b, v17.16b //MODULO - fold into low
|
|
|
|
eor v19.16b, v19.16b, v18.16b //MODULO - fold into low
|
|
ext v19.16b, v19.16b, v19.16b, #8 //swap 64-bit halves ...
|
|
rev64 v19.16b, v19.16b //... and byte-reverse each half: full 16-byte reversal of the tag
|
|
st1 { v19.16b }, [x3] //store the updated tag
|
|
mov x0, x9 //return value (NOTE(review): x9 is set before this chunk - presumably the processed byte count; confirm against the prologue)
|
|
|
|
// Restore d8-d15 and release the 80-byte stack frame.
ldp d10, d11, [sp, #16]
|
|
ldp d12, d13, [sp, #32]
|
|
ldp d14, d15, [sp, #48]
|
|
ldp d8, d9, [sp], #80
|
|
ret
|
|
|
|
.L256_dec_ret:
// Early-exit path: returns 0 without restoring any registers or adjusting sp.
// NOTE(review): presumably only reached before the stack frame is set up
// (the branch to this label is outside the visible chunk) - confirm.
|
|
mov w0, #0x0 //return 0 (writing w0 also clears the upper half of x0)
|
|
ret
|
|
.size aesv8_gcm_8x_dec_256,.-aesv8_gcm_8x_dec_256
|
|
.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
|
|
.align 2
|
|
.align 2
|
|
#endif
|
|
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)
|