// Source path: cli/vendor/aws-lc-sys/aws-lc/generated-src/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-unroll8.S
// (8364 lines, 280 KiB, ArmAsm)

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#include <openssl/asm_base.h>
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include "openssl/arm_arch.h"
#if __ARM_MAX_ARCH__>=8
.text
.arch armv8.2-a+crypto
// aesv8_gcm_8x_enc_128: AES-128-GCM encryption, eight blocks per pass.
// This part is the entry: it saves state, builds the first eight counter
// blocks, runs AES rounds 0-9 on them and, when more than 128 bytes of input
// are present, encrypts and stores the first 128 bytes before dropping into
// the main loop (or the prepretail).
//
// Register use as read by this code:
//   x0  input pointer (plaintext is loaded from it with post-increment)
//   x1  input length in bits (x1 >> 3 is used as the byte length)
//   x2  output pointer (ciphertext is stored through it with post-increment)
//   x3  address of the 16-byte running GHASH value, loaded into v19
//   x4  address of the initial counter block; copied to x16, after which x4
//       is reused as end_input_ptr
//   x5  address of the round keys; copied to x11, after which x5 is reused
//       as main_end_input_ptr
//   x6  not referenced here; later sections index it for hash-key powers
//   x10 sp+64, address of the stacked constant 0xc200000000000000 that later
//       sections load as the "modulo constant"
//   v0-v7   the eight AES states; v8-v15 plaintext, then ciphertext
//   v26-v28 round keys, reloaded in rotation; rk10 ends up in v27
//   v30 byte-reversed running counter; v31 the per-block counter increment
.globl aesv8_gcm_8x_enc_128
.hidden aesv8_gcm_8x_enc_128
.type aesv8_gcm_8x_enc_128,%function
.align 4
aesv8_gcm_8x_enc_128:
#ifdef BORINGSSL_DISPATCH_TEST
adrp x9,BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#7] // kFlag_aesv8_gcm_8x_enc_128
#endif
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_enc_ret //bit length is zero - branch to the return path
stp d8, d9, [sp, #-80]! //open an 80-byte frame; d8-d15 are saved at sp+0..sp+63
lsr x9, x1, #3 //byte_len = bit length >> 3
mov x16, x4 //x16 = counter block address (x4 is reused below)
mov x11, x5 //x11 = round key address (x5 is reused below)
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64] //sp+64 = {0xc200000000000000, 0}
add x10, sp, #64 //x10 = address of that constant pair
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15 //v31.4s = {0, 0, 0, 1}
mov x5, x9
ld1 { v0.16b}, [x16] //CTR block 0
sub x5, x5, #1 //byte_len - 1
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 v30.16b, v0.16b //set up reversed counter
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
rev32 v7.16b, v30.16b //CTR block 7
add v30.4s, v30.4s, v31.4s //CTR block 7
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
ld1 { v19.16b}, [x3] //load the running GHASH value
ext v19.16b, v19.16b, v19.16b, #8 //swap its 64-bit halves
rev64 v19.16b, v19.16b //byte-reverse each half
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
ldr q27, [x11, #160] //load rk10
aese v3.16b, v26.16b //AES block 3 - round 9
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v2.16b, v26.16b //AES block 2 - round 9
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v6.16b, v26.16b //AES block 6 - round 9
aese v4.16b, v26.16b //AES block 4 - round 9
add x5, x5, x0 //main_end_input_ptr = input + ((byte_len - 1) & ~0x7f)
aese v0.16b, v26.16b //AES block 0 - round 9
aese v7.16b, v26.16b //AES block 7 - round 9
aese v5.16b, v26.16b //AES block 5 - round 9
aese v1.16b, v26.16b //AES block 1 - round 9
add x4, x0, x1, lsr #3 //end_input_ptr
cmp x0, x5 //check if we have <= 8 blocks
b.ge .L128_enc_tail //handle tail
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
cmp x0, x5 //check if we have <= 8 blocks
// The flags set by the cmp above are consumed by the b.ge at the end of this
// section; only vector instructions and stores sit in between.
// Each eor3 below XORs the plaintext, the round-9 state and rk10 (v27).
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
rev32 v1.16b, v30.16b //CTR block 9
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
add v30.4s, v30.4s, v31.4s //CTR block 9
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
rev32 v4.16b, v30.16b //CTR block 12
add v30.4s, v30.4s, v31.4s //CTR block 12
b.ge .L128_enc_prepretail //do prepretail
// Main loop. On entry v8-v15 hold the eight ciphertext blocks that were just
// stored, v0-v4 hold the next five counter blocks, v30 the running counter
// and v19 the running GHASH value. Each iteration
//   - folds v8-v15 into the GHASH value using the hash-key table at x6,
//   - runs AES rounds 0-9 on v0-v7 (counter blocks 8k+8 .. 8k+15),
//   - loads the next 128 bytes of plaintext into v8-v15, XORs them with the
//     AES output and rk10, and stores the result through x2,
//   - prepares counter blocks 8k+16 .. 8k+20 for the next pass.
// It repeats while x0 is below main_end_input_ptr (x5).
.L128_enc_main_loop: //main loop start
rev32 v5.16b, v30.16b //CTR block 8k+13
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v9.16b, v9.16b //GHASH block 8k+1
rev64 v8.16b, v8.16b //GHASH block 8k
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
rev64 v11.16b, v11.16b //GHASH block 8k+3
ldp q26, q27, [x11, #0] //load rk0, rk1
eor v8.16b, v8.16b, v19.16b //PRE 1
rev32 v7.16b, v30.16b //CTR block 8k+15
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
rev64 v10.16b, v10.16b //GHASH block 8k+2
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
ldp q26, q27, [x11, #96] //load rk6, rk7
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
ldr d16, [x10] //MODULO - load modulo constant
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
// From here the hash-key registers v20, v22, v23 and v25 are no longer needed
// for this pass and are borrowed to hold the next counter blocks; they are
// moved into v0-v3 once those states have been consumed.
rev32 v20.16b, v30.16b //CTR block 8k+16
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
ldp q28, q26, [x11, #128] //load rk8, rk9
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
rev32 v22.16b, v30.16b //CTR block 8k+17
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ldr q27, [x11, #160] //load rk10
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
rev32 v23.16b, v30.16b //CTR block 8k+18
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v2.16b, v26.16b //AES block 8k+10 - round 9
aese v4.16b, v26.16b //AES block 8k+12 - round 9
aese v1.16b, v26.16b //AES block 8k+9 - round 9
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
rev32 v25.16b, v30.16b //CTR block 8k+19
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
// The flags from this cmp are consumed by the b.lt that closes the loop; only
// vector instructions and stores sit in between.
cmp x0, x5 //.LOOP CONTROL
.inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
aese v7.16b, v26.16b //AES block 8k+15 - round 9
aese v6.16b, v26.16b //AES block 8k+14 - round 9
aese v3.16b, v26.16b //AES block 8k+11 - round 9
.inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
mov v2.16b, v23.16b //CTR block 8k+18
aese v0.16b, v26.16b //AES block 8k+8 - round 9
rev32 v4.16b, v30.16b //CTR block 8k+20
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
.inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
aese v5.16b, v26.16b //AES block 8k+13 - round 9
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
.inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
.inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
mov v3.16b, v25.16b //CTR block 8k+19
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
mov v1.16b, v22.16b //CTR block 8k+17
.inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
mov v0.16b, v20.16b //CTR block 8k+16
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
.inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
b.lt .L128_enc_main_loop
// Prepretail. Does the same GHASH work as a main-loop pass on the eight
// ciphertext blocks in v8-v15 and runs AES rounds 0-9 on the next eight
// counter blocks in v0-v7, but loads no plaintext and stores nothing. On exit
// v0-v7 hold the round-9 states, v27 holds rk10, v19 the reduced GHASH value,
// and execution falls through to the tail.
.L128_enc_prepretail: //PREPRETAIL
rev32 v5.16b, v30.16b //CTR block 8k+13
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
rev64 v8.16b, v8.16b //GHASH block 8k
rev64 v9.16b, v9.16b //GHASH block 8k+1
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev64 v10.16b, v10.16b //GHASH block 8k+2
eor v8.16b, v8.16b, v19.16b //PRE 1
rev32 v6.16b, v30.16b //CTR block 8k+14
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
ldp q26, q27, [x11, #0] //load rk0, rk1
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
rev32 v7.16b, v30.16b //CTR block 8k+15
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
ldp q26, q27, [x11, #96] //load rk6, rk7
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
ldr d16, [x10] //MODULO - load modulo constant
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
.inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
ldr q27, [x11, #160] //load rk10
aese v6.16b, v26.16b //AES block 8k+14 - round 9
aese v2.16b, v26.16b //AES block 8k+10 - round 9
aese v0.16b, v26.16b //AES block 8k+8 - round 9
aese v1.16b, v26.16b //AES block 8k+9 - round 9
aese v3.16b, v26.16b //AES block 8k+11 - round 9
aese v5.16b, v26.16b //AES block 8k+13 - round 9
aese v4.16b, v26.16b //AES block 8k+12 - round 9
aese v7.16b, v26.16b //AES block 8k+15 - round 9
// Tail dispatch. v0-v7 hold the AES states after round 9 for the next eight
// counter blocks and v27 holds rk10. x5 becomes the number of input bytes
// left (end_input_ptr - x0). The first remaining block is always encrypted
// here with v0. The compare chain then branches on how many 16-byte blocks
// remain (thresholds 112, 96, ... 16 bytes). For every block that will not be
// used, it moves the pending states one register up (towards v7) and takes
// one increment back off the counter in v30.
// NOTE(review): the landing labels other than .L128_enc_blocks_more_than_7
// are outside this view; presumably each one reads the register that v1 was
// last copied into - confirm against the rest of the file.
.L128_enc_tail: //TAIL
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
mov v29.16b, v27.16b //keep rk10 in v29; the tail XORs with v29
ldp q20, q21, [x6, #96] //load h5l | h5h and h6k | h5k
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
ldp q22, q23, [x6, #128] //load h6l | h6h and h7l | h7h
ldp q24, q25, [x6, #160] //load h8k | h7k and h8l | h8h
cmp x5, #112
b.gt .L128_enc_blocks_more_than_7
// At most 7 blocks left: shift states v1..v6 up into v2..v7.
mov v7.16b, v6.16b
mov v6.16b, v5.16b
movi v17.8b, #0 //clear v17, which .L128_enc_blocks_more_than_7 would have set
cmp x5, #96 //flags are consumed by the b.gt below; the vector ops between leave them alone
sub v30.4s, v30.4s, v31.4s //one counter block fewer is needed
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v2.16b
mov v2.16b, v1.16b
movi v19.8b, #0 //clear v19 likewise
movi v18.8b, #0 //clear v18 likewise
b.gt .L128_enc_blocks_more_than_6
// At most 6 blocks left.
mov v7.16b, v6.16b
cmp x5, #80
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
b.gt .L128_enc_blocks_more_than_5
// At most 5 blocks left.
cmp x5, #64
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v1.16b
b.gt .L128_enc_blocks_more_than_4
// At most 4 blocks left.
mov v7.16b, v6.16b
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v1.16b
cmp x5, #48
b.gt .L128_enc_blocks_more_than_3
// At most 3 blocks left.
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v1.16b
cmp x5, #32
ldr q24, [x6, #64] //load h4k | h3k
b.gt .L128_enc_blocks_more_than_2
// At most 2 blocks left.
cmp x5, #16
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v1.16b
b.gt .L128_enc_blocks_more_than_1
// At most 1 block left.
ldr q21, [x6, #16] //load h2k | h1k
sub v30.4s, v30.4s, v31.4s
b .L128_enc_blocks_less_than_1
// Tail entry for more than 7 blocks left: stores the ciphertext in v9
// (final-7 block), GHASHes it with h8 (v25) and the upper half of v24, and
// forms the final-6 ciphertext in v9 from the next plaintext block and v1.
// Unlike the later entry points, this one writes v17/v18/v19 directly with
// pmull results instead of accumulating into them.
.L128_enc_blocks_more_than_7: //blocks left > 7
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
rev64 v8.16b, v9.16b //GHASH final-7 block
ldr q9, [x0], #16 //AES final-6 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid (bring the upper 64 bits of v24 into the low lane for pmull)
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
// Tail entry for more than 6 blocks left: stores the ciphertext in v9
// (final-6 block), accumulates its GHASH products (v23 for high/low, low half
// of v24 for mid) into v17/v18/v19, and forms the final-5 ciphertext in v9
// from the next plaintext block and v2.
.L128_enc_blocks_more_than_6: //blocks left > 6
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
rev64 v8.16b, v9.16b //GHASH final-6 block
ldr q9, [x0], #16 //AES final-5 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
// Tail entry for more than 5 blocks left: stores the ciphertext in v9
// (final-5 block), accumulates its GHASH products (v22 for high/low, upper
// half of v21 for mid) into v17/v18/v19, and forms the final-4 ciphertext in
// v9 from the next plaintext block and v3.
.L128_enc_blocks_more_than_5: //blocks left > 5
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
rev64 v8.16b, v9.16b //GHASH final-5 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
ldr q9, [x0], #16 //AES final-4 block - load plaintext
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid (copy to the upper lane so pmull2 can pair it with the upper half of v21)
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
// Tail entry for more than 4 blocks left: stores the ciphertext in v9
// (final-4 block), accumulates its GHASH products (v20 for high/low, low half
// of v21 for mid) into v17/v18/v19, and forms the final-3 ciphertext in v9
// from the next plaintext block and v4.
.L128_enc_blocks_more_than_4: //blocks left > 4
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
rev64 v8.16b, v9.16b //GHASH final-4 block
ldr q9, [x0], #16 //AES final-3 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
// Tail entry for more than 3 blocks left: stores the ciphertext in v9
// (final-3 block), reloads v25 with h4 and v24 with h4k | h3k, accumulates the
// GHASH products of the block into v17/v18/v19, and forms the final-2
// ciphertext in v9 from the next plaintext block and v5.
.L128_enc_blocks_more_than_3: //blocks left > 3
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
ldr q25, [x6, #80] //load h4l | h4h
rev64 v8.16b, v9.16b //GHASH final-3 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
movi v16.8b, #0 //suppress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
ldr q24, [x6, #64] //load h4k | h3k
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
ldr q9, [x0], #16 //AES final-2 block - load plaintext
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid (copy to the upper lane so pmull2 can pair it with the upper half of v24)
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
// Tail entry for more than 2 blocks left: stores the ciphertext in v9
// (final-2 block), reloads v23 with h3, accumulates the GHASH products of the
// block (v23 for high/low, low half of v24 for mid) into v17/v18/v19, and
// forms the final-1 ciphertext in v9 from the next plaintext block and v6.
// v24 must hold h4k | h3k here; both paths into this label load it.
.L128_enc_blocks_more_than_2: //blocks left > 2
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
rev64 v8.16b, v9.16b //GHASH final-2 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
ldr q9, [x0], #16 //AES final-1 block - load plaintext
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
ldr q23, [x6, #48] //load h3l | h3h
movi v16.8b, #0 //suppress further partial tag feed in
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
// Tail entry for more than 1 block left: stores the ciphertext in v9
// (final-1 block), loads h2 into v22 and h2k | h1k into v21, accumulates the
// GHASH products of the block into v17/v18/v19, and forms the result for the
// final (possibly partial) block in v9 from the last plaintext load and v7.
.L128_enc_blocks_more_than_1: //blocks left > 1
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
ldr q22, [x6, #32] //load h2l | h2h
rev64 v8.16b, v9.16b //GHASH final-1 block
ldr q9, [x0], #16 //AES final block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag (v16 is already zero when entered by fall-through)
movi v16.8b, #0 //suppress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
ldr q21, [x6, #16] //load h2k | h1k
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid (copy to the upper lane so pmull2 can pair it with the upper half of v21)
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
// Last block and function exit for aesv8_gcm_8x_enc_128:
//   1. write the updated counter back through x16;
//   2. build a byte mask in v0 from the bit length in x1, clear the unused
//      part of the last result block and merge the bytes already present at
//      [x2] into it before a full 16-byte store;
//   3. GHASH the masked block with h1 (v20) and the low half of v21, reduce the
//      high/mid/low accumulators (v17/v18/v19) with the constant at [x10] and
//      store the 16-byte result to [x3];
//   4. return x9 in x0 and restore d8-d15 from the 80-byte frame.
.L128_enc_blocks_less_than_1: //blocks left <= 1
rev32 v30.16b, v30.16b //undo the rev32 that was applied when the counter was set up
str q30, [x16] //store the updated counter
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
mvn x7, xzr //temp0_x = 0xffffffffffffffff
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
and x1, x1, #127 //bit_length %= 128
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
mvn x8, xzr //temp1_x = 0xffffffffffffffff
cmp x1, #64
csel x13, x8, x7, lt //low 64b mask: all ones if x1 < 64, otherwise the shifted mask
csel x14, x7, xzr, lt //high 64b mask: the shifted mask if x1 < 64, otherwise zero
mov v0.d[1], x14
mov v0.d[0], x13 //ctr0b is mask for last block
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v8.16b, v9.16b //GHASH final block
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
st1 { v9.16b}, [x2] //store all 16B
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v16.d[0], v8.d[1] //GHASH final block - mid
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
ldr q20, [x6] //load h1l | h1h
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
ldr d16, [x10] //MODULO - load modulo constant
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
ext v19.16b, v19.16b, v19.16b, #8 //swap the two 64-bit halves of the reduced value
rev64 v19.16b, v19.16b //byte-reverse each half before storing
st1 { v19.16b }, [x3] //store the updated 16-byte hash value
mov x0, x9 //return value: x9 (x1 >> 3, computed at function entry)
ldp d10, d11, [sp, #16] //restore callee-saved d8-d15 and release the 80-byte frame
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
// Early exit taken by the cbz at function entry when x1 is zero: nothing has
// been pushed on the stack at that point, so just return 0.
.L128_enc_ret:
mov w0, #0x0
ret
.size aesv8_gcm_8x_enc_128,.-aesv8_gcm_8x_enc_128
// aesv8_gcm_8x_dec_128: AES-128-GCM decryption, eight blocks per pass.
// This section is the function entry up to the main loop.
//
// Register use, as established by the code below:
//   x0  input (ciphertext) pointer, post-incremented by the loads
//   x1  length in bits (x1 >> 3 is used as the byte length); when zero the
//       function branches straight to .L128_dec_ret
//   x2  output pointer, post-incremented by the stores
//   x3  16-byte hash value, loaded into v19
//   x4  counter block pointer; copied to x16, after which x4 is reused as the
//       end-of-input pointer
//   x5  round-key pointer; copied to x11, after which x5 is reused as the
//       input address at which the main loop stops
//   x6  hash-key table (the h1..h8 and hNk values named in the load comments)
//   x9  byte length (x1 >> 3)
//   x10 address of the modulo constant kept in the stack frame
//   v30 byte-reversed running counter, v31 counter increment
//
// Stack frame (80 bytes): d8-d15 at offsets 0..63, the constant
// 0xc200000000000000 followed by a zero doubleword at offsets 64..79.
.globl aesv8_gcm_8x_dec_128
.hidden aesv8_gcm_8x_dec_128
.type aesv8_gcm_8x_dec_128,%function
.align 4
aesv8_gcm_8x_dec_128:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_dec_ret
stp d8, d9, [sp, #-80]!
lsr x9, x1, #3
mov x16, x4
mov x11, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add x10, sp, #64
mov x5, x9
ld1 { v0.16b}, [x16] //CTR block 0
ldp q26, q27, [x11, #0] //load rk0, rk1
sub x5, x5, #1 //byte_len - 1
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15
// Load the current hash value and put it in the internal layout (halves
// swapped, bytes reversed within each half).
ld1 { v19.16b}, [x3]
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
// Generate counter blocks 0-7 in v0-v7 and run AES rounds 0-9 on all eight,
// interleaved; the final round-key xor is folded into the eor3 that produces
// each result.
rev32 v30.16b, v0.16b //set up reversed counter
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
rev32 v7.16b, v30.16b //CTR block 7
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
add x5, x5, x0 //x5 = input address at which the main loop stops
add v30.4s, v30.4s, v31.4s //CTR block 7
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v0.16b, v26.16b //AES block 0 - round 9
aese v1.16b, v26.16b //AES block 1 - round 9
aese v6.16b, v26.16b //AES block 6 - round 9
ldr q27, [x11, #160] //load rk10
aese v4.16b, v26.16b //AES block 4 - round 9
aese v3.16b, v26.16b //AES block 3 - round 9
aese v2.16b, v26.16b //AES block 2 - round 9
aese v5.16b, v26.16b //AES block 5 - round 9
aese v7.16b, v26.16b //AES block 7 - round 9
add x4, x0, x1, lsr #3 //end_input_ptr
cmp x0, x5 //check if we have <= 8 blocks
b.ge .L128_dec_tail //handle tail
// More than 8 blocks: decrypt blocks 0-7 now. The ciphertext stays in v8-v15
// for the GHASH that the next section performs; v0-v4 are refilled with
// counter blocks 8-12 as soon as their results have been stored.
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
rev32 v1.16b, v30.16b //CTR block 9
add v30.4s, v30.4s, v31.4s //CTR block 9
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
rev32 v4.16b, v30.16b //CTR block 12
cmp x0, x5 //check if we have <= 8 blocks
add v30.4s, v30.4s, v31.4s //CTR block 12
b.ge .L128_dec_prepretail //do prepretail
// Main loop of aesv8_gcm_8x_dec_128. On entry v8-v15 hold the eight
// ciphertext blocks that were decrypted last, v19 holds the running hash
// value, v0-v4 hold counter blocks 8k+8..8k+12 and v30 the next counter.
// Each pass interleaves three pieces of work:
//   - GHASH of v8-v15 into v17 (high) / v18 (mid) / v19 (low), followed by the
//     reduction with the constant loaded from [x10];
//   - AES rounds 0-9 on counter blocks 8k+8..8k+15 in v0-v7;
//   - loading the next eight ciphertext blocks from [x0] into v8-v15, xoring
//     them with the keystream and rk10, and storing the result to [x2].
// The hash-key registers v20-v25 are overwritten by GHASH low products and
// later reused for counter blocks, so the keys are reloaded from x6 on every
// pass. The loop repeats while x0 < x5.
// NOTE(review): the loop condition (b.lt) compares the two addresses as
// signed values.
.L128_dec_main_loop: //main loop start
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev64 v9.16b, v9.16b //GHASH block 8k+1
rev64 v8.16b, v8.16b //GHASH block 8k
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v14.16b, v14.16b //GHASH block 8k+6
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
eor v8.16b, v8.16b, v19.16b //PRE 1
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v10.16b, v10.16b //GHASH block 8k+2
rev64 v12.16b, v12.16b //GHASH block 8k+4
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev32 v7.16b, v30.16b //CTR block 8k+15
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
rev64 v13.16b, v13.16b //GHASH block 8k+5
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
ldp q28, q26, [x11, #32] //load rk2, rk3
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
rev64 v15.16b, v15.16b //GHASH block 8k+7
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
ldp q27, q28, [x11, #64] //load rk4, rk5
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
ldr d16, [x10] //MODULO - load modulo constant
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
// From here on v20, v22, v23 and v25 are free and hold the counter blocks for
// the next pass until v0-v3 have been stored.
rev32 v20.16b, v30.16b //CTR block 8k+16
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
rev32 v22.16b, v30.16b //CTR block 8k+17
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
rev32 v23.16b, v30.16b //CTR block 8k+18
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v0.16b, v26.16b //AES block 8k+8 - round 9
aese v1.16b, v26.16b //AES block 8k+9 - round 9
ldr q27, [x11, #160] //load rk10
aese v6.16b, v26.16b //AES block 8k+14 - round 9
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v2.16b, v26.16b //AES block 8k+10 - round 9
aese v7.16b, v26.16b //AES block 8k+15 - round 9
aese v4.16b, v26.16b //AES block 8k+12 - round 9
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
rev32 v25.16b, v30.16b //CTR block 8k+19
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
aese v3.16b, v26.16b //AES block 8k+11 - round 9
aese v5.16b, v26.16b //AES block 8k+13 - round 9
.inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result
.inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result
.inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result
.inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result
.inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
mov v1.16b, v22.16b //CTR block 8k+17
.inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
mov v0.16b, v20.16b //CTR block 8k+16
.inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result
cmp x0, x5 //.LOOP CONTROL
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
.inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result
mov v2.16b, v23.16b //CTR block 8k+18
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
rev32 v4.16b, v30.16b //CTR block 8k+20
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
mov v3.16b, v25.16b //CTR block 8k+19
b.lt .L128_dec_main_loop //flags come from the cmp above; the instructions in between do not set flags
// PREPRETAIL (AES-128 decrypt path).
// Reached by falling through when the main loop branch above is not taken.
// This section loads no input and stores no output. It interleaves:
//   - GHASH of the eight blocks currently held in v8-v15 (byte-reversed with
//     rev64 here), accumulated as high/mid/low partial products in
//     v17/v18/v19 and then reduced with the constant read from [x10];
//   - AES rounds 0-9 on the eight counter blocks v0-v7. v5, v6 and v7 are
//     derived from the running counter v30 here; v0-v4 are prepared before
//     this label (only the write to v3 is visible just above).
// The last round key (rk10) is loaded into v27 but not applied here; the
// tail folds it into the eor3 that also applies the ciphertext.
// Register roles in this section: x11 = round key base, x6 = hash key table
// base, x10 = address of the reduction constant, v31 = counter increment.
.L128_dec_prepretail: //PREPRETAIL
rev64 v11.16b, v11.16b //GHASH block 8k+3
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v8.16b, v8.16b //GHASH block 8k
rev64 v10.16b, v10.16b //GHASH block 8k+2
rev32 v5.16b, v30.16b //CTR block 8k+13
ldp q26, q27, [x11, #0] //load rk0, rk1
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
eor v8.16b, v8.16b, v19.16b //PRE 1 - fold running hash value into the first block
rev64 v9.16b, v9.16b //GHASH block 8k+1
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
rev64 v13.16b, v13.16b //GHASH block 8k+5
rev64 v12.16b, v12.16b //GHASH block 8k+4
rev64 v14.16b, v14.16b //GHASH block 8k+6
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
ldp q28, q26, [x11, #32] //load rk2, rk3
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
ldp q27, q28, [x11, #64] //load rk4, rk5
rev64 v15.16b, v15.16b //GHASH block 8k+7
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
ldp q26, q27, [x11, #96] //load rk6, rk7
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
ldr d16, [x10] //MODULO - load modulo constant
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
// Reduction, step 1: combine the three partial products, then fold the high
// part (v17) into the mid part (v18) using the constant in v16.
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ldp q28, q26, [x11, #128] //load rk8, rk9
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
ldr q27, [x11, #160] //load rk10 (applied in the tail, not here)
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
// Reduction, step 2: fold the mid part (v18) into the low part (v19).
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
// Round 9 is issued without a following aesmc for every block.
aese v6.16b, v26.16b //AES block 8k+14 - round 9
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v2.16b, v26.16b //AES block 8k+10 - round 9
aese v3.16b, v26.16b //AES block 8k+11 - round 9
aese v5.16b, v26.16b //AES block 8k+13 - round 9
aese v0.16b, v26.16b //AES block 8k+8 - round 9
aese v4.16b, v26.16b //AES block 8k+12 - round 9
aese v1.16b, v26.16b //AES block 8k+9 - round 9
aese v7.16b, v26.16b //AES block 8k+15 - round 9
// TAIL dispatch (AES-128 decrypt path).
// When falling through from PREPRETAIL, v0-v7 hold eight keystream blocks
// that still lack the last round key, v27 holds rk10 and v19 holds the
// reduced hash value. Other branches into this label lie outside this
// section. x4 is set before this section; presumably it is end_input_ptr
// as computed by the enc entry sequences - TODO confirm against the
// dec_128 prologue.
//
// The first remaining ciphertext block is always loaded into v9 and
// decrypted with v0 into v12. The stages that follow consume fixed
// keystream registers (v1 for the second block when more than 7 remain,
// up to v7 for the last stage). When fewer blocks remain, the unused
// keystream blocks are shifted up one register per skipped stage so that
// the stage entered finds the next unused keystream block in the register
// it expects. Each skipped stage also rewinds the counter v30 by one
// increment (v31), so the counter stored at the end of the function does
// not count keystream blocks that were generated but never used.
.L128_dec_tail: //TAIL
mov v29.16b, v27.16b //keep rk10 in v29; v27 is reused as GHASH scratch below
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
cmp x5, #112
ldp q24, q25, [x6, #160] //load h8k | h7k into q24 and the following 16 bytes (offset 176) into q25
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
ldp q20, q21, [x6, #96] //load h5l | h5h into q20 and the following 16 bytes (offset 112) into q21
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
ldp q22, q23, [x6, #128] //load h6l | h6h into q22 and the following 16 bytes (offset 144) into q23
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
b.gt .L128_dec_blocks_more_than_7
// At most 7 blocks left. The stage for more than 7 blocks is the only one
// that writes the GHASH accumulators instead of xoring into them, so they
// are cleared here for the later stages.
cmp x5, #96
mov v7.16b, v6.16b
movi v19.8b, #0
movi v17.8b, #0
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v2.16b
mov v2.16b, v1.16b
movi v18.8b, #0
sub v30.4s, v30.4s, v31.4s
b.gt .L128_dec_blocks_more_than_6
// At most 6 blocks left: shift again; v3 receives the second keystream block.
cmp x5, #80
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
b.gt .L128_dec_blocks_more_than_5
// At most 5 blocks left.
cmp x5, #64
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L128_dec_blocks_more_than_4
// At most 4 blocks left.
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v1.16b
cmp x5, #48
b.gt .L128_dec_blocks_more_than_3
// At most 3 blocks left. The skipped stage would have reloaded v24, which
// the next stage multiplies by, so it is loaded here instead.
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
cmp x5, #32
ldr q24, [x6, #64] //load h4k | h3k
mov v6.16b, v1.16b
b.gt .L128_dec_blocks_more_than_2
// At most 2 blocks left.
cmp x5, #16
mov v7.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L128_dec_blocks_more_than_1
// At most 1 block left. The skipped stage would have reloaded v21, which
// the final-block code multiplies by, so it is loaded here instead.
sub v30.4s, v30.4s, v31.4s
ldr q21, [x6, #16] //load h2k | h1k
b .L128_dec_blocks_less_than_1
// Tail stages for the AES-128 decrypt path. Control enters at the label
// matching the number of blocks left and falls through every later stage.
// On entry to a stage:
//   v9  = ciphertext block still to be hashed,
//   v12 = its decrypted result, not yet stored,
//   v16 = previous hash value to fold into the first block hashed (zeroed
//         after the first use so later blocks are not affected),
//   v29 = last round key, v17/v18/v19 = GHASH high/mid/low accumulators.
// Each stage stores v12, hashes v9 with the hash key register for its
// position (three carry-less multiplies: low, high, and the mid term built
// from the xor of the two 64-bit halves), loads the next ciphertext block
// and decrypts it with the next keystream register (v1 up to v7).
// v26, v27 and v28 are scratch for the partial products.
.L128_dec_blocks_more_than_7: //blocks left > 7
rev64 v8.16b, v9.16b //GHASH final-7 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low (written, not accumulated)
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high (written, not accumulated)
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid (written, not accumulated)
.L128_dec_blocks_more_than_6: //blocks left > 6
rev64 v8.16b, v9.16b //GHASH final-6 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
movi v16.8b, #0 //suppress further partial tag feed in
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
.L128_dec_blocks_more_than_5: //blocks left > 5
rev64 v8.16b, v9.16b //GHASH final-5 block
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid (copy to the upper half for pmull2)
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
.L128_dec_blocks_more_than_4: //blocks left > 4
rev64 v8.16b, v9.16b //GHASH final-4 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
.L128_dec_blocks_more_than_3: //blocks left > 3
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
rev64 v8.16b, v9.16b //GHASH final-3 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
ldr q25, [x6, #80] //load h4l | h4h
ldr q24, [x6, #64] //load h4k | h3k
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid (copy to the upper half for pmull2)
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
movi v16.8b, #0 //suppress further partial tag feed in
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
.L128_dec_blocks_more_than_2: //blocks left > 2
rev64 v8.16b, v9.16b //GHASH final-2 block
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q23, [x6, #48] //load h3l | h3h
movi v16.8b, #0 //suppress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
.L128_dec_blocks_more_than_1: //blocks left > 1
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
rev64 v8.16b, v9.16b //GHASH final-1 block
ldr q22, [x6, #32] //load h2l | h2h
eor v8.16b, v8.16b, v16.16b //feed in partial tag
movi v16.8b, #0 //suppress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
ldr q9, [x0], #16 //AES final block - load ciphertext
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
ldr q21, [x6, #16] //load h2k | h1k
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid (copy to the upper half for pmull2)
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
// Final (possibly partial) block, reduction, state write-back and return
// for the AES-128 decrypt path.
// On entry v9 holds the last ciphertext block as loaded (a full 16-byte
// load) and v12 its decrypted result. A 128-bit mask is built in v0 with
// ones over the bits that belong to the message. The mask is used twice:
//   - to clear the bits of v9 beyond the message before hashing it;
//   - to merge v12 with the 16 bytes already present at the output
//     pointer, so bytes past the end of the message are written back
//     unchanged by the full 16-byte store.
// x3, x9, x10 and x16 are set before this section. NOTE(review): in the enc
// entry sequences x9 is the byte length, x10 points at the reduction
// constant on the stack and x16 is the counter block pointer; presumably
// the dec_128 prologue does the same - confirm.
.L128_dec_blocks_less_than_1: //blocks left <= 1
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
mvn x7, xzr //temp0_x = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128 (x1 = number of unused bits in the last block, 0 when it is full)
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block (shift amount is taken modulo 64)
cmp x1, #64
mvn x8, xzr //temp1_x = 0xffffffffffffffff
csel x13, x8, x7, lt //low half: all ones if fewer than 64 bits are unused, else the shifted mask
csel x14, x7, xzr, lt //high half: the shifted mask if fewer than 64 bits are unused, else zero
mov v0.d[1], x14
mov v0.d[0], x13 //ctr0b is mask for last block
ldr q20, [x6] //load h1l | h1h
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v8.16b, v9.16b //GHASH final block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
ins v16.d[0], v8.d[1] //GHASH final block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
st1 { v12.16b}, [x2] //store all 16B
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
// Reduce the accumulated high (v17), mid (v18) and low (v19) parts to a
// single 128-bit value in v19 using the constant at [x10].
ldr d16, [x10] //MODULO - load modulo constant
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
// Convert the hash value back to its in-memory byte order and store it.
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
st1 { v19.16b }, [x3]
rev32 v30.16b, v30.16b //undo the per-word byte reversal applied to the counter
str q30, [x16] //store the updated counter
mov x0, x9 //return value
// Restore d8-d15 and release the 80-byte frame.
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
// Returns 0 without touching the stack. NOTE(review): presumably this is the
// zero-length early exit taken before the frame is pushed, mirroring the
// cbz at the top of the enc entry points - confirm against the dec_128
// prologue.
.L128_dec_ret:
mov w0, #0x0
ret
.size aesv8_gcm_8x_dec_128,.-aesv8_gcm_8x_dec_128
// aesv8_gcm_8x_enc_192 - entry sequence and first eight blocks.
// Argument registers as used by this code:
//   x0 = input pointer (plaintext is loaded from it)
//   x1 = length in bits (shifted right by 3 to get the byte length)
//   x2 = output pointer
//   x3 = pointer to the 16-byte hash value (loaded into v19)
//   x4 = pointer to the 16-byte counter block (copied to x16)
//   x5 = pointer to the round keys rk0..rk12 (copied to x11)
//   x6 = pointer to the table of hash key powers (read by the main loop)
// Branches to .L192_enc_ret immediately when x1 is zero, before any stack
// use. Otherwise an 80-byte frame is pushed: d8-d15 are saved in the first
// 64 bytes and the constant 0xc200000000000000 followed by a zero word is
// stored in the last 16 bytes, with x10 pointing at it.
// Working registers set up here:
//   x9  = byte length
//   x5  = main_end_input_ptr = x0 + ((byte length - 1) rounded down to a
//         multiple of 128)
//   x4  = end_input_ptr = x0 + byte length
//   v31 = counter increment (1 in the last 32-bit lane)
//   v30 = running counter with each 32-bit word byte-reversed
//   v0-v7 = counter blocks 0..7, run through AES rounds 0-11; rk12 (v26)
//         is applied by the eor3 that also applies the plaintext.
// If the whole input fits in 128 bytes control goes straight to the tail.
// Otherwise the first eight blocks are encrypted and stored, counter blocks
// 8..12 are prepared in v0-v4, and control continues with the main loop or
// with the prepretail.
.globl aesv8_gcm_8x_enc_192
.hidden aesv8_gcm_8x_enc_192
.type aesv8_gcm_8x_enc_192,%function
.align 4
aesv8_gcm_8x_enc_192:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_enc_ret
stp d8, d9, [sp, #-80]!
lsr x9, x1, #3 //byte_len
mov x16, x4
mov x11, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add x10, sp, #64 //x10 = address of the modulo constant
mov x5, x9
ld1 { v0.16b}, [x16] //CTR block 0
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15
rev32 v30.16b, v0.16b //set up reversed counter
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
sub x5, x5, #1 //byte_len - 1
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
ldp q26, q27, [x11, #0] //load rk0, rk1
add x5, x5, x0 //main_end_input_ptr
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
rev32 v7.16b, v30.16b //CTR block 7
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
add v30.4s, v30.4s, v31.4s //CTR block 7
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
add x4, x0, x1, lsr #3 //end_input_ptr
cmp x0, x5 //check if we have <= 8 blocks (flags are consumed by the b.ge to the tail below)
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 9
ld1 { v19.16b}, [x3] //load the current hash value
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 9
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 10
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 9
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 10
// Round 11 is issued without a following aesmc for every block.
aese v6.16b, v28.16b //AES block 6 - round 11
aese v3.16b, v28.16b //AES block 3 - round 11
aese v4.16b, v28.16b //AES block 4 - round 11
aese v7.16b, v28.16b //AES block 7 - round 11
ldr q26, [x11, #192] //load rk12
aese v1.16b, v28.16b //AES block 1 - round 11
aese v5.16b, v28.16b //AES block 5 - round 11
aese v2.16b, v28.16b //AES block 2 - round 11
aese v0.16b, v28.16b //AES block 0 - round 11
b.ge .L192_enc_tail //handle tail
// More than 128 bytes of input: encrypt and store the first eight blocks.
// Each eor3 applies rk12 and the plaintext in one step; the keystream
// register is then immediately reused for the next counter block.
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
rev32 v1.16b, v30.16b //CTR block 9
add v30.4s, v30.4s, v31.4s //CTR block 9
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
cmp x0, x5 //check if we have <= 8 blocks (flags are consumed by the b.ge to the prepretail below)
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
rev32 v4.16b, v30.16b //CTR block 12
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
add v30.4s, v30.4s, v31.4s //CTR block 12
b.ge .L192_enc_prepretail //do prepretail
.L192_enc_main_loop: //main loop start
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
ldp q26, q27, [x11, #0] //load rk0, rk1
rev64 v10.16b, v10.16b //GHASH block 8k+2
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v8.16b, v8.16b //GHASH block 8k
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
rev64 v9.16b, v9.16b //GHASH block 8k+1
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
eor v8.16b, v8.16b, v19.16b //PRE 1
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
ldp q28, q26, [x11, #128] //load rk8, rk9
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
ldr d16, [x10] //MODULO - load modulo constant
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
ldp q27, q28, [x11, #160] //load rk10, rk11
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
rev32 v20.16b, v30.16b //CTR block 8k+16
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
rev32 v22.16b, v30.16b //CTR block 8k+17
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
ldr q26, [x11, #192] //load rk12
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
aese v4.16b, v28.16b //AES block 8k+12 - round 11
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext
aese v2.16b, v28.16b //AES block 8k+10 - round 11
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
rev32 v23.16b, v30.16b //CTR block 8k+18
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
aese v5.16b, v28.16b //AES block 8k+13 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
aese v7.16b, v28.16b //AES block 8k+15 - round 11
aese v0.16b, v28.16b //AES block 8k+8 - round 11
.inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
aese v6.16b, v28.16b //AES block 8k+14 - round 11
aese v3.16b, v28.16b //AES block 8k+11 - round 11
aese v1.16b, v28.16b //AES block 8k+9 - round 11
rev32 v25.16b, v30.16b //CTR block 8k+19
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
.inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
.inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
.inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
mov v2.16b, v23.16b //CTR block 8k+18
.inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
mov v1.16b, v22.16b //CTR block 8k+17
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
mov v0.16b, v20.16b //CTR block 8k+16
rev32 v4.16b, v30.16b //CTR block 8k+20
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
.inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
.inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
mov v3.16b, v25.16b //CTR block 8k+19
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result
cmp x0, x5 //.LOOP CONTROL
stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result
b.lt .L192_enc_main_loop
// .L192_enc_prepretail
// AES side: builds counter blocks 8k+13..8k+15 in v5-v7 from the running counter
// v30 (v0-v4 are used as they arrive) and runs AES rounds 0-11 over v0-v7 with the
// round keys rk0..rk11 loaded from [x11]. Round 11 is issued as a bare aese (no
// aesmc); rk12 is loaded into v26 but not applied here - the tail code xors it in.
// GHASH side: the eight blocks in v8-v15 are byte-reversed (rev64), the previous
// accumulator v19 is folded into v8, and each block is multiplied by a table entry
// loaded from [x6] (h8 down to h1, per the load comments). Partial products are
// accumulated in v17 (high), v18 (mid) and v19 (low) and then reduced using the
// constant loaded from [x10]; the reduced value ends up in v19.
.L192_enc_prepretail: //PREPRETAIL
rev32 v5.16b, v30.16b //CTR block 8k+13
ldp q26, q27, [x11, #0] //load rk0, rk1
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev64 v8.16b, v8.16b //GHASH block 8k
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 - swap the 64-bit halves of the running accumulator
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev64 v10.16b, v10.16b //GHASH block 8k+2
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
eor v8.16b, v8.16b, v19.16b //PRE 1 - fold the running accumulator into block 8k
rev32 v7.16b, v30.16b //CTR block 8k+15
rev64 v9.16b, v9.16b //GHASH block 8k+1
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free)
rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free)
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free)
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free)
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
ldr d16, [x10] //MODULO - load modulo constant
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ldr q26, [x11, #192] //load rk12 (not applied in this block; the tail xors it into each result)
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v1.16b, v28.16b //AES block 8k+9 - round 11
aese v7.16b, v28.16b //AES block 8k+15 - round 11
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v3.16b, v28.16b //AES block 8k+11 - round 11
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v2.16b, v28.16b //AES block 8k+10 - round 11
aese v0.16b, v28.16b //AES block 8k+8 - round 11
aese v6.16b, v28.16b //AES block 8k+14 - round 11
aese v4.16b, v28.16b //AES block 8k+12 - round 11
aese v5.16b, v28.16b //AES block 8k+13 - round 11
// .L192_enc_tail
// Computes x5 = x4 - x0 as the remaining byte count (x4 is set outside this view;
// presumably the end-of-input pointer - confirm), encrypts the first remaining
// block into v9 (plaintext ^ v0 ^ rk12, with rk12 copied from v26 to v29) and
// saves the running tag, halves swapped, in v16.
// The cmp/b.gt ladder then enters the .L192_enc_blocks_more_than_N chain at the
// point matching the number of blocks left. For every level that is skipped:
//  - the keystream registers are shifted towards v7 so that the keystream that was
//    in v1 lands in the register the chosen entry point consumes next,
//  - the counter v30 is stepped back by one increment (v31).
// When fewer than 8 blocks remain, v17/v18/v19 are zeroed here because only the
// "more than 7" block initialises them by direct pmull writes.
.L192_enc_tail: //TAIL
ldp q20, q21, [x6, #96] //load h5l | h5h, h6k | h5k
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
ldp q24, q25, [x6, #160] //load h8k | h7k, h8l | h8h
mov v29.16b, v26.16b
ldp q22, q23, [x6, #128] //load h6l | h6h, h7l | h7h
cmp x5, #112
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
b.gt .L192_enc_blocks_more_than_7
cmp x5, #96
mov v7.16b, v6.16b
movi v17.8b, #0
mov v6.16b, v5.16b
movi v19.8b, #0
sub v30.4s, v30.4s, v31.4s
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v2.16b
mov v2.16b, v1.16b
movi v18.8b, #0
b.gt .L192_enc_blocks_more_than_6
mov v7.16b, v6.16b
cmp x5, #80
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L192_enc_blocks_more_than_5
cmp x5, #64
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v1.16b
b.gt .L192_enc_blocks_more_than_4
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
cmp x5, #48
b.gt .L192_enc_blocks_more_than_3
mov v7.16b, v6.16b
mov v6.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
ldr q24, [x6, #64] //load h4k | h3k
cmp x5, #32
b.gt .L192_enc_blocks_more_than_2
sub v30.4s, v30.4s, v31.4s
cmp x5, #16
mov v7.16b, v1.16b
b.gt .L192_enc_blocks_more_than_1
sub v30.4s, v30.4s, v31.4s
ldr q21, [x6, #16] //load h2k | h1k
b .L192_enc_blocks_less_than_1
// Stores the ciphertext of the final-7 block (v9), xors the saved tag (v16) into
// its byte-reversed copy and multiplies by v25 (low/high) and by the upper half of
// v24 (mid). This is the only tail step that writes v17/v18/v19 directly instead
// of accumulating into them. It then loads the next plaintext block and encrypts
// it with keystream v1 and rk12 (v29).
.L192_enc_blocks_more_than_7: //blocks left > 7
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
rev64 v8.16b, v9.16b //GHASH final-7 block
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
ldr q9, [x0], #16 //AES final-6 block - load plaintext
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
// Stores the ciphertext of the final-6 block (v9) and accumulates its GHASH
// partial products into v17 (high), v18 (mid) and v19 (low), using v23 for
// low/high and the lower half of v24 for mid. v16 carries the saved tag only if
// this is the first tail block processed; it is cleared afterwards. The next
// plaintext block is loaded and encrypted with keystream v2 and rk12 (v29).
.L192_enc_blocks_more_than_6: //blocks left > 6
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
rev64 v8.16b, v9.16b //GHASH final-6 block
ldr q9, [x0], #16 //AES final-5 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
// Stores the ciphertext of the final-5 block (v9) and accumulates its GHASH
// partial products into v17/v18/v19, using v22 for low/high. The mid term is
// duplicated into the upper lane of v27 so that pmull2 can pair it with the upper
// half of v21. The next plaintext block is loaded and encrypted with keystream v3
// and rk12 (v29).
.L192_enc_blocks_more_than_5: //blocks left > 5
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
rev64 v8.16b, v9.16b //GHASH final-5 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
ldr q9, [x0], #16 //AES final-4 block - load plaintext
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
movi v16.8b, #0 //suppress further partial tag feed in
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
// Stores the ciphertext of the final-4 block (v9) and accumulates its GHASH
// partial products into v17/v18/v19, using v20 for low/high and the lower half of
// v21 for mid. The next plaintext block is loaded and encrypted with keystream v4
// and rk12 (v29).
.L192_enc_blocks_more_than_4: //blocks left > 4
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
rev64 v8.16b, v9.16b //GHASH final-4 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-3 block - load plaintext
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
// Reloads v24/v25 from [x6, #64] and [x6, #80], stores the ciphertext of the
// final-3 block (v9) and accumulates its GHASH partial products into v17/v18/v19.
// v25 is used for low/high and the upper half of v24 (via pmull2) for mid.
// The next plaintext block is loaded and encrypted with keystream v5 and rk12 (v29).
.L192_enc_blocks_more_than_3: //blocks left > 3
ldr q24, [x6, #64] //load h4k | h3k
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
rev64 v8.16b, v9.16b //GHASH final-3 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
movi v16.8b, #0 //suppress further partial tag feed in
ldr q9, [x0], #16 //AES final-2 block - load plaintext
ldr q25, [x6, #80] //load h4l | h4h
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
// Stores the ciphertext of the final-2 block (v9) and accumulates its GHASH
// partial products into v17/v18/v19, using v23 (reloaded from [x6, #48]) for
// low/high and the lower half of v24 for mid. v24 must already hold the value
// from [x6, #64]: both the tail dispatch path and the preceding block load it
// before reaching here. The next plaintext block is loaded and encrypted with
// keystream v6 and rk12 (v29).
.L192_enc_blocks_more_than_2: //blocks left > 2
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
rev64 v8.16b, v9.16b //GHASH final-2 block
ldr q23, [x6, #48] //load h3l | h3h
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-1 block - load plaintext
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
movi v16.8b, #0 //suppress further partial tag feed in
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
// Stores the ciphertext of the final-1 block (v9) and accumulates its GHASH
// partial products into v17/v18/v19, using v22 (loaded from [x6, #32]) for
// low/high and the upper half of v21 (loaded from [x6, #16], via pmull2) for mid.
// The last plaintext block is loaded and encrypted with keystream v7 and rk12
// (v29); it is masked and stored by the code that follows.
.L192_enc_blocks_more_than_1: //blocks left > 1
ldr q22, [x6, #32] //load h2l | h2h
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
rev64 v8.16b, v9.16b //GHASH final-1 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
ldr q9, [x0], #16 //AES final block - load plaintext
ldr q21, [x6, #16] //load h2k | h1k
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
// Final (possibly partial) block, tag reduction and function epilogue.
//  1. Build a 128-bit byte mask in v0 from the bit length in x1: d[0] covers the
//     first 8 bytes in memory and d[1] the last 8.
//  2. Zero the unused bytes of the last ciphertext block (v9) for GHASH, then merge
//     the bytes already present at [x2] back into them before storing. A full 16
//     bytes are read from and written to [x2] even when the last block is partial.
//  3. Accumulate the last block into v17/v18/v19, reduce with the constant at
//     [x10], and store the byte-reversed result to [x3].
//  4. Store the updated counter (v30, converted back with rev32) to [x16].
//  5. Return x9 in x0 (x9 is set outside this view; presumably the byte length -
//     confirm), restore d8-d15 and release the 80-byte stack frame.
.L192_enc_blocks_less_than_1: //blocks left <= 1
mvn x7, xzr //temp0_x = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
and x1, x1, #127 //bit_length %= 128 (0 now means the last block is full)
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block (register shift uses x1 mod 64)
cmp x1, #64
mvn x8, xzr //temp1_x = 0xffffffffffffffff
csel x13, x8, x7, lt //low 64b mask: all ones if fewer than 64 bits are dropped, else the shifted mask
csel x14, x7, xzr, lt //high 64b mask: the shifted mask if fewer than 64 bits are dropped, else zero
mov v0.d[1], x14
ldr q20, [x6] //load h1l | h1h
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
mov v0.d[0], x13 //ctr0b is mask for last block
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v8.16b, v9.16b //GHASH final block
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
st1 { v9.16b}, [x2] //store all 16B
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v16.d[0], v8.d[1] //GHASH final block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
ldr d16, [x10] //MODULO - load modulo constant
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
rev32 v30.16b, v30.16b
str q30, [x16] //store the updated counter
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
st1 { v19.16b }, [x3] //store the updated tag
mov x0, x9 //return sizes
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80 //restore d8/d9 and pop the 80-byte frame
ret
// Early-return path: returns 0 in w0. No registers are restored and sp is not
// adjusted here, so this label must only be reached before the stack frame is
// set up (the branch to it is outside this view; presumably the zero-length
// check at function entry - confirm).
.L192_enc_ret:
mov w0, #0x0
ret
.size aesv8_gcm_8x_enc_192,.-aesv8_gcm_8x_enc_192
.globl aesv8_gcm_8x_dec_192
.hidden aesv8_gcm_8x_dec_192
.type aesv8_gcm_8x_dec_192,%function
.align 4
aesv8_gcm_8x_dec_192:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_dec_ret
stp d8, d9, [sp, #-80]!
lsr x9, x1, #3
mov x16, x4
mov x11, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add x10, sp, #64
mov x5, x9
ld1 { v0.16b}, [x16] //CTR block 0
ld1 { v19.16b}, [x3]
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15
rev32 v30.16b, v0.16b //set up reversed counter
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
rev32 v7.16b, v30.16b //CTR block 7
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
sub x5, x5, #1 //byte_len - 1
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
add v30.4s, v30.4s, v31.4s //CTR block 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
add x4, x0, x1, lsr #3 //end_input_ptr
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 9
ld1 { v19.16b}, [x3]
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 9
add x5, x5, x0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 9
cmp x0, x5 //check if we have <= 8 blocks
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 9
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 10
ldr q26, [x11, #192] //load rk12
aese v0.16b, v28.16b //AES block 0 - round 11
aese v1.16b, v28.16b //AES block 1 - round 11
aese v4.16b, v28.16b //AES block 4 - round 11
aese v6.16b, v28.16b //AES block 6 - round 11
aese v5.16b, v28.16b //AES block 5 - round 11
aese v7.16b, v28.16b //AES block 7 - round 11
aese v2.16b, v28.16b //AES block 2 - round 11
aese v3.16b, v28.16b //AES block 3 - round 11
b.ge .L192_dec_tail //handle tail
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
rev32 v1.16b, v30.16b //CTR block 9
add v30.4s, v30.4s, v31.4s //CTR block 9
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
cmp x0, x5 //check if we have <= 8 blocks
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result
rev32 v4.16b, v30.16b //CTR block 12
add v30.4s, v30.4s, v31.4s //CTR block 12
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
b.ge .L192_dec_prepretail //do prepretail
.L192_dec_main_loop: //main loop start
rev64 v9.16b, v9.16b //GHASH block 8k+1
ldp q26, q27, [x11, #0] //load rk0, rk1
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v8.16b, v8.16b //GHASH block 8k
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev64 v12.16b, v12.16b //GHASH block 8k+4
rev64 v11.16b, v11.16b //GHASH block 8k+3
eor v8.16b, v8.16b, v19.16b //PRE 1
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
rev64 v13.16b, v13.16b //GHASH block 8k+5
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
rev64 v10.16b, v10.16b //GHASH block 8k+2
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
rev64 v15.16b, v15.16b //GHASH block 8k+7
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
rev64 v14.16b, v14.16b //GHASH block 8k+6
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
ldr d16, [x10] //MODULO - load modulo constant
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
rev32 v20.16b, v30.16b //CTR block 8k+16
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
ldp q27, q28, [x11, #160] //load rk10, rk11
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
rev32 v22.16b, v30.16b //CTR block 8k+17
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
rev32 v23.16b, v30.16b //CTR block 8k+18
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
ldr q26, [x11, #192] //load rk12
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
aese v0.16b, v28.16b //AES block 8k+8 - round 11
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v1.16b, v28.16b //AES block 8k+9 - round 11
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
aese v6.16b, v28.16b //AES block 8k+14 - round 11
aese v3.16b, v28.16b //AES block 8k+11 - round 11
.inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result
rev32 v25.16b, v30.16b //CTR block 8k+19
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v4.16b, v28.16b //AES block 8k+12 - round 11
aese v2.16b, v28.16b //AES block 8k+10 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
aese v7.16b, v28.16b //AES block 8k+15 - round 11
aese v5.16b, v28.16b //AES block 8k+13 - round 11
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
.inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
.inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result
.inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result
.inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
.inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
mov v3.16b, v25.16b //CTR block 8k+19
.inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
cmp x0, x5 //.LOOP CONTROL
.inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
mov v0.16b, v20.16b //CTR block 8k+16
mov v1.16b, v22.16b //CTR block 8k+17
mov v2.16b, v23.16b //CTR block 8k+18
rev32 v4.16b, v30.16b //CTR block 8k+20
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
b.lt .L192_dec_main_loop
.L192_dec_prepretail: //PREPRETAIL
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev64 v8.16b, v8.16b //GHASH block 8k
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
eor v8.16b, v8.16b, v19.16b //PRE 1
rev64 v10.16b, v10.16b //GHASH block 8k+2
rev64 v9.16b, v9.16b //GHASH block 8k+1
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
rev64 v13.16b, v13.16b //GHASH block 8k+5
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
rev64 v15.16b, v15.16b //GHASH block 8k+7
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
rev64 v12.16b, v12.16b //GHASH block 8k+4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
rev64 v14.16b, v14.16b //GHASH block 8k+6
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
ldp q28, q26, [x11, #128] //load rk8, rk9
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
ldr d16, [x10] //MODULO - load modulo constant
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
ldp q27, q28, [x11, #160] //load rk10, rk11
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ldr q26, [x11, #192] //load rk12
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v0.16b, v28.16b //AES block 8k+8 - round 11
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
aese v5.16b, v28.16b //AES block 8k+13 - round 11
aese v2.16b, v28.16b //AES block 8k+10 - round 11
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
aese v6.16b, v28.16b //AES block 8k+14 - round 11
aese v4.16b, v28.16b //AES block 8k+12 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v3.16b, v28.16b //AES block 8k+11 - round 11
aese v1.16b, v28.16b //AES block 8k+9 - round 11
aese v7.16b, v28.16b //AES block 8k+15 - round 11
.L192_dec_tail: //TAIL
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
ldp q20, q21, [x6, #96] //load h5l | h5h and h6k | h5k
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
ldp q24, q25, [x6, #160] //load h8k | h7k and h8l | h8h
mov v29.16b, v26.16b
ldp q22, q23, [x6, #128] //load h6l | h6h and h7l | h7h
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
cmp x5, #112
b.gt .L192_dec_blocks_more_than_7
mov v7.16b, v6.16b
movi v17.8b, #0
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
cmp x5, #96
movi v19.8b, #0
mov v3.16b, v2.16b
mov v2.16b, v1.16b
movi v18.8b, #0
b.gt .L192_dec_blocks_more_than_6
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
cmp x5, #80
b.gt .L192_dec_blocks_more_than_5
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v1.16b
cmp x5, #64
sub v30.4s, v30.4s, v31.4s
b.gt .L192_dec_blocks_more_than_4
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v1.16b
cmp x5, #48
b.gt .L192_dec_blocks_more_than_3
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
cmp x5, #32
mov v6.16b, v1.16b
ldr q24, [x6, #64] //load h4k | h3k
b.gt .L192_dec_blocks_more_than_2
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v1.16b
cmp x5, #16
b.gt .L192_dec_blocks_more_than_1
sub v30.4s, v30.4s, v31.4s
ldr q21, [x6, #16] //load h2k | h1k
b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_7: //blocks left > 7
rev64 v8.16b, v9.16b //GHASH final-7 block
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
eor v8.16b, v8.16b, v16.16b //feed in partial tag
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
.L192_dec_blocks_more_than_6: //blocks left > 6
rev64 v8.16b, v9.16b //GHASH final-6 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
.L192_dec_blocks_more_than_5: //blocks left > 5
rev64 v8.16b, v9.16b //GHASH final-5 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
movi v16.8b, #0 //suppress further partial tag feed in
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
.L192_dec_blocks_more_than_4: //blocks left > 4
rev64 v8.16b, v9.16b //GHASH final-4 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
movi v16.8b, #0 //suppress further partial tag feed in
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
.L192_dec_blocks_more_than_3: //blocks left > 3
ldr q25, [x6, #80] //load h4l | h4h
rev64 v8.16b, v9.16b //GHASH final-3 block
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
movi v16.8b, #0 //suppress further partial tag feed in
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
ldr q24, [x6, #64] //load h4k | h3k
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
.L192_dec_blocks_more_than_2: //blocks left > 2
rev64 v8.16b, v9.16b //GHASH final-2 block
ldr q23, [x6, #48] //load h3l | h3h
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
.L192_dec_blocks_more_than_1: //blocks left > 1
rev64 v8.16b, v9.16b //GHASH final-1 block
ldr q9, [x0], #16 //AES final block - load ciphertext
ldr q22, [x6, #32] //load h2l | h2h
eor v8.16b, v8.16b, v16.16b //feed in partial tag
movi v16.8b, #0 //suppress further partial tag feed in
ldr q21, [x6, #16] //load h2k | h1k
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
.L192_dec_blocks_less_than_1: //blocks left <= 1
rev32 v30.16b, v30.16b
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
str q30, [x16] //store the updated counter
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
mvn x7, xzr //temp0_x = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128
mvn x8, xzr //temp1_x = 0xffffffffffffffff
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
cmp x1, #64
csel x13, x8, x7, lt
csel x14, x7, xzr, lt
ldr q20, [x6] //load h1l | h1h
mov v0.d[1], x14
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
mov v0.d[0], x13 //ctr0b is mask for last block
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
rev64 v8.16b, v9.16b //GHASH final block
st1 { v12.16b}, [x2] //store all 16B
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v16.d[0], v8.d[1] //GHASH final block - mid
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
ldr d16, [x10] //MODULO - load modulo constant
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
.inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
st1 { v19.16b }, [x3]
mov x0, x9
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L192_dec_ret:
mov w0, #0x0
ret
.size aesv8_gcm_8x_dec_192,.-aesv8_gcm_8x_dec_192
.globl aesv8_gcm_8x_enc_256
.hidden aesv8_gcm_8x_enc_256
.type aesv8_gcm_8x_enc_256,%function
.align 4
aesv8_gcm_8x_enc_256:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_enc_ret
stp d8, d9, [sp, #-80]!
lsr x9, x1, #3
mov x16, x4
mov x11, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add x10, sp, #64
ld1 { v0.16b}, [x16] //CTR block 0
mov x5, x9
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15
sub x5, x5, #1 //byte_len - 1
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add x5, x5, x0
rev32 v30.16b, v0.16b //set up reversed counter
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
rev32 v7.16b, v30.16b //CTR block 7
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
ld1 { v19.16b}, [x3]
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 10
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 9
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 10
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 11
ldp q26, q27, [x11, #192] //load rk12, rk13
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 11
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 11
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 11
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 11
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 11
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 7
ldr q28, [x11, #224] //load rk14
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 12
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 12
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 12
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 12
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 12
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 12
aese v2.16b, v27.16b //AES block 2 - round 13
aese v1.16b, v27.16b //AES block 1 - round 13
aese v4.16b, v27.16b //AES block 4 - round 13
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 12
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 12
aese v0.16b, v27.16b //AES block 0 - round 13
aese v5.16b, v27.16b //AES block 5 - round 13
aese v6.16b, v27.16b //AES block 6 - round 13
aese v7.16b, v27.16b //AES block 7 - round 13
aese v3.16b, v27.16b //AES block 3 - round 13
add x4, x0, x1, lsr #3 //end_input_ptr
cmp x0, x5 //check if we have <= 8 blocks
b.ge .L256_enc_tail //handle tail
ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext
ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
rev32 v1.16b, v30.16b //CTR block 9
add v30.4s, v30.4s, v31.4s //CTR block 9
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
cmp x0, x5 //check if we have <= 8 blocks
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
stp q8, q9, [x2], #32 //AES block 0, 1 - store result
stp q10, q11, [x2], #32 //AES block 2, 3 - store result
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
rev32 v4.16b, v30.16b //CTR block 12
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
add v30.4s, v30.4s, v31.4s //CTR block 12
b.ge .L256_enc_prepretail //do prepretail
.L256_enc_main_loop: //main loop start
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev64 v11.16b, v11.16b //GHASH block 8k+3
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
rev64 v9.16b, v9.16b //GHASH block 8k+1
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
rev64 v8.16b, v8.16b //GHASH block 8k
rev64 v12.16b, v12.16b //GHASH block 8k+4
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
eor v8.16b, v8.16b, v19.16b //PRE 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
rev64 v14.16b, v14.16b //GHASH block 8k+6
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
ldp q27, q28, [x11, #64] //load rk4, rk5
rev64 v10.16b, v10.16b //GHASH block 8k+2
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
rev64 v13.16b, v13.16b //GHASH block 8k+5
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
rev64 v15.16b, v15.16b //GHASH block 8k+7
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
ldr d16, [x10] //MODULO - load modulo constant
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
ldp q26, q27, [x11, #192] //load rk12, rk13
rev32 v20.16b, v30.16b //CTR block 8k+16
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
rev32 v22.16b, v30.16b //CTR block 8k+17
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
ldr q28, [x11, #224] //load rk14
aese v7.16b, v27.16b //AES block 8k+15 - round 13
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext
ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext
aese v2.16b, v27.16b //AES block 8k+10 - round 13
aese v4.16b, v27.16b //AES block 8k+12 - round 13
rev32 v23.16b, v30.16b //CTR block 8k+18
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
aese v5.16b, v27.16b //AES block 8k+13 - round 13
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
aese v3.16b, v27.16b //AES block 8k+11 - round 13
cmp x0, x5 //.LOOP CONTROL
.inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
rev32 v25.16b, v30.16b //CTR block 8k+19
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
aese v0.16b, v27.16b //AES block 8k+8 - round 13
aese v6.16b, v27.16b //AES block 8k+14 - round 13
.inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v1.16b, v27.16b //AES block 8k+9 - round 13
.inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
rev32 v4.16b, v30.16b //CTR block 8k+20
.inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
mov v3.16b, v25.16b //CTR block 8k+19
.inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
.inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result
mov v2.16b, v23.16b //CTR block 8k+18
.inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result
.inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
mov v1.16b, v22.16b //CTR block 8k+17
stp q12, q13, [x2], #32 //AES block 4, 5 - store result
stp q14, q15, [x2], #32 //AES block 6, 7 - store result
mov v0.16b, v20.16b //CTR block 8k+16
b.lt .L256_enc_main_loop
.L256_enc_prepretail: //PREPRETAIL
// Prepretail section of the AES-256-GCM encrypt path.
// Nothing is loaded from the input and nothing is stored in this section:
//  - counter blocks 8k+13..8k+15 are generated into v5-v7 (v30 is advanced
//    by v31 three times; v0-v4 are used as they are on entry)
//  - AES rounds 0-13 are applied to the eight counter blocks v0-v7 with the
//    round keys read from x11; round 13 is aese without aesmc, and the final
//    key rk14 is left in v28 for the code that follows to apply
//  - the eight blocks held in v8-v15 are byte-reversed (rev64) and multiplied
//    by the hash-key values read from x6, after the running value in v19 has
//    been xored into the first of them (PRE 0 / PRE 1)
//  - the high/mid/low accumulators v17/v18/v19 are reduced with the constant
//    read from [x10]; the reduced value is left in v19
// Execution then falls through into .L256_enc_tail.
rev32 v5.16b, v30.16b //CTR block 8k+13
ldp q26, q27, [x11, #0] //load rk0, rk1
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v10.16b, v10.16b //GHASH block 8k+2
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
rev64 v13.16b, v13.16b //GHASH block 8k+5
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev32 v7.16b, v30.16b //CTR block 8k+15
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 - swap the 64-bit halves of the running value
rev64 v8.16b, v8.16b //GHASH block 8k
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
rev64 v9.16b, v9.16b //GHASH block 8k+1
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
eor v8.16b, v8.16b, v19.16b //PRE 1 - xor the running value into block 8k
rev64 v11.16b, v11.16b //GHASH block 8k+3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
rev64 v14.16b, v14.16b //GHASH block 8k+6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
rev64 v12.16b, v12.16b //GHASH block 8k+4
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
rev64 v15.16b, v15.16b //GHASH block 8k+7
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
ldp q27, q28, [x11, #160] //load rk10, rk11
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
ldr d16, [x10] //MODULO - load modulo constant
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
// From here the v17/v18/v19 accumulators are complete; the MODULO steps
// below fold high into mid and mid into low using the constant in v16.
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
ldp q26, q27, [x11, #192] //load rk12, rk13
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
ldr q28, [x11, #224] //load rk14 (kept in v28 past the end of this section)
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
// Round 13 is aese only (no aesmc); the xor with rk14 is not done here.
aese v0.16b, v27.16b //AES block 8k+8 - round 13
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
aese v5.16b, v27.16b //AES block 8k+13 - round 13
aese v1.16b, v27.16b //AES block 8k+9 - round 13
aese v3.16b, v27.16b //AES block 8k+11 - round 13
aese v4.16b, v27.16b //AES block 8k+12 - round 13
aese v7.16b, v27.16b //AES block 8k+15 - round 13
aese v2.16b, v27.16b //AES block 8k+10 - round 13
aese v6.16b, v27.16b //AES block 8k+14 - round 13
.L256_enc_tail: //TAIL
// Tail dispatch for the last 1..8 blocks.
// x5 is recomputed as x4 - x0 (bytes still to process; x4 is presumably the
// end-of-input pointer set up earlier in the function - TODO confirm) and is
// compared against 112/96/80/64/48/32/16 to select the entry label.
// The first remaining block is encrypted here: v9 = plaintext ^ v0 ^ v29,
// where v29 is a copy of v28 (rk14 on the fall-through path from the
// prepretail).
// Every time a threshold is not exceeded:
//  - the pending keystream blocks are moved one register higher, so that the
//    keystream originally in v1 ends up in the register read by the section
//    that is eventually entered (v2 at more_than_6, v3 at more_than_5, ...,
//    v7 at more_than_1)
//  - v30 is decremented by v31 once, undoing one counter increment for a
//    block that will not be consumed
ldp q24, q25, [x6, #160] //load h8k | h7k (v24) and h8l | h8h (v25)
sub x5, x4, x0 //x5 (main_end_input_ptr until now) becomes the number of bytes left to process
ldr q8, [x0], #16 //AES block 8k+8 - load plaintext
ldp q20, q21, [x6, #96] //load h5l | h5h (v20) and h6k | h5k (v21)
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
ldp q22, q23, [x6, #128] //load h6l | h6h (v22) and h7l | h7h (v23)
mov v29.16b, v28.16b //keep the last round key in v29; v28 is reused as a temporary below
cmp x5, #112
.inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result
b.gt .L256_enc_blocks_more_than_7
// At most 7 blocks left: the GHASH accumulators start from zero here
// (the 8-block path writes them directly instead of accumulating).
movi v19.8b, #0 //GHASH low accumulator = 0
mov v7.16b, v6.16b
movi v17.8b, #0 //GHASH high accumulator = 0
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v2.16b
sub v30.4s, v30.4s, v31.4s //one counter block fewer is consumed
mov v2.16b, v1.16b
movi v18.8b, #0 //GHASH mid accumulator = 0
cmp x5, #96
b.gt .L256_enc_blocks_more_than_6
// At most 6 blocks left
mov v7.16b, v6.16b
mov v6.16b, v5.16b
cmp x5, #80
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L256_enc_blocks_more_than_5
// At most 5 blocks left
mov v7.16b, v6.16b
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v4.16b
cmp x5, #64
mov v4.16b, v1.16b
b.gt .L256_enc_blocks_more_than_4
// At most 4 blocks left
cmp x5, #48
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L256_enc_blocks_more_than_3
// At most 3 blocks left
cmp x5, #32
mov v7.16b, v6.16b
ldr q24, [x6, #64] //load h4k | h3k (read by .L256_enc_blocks_more_than_2, which does not load it itself)
mov v6.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
b.gt .L256_enc_blocks_more_than_2
// At most 2 blocks left
mov v7.16b, v1.16b
sub v30.4s, v30.4s, v31.4s
cmp x5, #16
b.gt .L256_enc_blocks_more_than_1
// At most 1 block left
sub v30.4s, v30.4s, v31.4s
ldr q21, [x6, #16] //load h2k | h1k (read by .L256_enc_blocks_less_than_1, which does not load it itself)
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_7: //blocks left > 7
// Stores the encrypted block in v9, multiplies it (byte-reversed and xored
// with the partial tag v16) by v25 for the high/low parts and by the upper
// half of v24 for the mid part, and encrypts the next input block with v1.
// v17/v18/v19 are written directly here rather than accumulated.
st1 { v9.16b}, [x2], #16 //AES final-7 block - store result
rev64 v8.16b, v9.16b //GHASH final-7 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-6 block - load plaintext
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
.inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
.L256_enc_blocks_more_than_6: //blocks left > 6
// Stores the encrypted block in v9, accumulates its GHASH contribution into
// v17 (high), v18 (mid) and v19 (low) using v23 and the lower half of v24,
// and encrypts the next input block with the keystream in v2.
// v26, v27 and v28 are temporaries.
st1 { v9.16b}, [x2], #16 //AES final-6 block - store result
rev64 v8.16b, v9.16b //GHASH final-6 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
ldr q9, [x0], #16 //AES final-5 block - load plaintext
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
.inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
movi v16.8b, #0 //suppress further partial tag feed in
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
.L256_enc_blocks_more_than_5: //blocks left > 5
// Stores the encrypted block in v9, accumulates its GHASH contribution into
// v17/v18/v19 using v22 and the upper half of v21 (the mid term is copied
// into both halves of v27 so that pmull2 can be used), and encrypts the
// next input block with the keystream in v3.
st1 { v9.16b}, [x2], #16 //AES final-5 block - store result
rev64 v8.16b, v9.16b //GHASH final-5 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
ldr q9, [x0], #16 //AES final-4 block - load plaintext
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
.inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
.L256_enc_blocks_more_than_4: //blocks left > 4
// Stores the encrypted block in v9, accumulates its GHASH contribution into
// v17/v18/v19 using v20 and the lower half of v21, and encrypts the next
// input block with the keystream in v4.
st1 { v9.16b}, [x2], #16 //AES final-4 block - store result
rev64 v8.16b, v9.16b //GHASH final-4 block
ldr q9, [x0], #16 //AES final-3 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
.inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
.L256_enc_blocks_more_than_3: //blocks left > 3
// Stores the encrypted block in v9, reloads v25 and v24 from the table at
// x6 (offsets 80 and 64), accumulates the GHASH contribution of the block
// into v17/v18/v19 using v25 and the upper half of v24, and encrypts the
// next input block with the keystream in v5.
st1 { v9.16b}, [x2], #16 //AES final-3 block - store result
ldr q25, [x6, #80] //load h4l | h4h
rev64 v8.16b, v9.16b //GHASH final-3 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
ldr q24, [x6, #64] //load h4k | h3k
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
ldr q9, [x0], #16 //AES final-2 block - load plaintext
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
.inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
movi v16.8b, #0 //suppress further partial tag feed in
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
.L256_enc_blocks_more_than_2: //blocks left > 2
// Stores the encrypted block in v9, loads v23 from the table at x6 (offset
// 48), accumulates the GHASH contribution of the block into v17/v18/v19
// using v23 and the lower half of v24, and encrypts the next input block
// with the keystream in v6.
// v24 is not loaded in this section; it must already hold h4k | h3k on entry.
ldr q23, [x6, #48] //load h3l | h3h
st1 { v9.16b}, [x2], #16 //AES final-2 block - store result
rev64 v8.16b, v9.16b //GHASH final-2 block
ldr q9, [x0], #16 //AES final-1 block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
movi v16.8b, #0 //suppress further partial tag feed in
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
.inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
.L256_enc_blocks_more_than_1: //blocks left > 1
// Stores the encrypted block in v9, loads v22 and v21 from the table at x6
// (offsets 32 and 16), accumulates the GHASH contribution of the block into
// v17/v18/v19 using v22 and the upper half of v21, and encrypts the final
// input block with the keystream in v7. The final block is loaded as a full
// 16 bytes; the bits past the message length are masked off in the section
// that follows.
st1 { v9.16b}, [x2], #16 //AES final-1 block - store result
ldr q22, [x6, #32] //load h2l | h2h
rev64 v8.16b, v9.16b //GHASH final-1 block
ldr q9, [x0], #16 //AES final block - load plaintext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
movi v16.8b, #0 //suppress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
.inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
ldr q21, [x6, #16] //load h2k | h1k
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
.L256_enc_blocks_less_than_1: //blocks left <= 1
// Final (possibly partial) block, tag finalisation and function epilogue.
//  1. Builds a 128-bit mask in v0 from the bit length in x1: after the
//     arithmetic below x1 = (128 - (x1 mod 128)) mod 128, i.e. the number of
//     unused bits in the last block (0 when the block is full).
//  2. Masks the encrypted block in v9, merges it with the 16 bytes already
//     present at [x2] and stores 16 bytes back; a full 16 bytes at [x2] are
//     read and written even when the last block is partial.
//  3. Writes the counter (v30, converted back with rev32) to [x16].
//  4. Accumulates the GHASH contribution of the masked block using v20 and
//     the lower half of v21 (v21 is not loaded here and must hold
//     h2k | h1k on entry), reduces v17/v18/v19 with the constant at [x10]
//     and stores the result to [x3].
//  5. Returns x9 in x0 and restores d8-d15 from the 80-byte stack frame.
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
mvn x7, xzr //temp0_x = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
cmp x1, #64
mvn x8, xzr //temp1_x = 0xffffffffffffffff
csel x14, x7, xzr, lt //upper 64 mask bits: partial mask if x1 < 64, else zero
csel x13, x8, x7, lt //lower 64 mask bits: all ones if x1 < 64, else partial mask
mov v0.d[0], x13 //ctr0b is mask for last block
ldr q20, [x6] //load h1l | h1h
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
mov v0.d[1], x14
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
rev64 v8.16b, v9.16b //GHASH final block
rev32 v30.16b, v30.16b //convert the counter back from its byte-reversed working form
bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
str q30, [x16] //store the updated counter
eor v8.16b, v8.16b, v16.16b //feed in partial tag
st1 { v9.16b}, [x2] //store all 16B
ins v16.d[0], v8.d[1] //GHASH final block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
ldr d16, [x10] //MODULO - load modulo constant
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
.inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low
ext v19.16b, v19.16b, v19.16b, #8 //swap the 64-bit halves of the reduced value
rev64 v19.16b, v19.16b //byte-reverse each half before storing
st1 { v19.16b }, [x3] //store the result through x3
mov x0, x9 //return sizes (x9 is presumably the byte length computed at function entry - TODO confirm)
ldp d10, d11, [sp, #16] //restore d10-d15, then d8/d9 while releasing the 80-byte frame
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L256_enc_ret:
// Early-exit return path: returns 0 without touching the stack frame.
// NOTE(review): presumably reached from a zero-length check at function
// entry (the sibling decrypt routine does cbz x1 to its own ret label) -
// the branch itself is outside this view, confirm.
mov w0, #0x0 //writing w0 also clears the upper 32 bits of x0
ret
.size aesv8_gcm_8x_enc_256,.-aesv8_gcm_8x_enc_256
.globl aesv8_gcm_8x_dec_256
.hidden aesv8_gcm_8x_dec_256
.type aesv8_gcm_8x_dec_256,%function
.align 4
aesv8_gcm_8x_dec_256:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_dec_ret
stp d8, d9, [sp, #-80]!
lsr x9, x1, #3
mov x16, x4
mov x11, x5
stp d10, d11, [sp, #16]
stp d12, d13, [sp, #32]
stp d14, d15, [sp, #48]
mov x5, #0xc200000000000000
stp x5, xzr, [sp, #64]
add x10, sp, #64
ld1 { v0.16b}, [x16] //CTR block 0
mov x15, #0x100000000 //set up counter increment
movi v31.16b, #0x0
mov v31.d[1], x15
mov x5, x9
sub x5, x5, #1 //byte_len - 1
rev32 v30.16b, v0.16b //set up reversed counter
add v30.4s, v30.4s, v31.4s //CTR block 0
rev32 v1.16b, v30.16b //CTR block 1
add v30.4s, v30.4s, v31.4s //CTR block 1
rev32 v2.16b, v30.16b //CTR block 2
add v30.4s, v30.4s, v31.4s //CTR block 2
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v3.16b, v30.16b //CTR block 3
add v30.4s, v30.4s, v31.4s //CTR block 3
rev32 v4.16b, v30.16b //CTR block 4
add v30.4s, v30.4s, v31.4s //CTR block 4
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 0
rev32 v5.16b, v30.16b //CTR block 5
add v30.4s, v30.4s, v31.4s //CTR block 5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 0
rev32 v6.16b, v30.16b //CTR block 6
add v30.4s, v30.4s, v31.4s //CTR block 6
rev32 v7.16b, v30.16b //CTR block 7
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 1
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 2
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 2
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 3
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 3
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 3
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 3
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 3
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 4
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 4
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 4
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 5
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 6
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 7
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 7
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 7
and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 8
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 8
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 9
ld1 { v19.16b}, [x3]
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
ldp q27, q28, [x11, #160] //load rk10, rk11
add x4, x0, x1, lsr #3 //end_input_ptr
add x5, x5, x0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 9
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 9
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 9
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 9
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 4 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 7 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 5 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 1 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 2 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 0 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 6 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 3 - round 10
ldp q26, q27, [x11, #192] //load rk12, rk13
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 0 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 7
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 7 - round 11
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 3 - round 11
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 1 - round 11
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 5 - round 11
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 4 - round 11
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 2 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 6 - round 11
ldr q28, [x11, #224] //load rk14
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 1 - round 12
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 4 - round 12
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 5 - round 12
cmp x0, x5 //check if we have <= 8 blocks
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 3 - round 12
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 2 - round 12
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 6 - round 12
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 0 - round 12
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 7 - round 12
aese v5.16b, v27.16b //AES block 5 - round 13
aese v1.16b, v27.16b //AES block 1 - round 13
aese v2.16b, v27.16b //AES block 2 - round 13
aese v0.16b, v27.16b //AES block 0 - round 13
aese v4.16b, v27.16b //AES block 4 - round 13
aese v6.16b, v27.16b //AES block 6 - round 13
aese v3.16b, v27.16b //AES block 3 - round 13
aese v7.16b, v27.16b //AES block 7 - round 13
b.ge .L256_dec_tail //handle tail
ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext
ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext
ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext
ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext
cmp x0, x5 //check if we have <= 8 blocks
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result
stp q0, q1, [x2], #32 //AES block 0, 1 - store result
rev32 v0.16b, v30.16b //CTR block 8
add v30.4s, v30.4s, v31.4s //CTR block 8
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result
rev32 v1.16b, v30.16b //CTR block 9
add v30.4s, v30.4s, v31.4s //CTR block 9
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result
stp q2, q3, [x2], #32 //AES block 2, 3 - store result
rev32 v2.16b, v30.16b //CTR block 10
add v30.4s, v30.4s, v31.4s //CTR block 10
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result
rev32 v3.16b, v30.16b //CTR block 11
add v30.4s, v30.4s, v31.4s //CTR block 11
stp q4, q5, [x2], #32 //AES block 4, 5 - store result
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result
stp q6, q7, [x2], #32 //AES block 6, 7 - store result
rev32 v4.16b, v30.16b //CTR block 12
add v30.4s, v30.4s, v31.4s //CTR block 12
b.ge .L256_dec_prepretail //do prepretail
.L256_dec_main_loop: //main loop start
// One pass handles eight 16-byte blocks and interleaves three streams of work:
//   AES:    counter blocks in v0-v7 are taken through rounds 0-13 with round
//           keys rk0-rk13 streamed from [x11] into v26-v28. rk14 (v28) is
//           applied by the closing eor3, which also XORs in the ciphertext.
//   GHASH:  the eight ciphertext blocks already sitting in v8-v15 (loaded on
//           the previous pass, or before the loop on the first pass) are
//           byte-reversed and multiplied by the table entries read from [x6]
//           (the h1..h8 and hNk values named in the load comments), with the
//           high/mid/low partial products accumulated in v17/v18/v19.
//   MODULO: the accumulated product is reduced with the constant read from
//           [x10]; the reduced running hash ends up in v19.
// Late in the pass the next eight ciphertext blocks are loaded from [x0] into
// v8-v15, XORed with the keystream and stored to [x2]; those same v8-v15
// values are the GHASH input of the following pass (or of the prepretail).
// v30 is the running counter; it is advanced by v31 once per counter block.
// The pass repeats while x0 < x5 (signed compare at the bottom of the loop).
rev32 v5.16b, v30.16b //CTR block 8k+13
ldp q26, q27, [x11, #0] //load rk0, rk1
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v9.16b, v9.16b //GHASH block 8k+1
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev32 v6.16b, v30.16b //CTR block 8k+14
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
rev64 v8.16b, v8.16b //GHASH block 8k
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
rev64 v12.16b, v12.16b //GHASH block 8k+4
rev64 v11.16b, v11.16b //GHASH block 8k+3
rev32 v7.16b, v30.16b //CTR block 8k+15
rev64 v15.16b, v15.16b //GHASH block 8k+7
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
// Fold the running hash (v19, halves swapped by PRE 0) into the first block.
eor v8.16b, v8.16b, v19.16b //PRE 1
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
rev64 v10.16b, v10.16b //GHASH block 8k+2
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
ldp q26, q27, [x11, #96] //load rk6, rk7
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
rev64 v13.16b, v13.16b //GHASH block 8k+5
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
rev64 v14.16b, v14.16b //GHASH block 8k+6
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
ldp q28, q26, [x11, #128] //load rk8, rk9
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
ldp q27, q28, [x11, #160] //load rk10, rk11
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
// From here on v8-v15 are progressively reloaded with the next ciphertext;
// each register is overwritten only after its last GHASH use above/below.
ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
// Counter blocks for the next pass (8k+16..8k+19) are staged in v20, v22, v23
// and v25, which are free now that the GHASH multiplies are done; they are
// moved into v0-v3 after the current results have been stored.
rev32 v20.16b, v30.16b //CTR block 8k+16
ldr d16, [x10] //MODULO - load modulo constant
add v30.4s, v30.4s, v31.4s //CTR block 8k+16
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
ldp q26, q27, [x11, #192] //load rk12, rk13
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
rev32 v22.16b, v30.16b //CTR block 8k+17
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
add v30.4s, v30.4s, v31.4s //CTR block 8k+17
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
rev32 v23.16b, v30.16b //CTR block 8k+18
add v30.4s, v30.4s, v31.4s //CTR block 8k+18
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
ldr q28, [x11, #224] //load rk14
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext
// Round 13 is aese only (no aesmc); the last round key rk14 in v28 is folded
// into the eor3 that produces each result.
aese v1.16b, v27.16b //AES block 8k+9 - round 13
aese v2.16b, v27.16b //AES block 8k+10 - round 13
ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext
aese v0.16b, v27.16b //AES block 8k+8 - round 13
aese v5.16b, v27.16b //AES block 8k+13 - round 13
rev32 v25.16b, v30.16b //CTR block 8k+19
.inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result
.inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v7.16b, v27.16b //AES block 8k+15 - round 13
add v30.4s, v30.4s, v31.4s //CTR block 8k+19
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v4.16b, v27.16b //AES block 8k+12 - round 13
.inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result
.inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result
aese v3.16b, v27.16b //AES block 8k+11 - round 13
stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result
mov v0.16b, v20.16b //CTR block 8k+16
.inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
.inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result
stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result
mov v3.16b, v25.16b //CTR block 8k+19
mov v2.16b, v23.16b //CTR block 8k+18
aese v6.16b, v27.16b //AES block 8k+14 - round 13
mov v1.16b, v22.16b //CTR block 8k+17
stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result
.inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result
.inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result
rev32 v4.16b, v30.16b //CTR block 8k+20
add v30.4s, v30.4s, v31.4s //CTR block 8k+20
cmp x0, x5 //.LOOP CONTROL
stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result
b.lt .L256_dec_main_loop
.L256_dec_prepretail: //PREPRETAIL
// Reached by falling out of the main loop, or directly from the pre-loop code
// when x0 >= x5 after the first eight blocks. It performs the same GHASH and
// modular reduction as a main-loop pass on the eight ciphertext blocks held in
// v8-v15, and in parallel runs AES rounds 0-13 on counter blocks v0-v7, but it
// reads no ciphertext from [x0] and stores nothing to [x2].
// On exit: v19 holds the reduced running hash, v0-v7 hold the AES state after
// round 13 (final round key not yet applied), v28 holds rk14, and execution
// falls through into the tail.
ldp q26, q27, [x11, #0] //load rk0, rk1
rev32 v5.16b, v30.16b //CTR block 8k+13
add v30.4s, v30.4s, v31.4s //CTR block 8k+13
rev64 v12.16b, v12.16b //GHASH block 8k+4
ldr q21, [x6, #112] //load h6k | h5k
ldr q24, [x6, #160] //load h8k | h7k
rev32 v6.16b, v30.16b //CTR block 8k+14
rev64 v8.16b, v8.16b //GHASH block 8k
add v30.4s, v30.4s, v31.4s //CTR block 8k+14
ext v19.16b, v19.16b, v19.16b, #8 //PRE 0
ldr q23, [x6, #144] //load h7l | h7h
ldr q25, [x6, #176] //load h8l | h8h
rev64 v9.16b, v9.16b //GHASH block 8k+1
rev32 v7.16b, v30.16b //CTR block 8k+15
rev64 v10.16b, v10.16b //GHASH block 8k+2
ldr q20, [x6, #96] //load h5l | h5h
ldr q22, [x6, #128] //load h6l | h6h
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 0
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 0
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 0
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 0
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 0
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 0
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 1
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 0
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 0
ldp q28, q26, [x11, #32] //load rk2, rk3
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 1
// Fold the running hash (v19, halves swapped by PRE 0) into the first block.
eor v8.16b, v8.16b, v19.16b //PRE 1
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 1
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 1
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 1
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 1
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 1
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 1
pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high
trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low
rev64 v11.16b, v11.16b //GHASH block 8k+3
pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 2
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 2
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 2
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 2
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 2
pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 2
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 3
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 3
rev64 v14.16b, v14.16b //GHASH block 8k+6
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 3
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 2
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 3
pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high
trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 2
ldp q27, q28, [x11, #64] //load rk4, rk5
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 3
pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 3
eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high
eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 3
pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 3
.inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high
trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid
pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid
pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low
eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low
pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 4
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 4
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low
ldr q20, [x6] //load h1l | h1h
ldr q22, [x6, #32] //load h2l | h2h
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 4
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 4
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 4
eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid
eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 5
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 4
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 5
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 4
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 4
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 5
pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 5
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 5
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 5
pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 5
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 5
ldp q26, q27, [x11, #96] //load rk6, rk7
ldr q23, [x6, #48] //load h3l | h3h
ldr q25, [x6, #80] //load h4l | h4h
rev64 v15.16b, v15.16b //GHASH block 8k+7
rev64 v13.16b, v13.16b //GHASH block 8k+5
.inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid
trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 6
ldr q21, [x6, #16] //load h2k | h1k
ldr q24, [x6, #64] //load h4k | h3k
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 6
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 6
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 6
pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high
pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high
pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low
trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid
pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low
trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 7
pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 6
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 6
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 6
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 6
ldp q28, q26, [x11, #128] //load rk8, rk9
pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 7
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 7
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 7
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 7
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 7
.inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 7
trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 7
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 8
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 8
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 8
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 8
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 8
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 8
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 8
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 9
eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 9
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 9
eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 9
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 9
pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 8
pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid
pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high
pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid
pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid
pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low
ldp q27, q28, [x11, #160] //load rk10, rk11
.inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low
.inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 9
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 9
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 9
.inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high
.inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low
ldr d16, [x10] //MODULO - load modulo constant
.inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid
aese v4.16b, v27.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 10
aese v6.16b, v27.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 10
aese v5.16b, v27.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 10
aese v0.16b, v27.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 10
aese v2.16b, v27.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 10
aese v3.16b, v27.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 10
.inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
aese v7.16b, v27.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 10
aese v1.16b, v27.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 10
ldp q26, q27, [x11, #192] //load rk12, rk13
ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
aese v2.16b, v28.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 11
aese v1.16b, v28.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 11
aese v0.16b, v28.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 11
pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
aese v3.16b, v28.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 11
aese v7.16b, v28.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 11
aese v6.16b, v28.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 11
aese v4.16b, v28.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 11
aese v5.16b, v28.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 11
aese v3.16b, v26.16b
aesmc v3.16b, v3.16b //AES block 8k+11 - round 12
.inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid
// Round 13 is aese only (no aesmc); rk14 is left in v28 for the tail to apply.
aese v3.16b, v27.16b //AES block 8k+11 - round 13
aese v2.16b, v26.16b
aesmc v2.16b, v2.16b //AES block 8k+10 - round 12
aese v6.16b, v26.16b
aesmc v6.16b, v6.16b //AES block 8k+14 - round 12
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
aese v4.16b, v26.16b
aesmc v4.16b, v4.16b //AES block 8k+12 - round 12
aese v7.16b, v26.16b
aesmc v7.16b, v7.16b //AES block 8k+15 - round 12
aese v0.16b, v26.16b
aesmc v0.16b, v0.16b //AES block 8k+8 - round 12
ldr q28, [x11, #224] //load rk14
aese v1.16b, v26.16b
aesmc v1.16b, v1.16b //AES block 8k+9 - round 12
aese v4.16b, v27.16b //AES block 8k+12 - round 13
ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
aese v5.16b, v26.16b
aesmc v5.16b, v5.16b //AES block 8k+13 - round 12
aese v6.16b, v27.16b //AES block 8k+14 - round 13
aese v2.16b, v27.16b //AES block 8k+10 - round 13
aese v1.16b, v27.16b //AES block 8k+9 - round 13
aese v5.16b, v27.16b //AES block 8k+13 - round 13
.inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low
add v30.4s, v30.4s, v31.4s //CTR block 8k+15
aese v7.16b, v27.16b //AES block 8k+15 - round 13
aese v0.16b, v27.16b //AES block 8k+8 - round 13
.L256_dec_tail: //TAIL
ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag
sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
cmp x5, #112
ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext
ldp q24, q25, [x6, #160] //load h8k | h7k
mov v29.16b, v28.16b
ldp q20, q21, [x6, #96] //load h5l | h5h
.inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result
ldp q22, q23, [x6, #128] //load h6l | h6h
b.gt .L256_dec_blocks_more_than_7
mov v7.16b, v6.16b
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
movi v19.8b, #0
movi v17.8b, #0
movi v18.8b, #0
mov v3.16b, v2.16b
cmp x5, #96
mov v2.16b, v1.16b
b.gt .L256_dec_blocks_more_than_6
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
cmp x5, #80
sub v30.4s, v30.4s, v31.4s
mov v4.16b, v3.16b
mov v3.16b, v1.16b
b.gt .L256_dec_blocks_more_than_5
cmp x5, #64
mov v7.16b, v6.16b
sub v30.4s, v30.4s, v31.4s
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v1.16b
b.gt .L256_dec_blocks_more_than_4
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
cmp x5, #48
mov v6.16b, v5.16b
mov v5.16b, v1.16b
b.gt .L256_dec_blocks_more_than_3
ldr q24, [x6, #64] //load h4k | h3k
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v6.16b
cmp x5, #32
mov v6.16b, v1.16b
b.gt .L256_dec_blocks_more_than_2
sub v30.4s, v30.4s, v31.4s
mov v7.16b, v1.16b
cmp x5, #16
b.gt .L256_dec_blocks_more_than_1
sub v30.4s, v30.4s, v31.4s
ldr q21, [x6, #16] //load h2k | h1k
b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_7: //blocks left > 7
rev64 v8.16b, v9.16b //GHASH final-7 block
ldr q9, [x0], #16 //AES final-6 block - load ciphertext
st1 { v12.16b}, [x2], #16 //AES final-7 block - store result
ins v18.d[0], v24.d[1] //GHASH final-7 block - mid
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-7 block - mid
.inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result
pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid
movi v16.8b, #0 //supress further partial tag feed in
pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low
pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid
.L256_dec_blocks_more_than_6: //blocks left > 6
rev64 v8.16b, v9.16b //GHASH final-6 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-5 block - load ciphertext
movi v16.8b, #0 //supress further partial tag feed in
ins v27.d[0], v8.d[1] //GHASH final-6 block - mid
st1 { v12.16b}, [x2], #16 //AES final-6 block - store result
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high
pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low
.inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result
eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low
eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid
pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid
eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high
.L256_dec_blocks_more_than_5: //blocks left > 5
rev64 v8.16b, v9.16b //GHASH final-5 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high
ins v27.d[0], v8.d[1] //GHASH final-5 block - mid
ldr q9, [x0], #16 //AES final-4 block - load ciphertext
eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid
st1 { v12.16b}, [x2], #16 //AES final-5 block - store result
pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low
ins v27.d[1], v27.d[0] //GHASH final-5 block - mid
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high
.inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result
eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low
eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid
movi v16.8b, #0 //supress further partial tag feed in
.L256_dec_blocks_more_than_4: //blocks left > 4
rev64 v8.16b, v9.16b //GHASH final-4 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-4 block - mid
ldr q9, [x0], #16 //AES final-3 block - load ciphertext
movi v16.8b, #0 //supress further partial tag feed in
pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low
pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high
eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high
pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low
st1 { v12.16b}, [x2], #16 //AES final-4 block - store result
eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid
.inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result
.L256_dec_blocks_more_than_3: //blocks left > 3
ldr q25, [x6, #80] //load h4l | h4h
rev64 v8.16b, v9.16b //GHASH final-3 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ldr q9, [x0], #16 //AES final-2 block - load ciphertext
ldr q24, [x6, #64] //load h4k | h3k
ins v27.d[0], v8.d[1] //GHASH final-3 block - mid
st1 { v12.16b}, [x2], #16 //AES final-3 block - store result
.inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid
ins v27.d[1], v27.d[0] //GHASH final-3 block - mid
pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low
pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high
movi v16.8b, #0 //supress further partial tag feed in
pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low
eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid
.L256_dec_blocks_more_than_2: //blocks left > 2
rev64 v8.16b, v9.16b //GHASH final-2 block
ldr q23, [x6, #48] //load h3l | h3h
ldr q9, [x0], #16 //AES final-1 block - load ciphertext
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-2 block - mid
pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low
st1 { v12.16b}, [x2], #16 //AES final-2 block - store result
.inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result
eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low
movi v16.8b, #0 //supress further partial tag feed in
pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid
pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid
eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high
.L256_dec_blocks_more_than_1: //blocks left > 1
rev64 v8.16b, v9.16b //GHASH final-1 block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v27.d[0], v8.d[1] //GHASH final-1 block - mid
ldr q22, [x6, #32] //load h2l | h2h
eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid
ldr q9, [x0], #16 //AES final block - load ciphertext
st1 { v12.16b}, [x2], #16 //AES final-1 block - store result
ldr q21, [x6, #16] //load h2k | h1k
pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low
ins v27.d[1], v27.d[0] //GHASH final-1 block - mid
eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low
.inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result
pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high
pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid
movi v16.8b, #0 //supress further partial tag feed in
eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high
eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid
.L256_dec_blocks_less_than_1: //blocks left <= 1
ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
mvn x7, xzr //temp0_x = 0xffffffffffffffff
and x1, x1, #127 //bit_length %= 128
sub x1, x1, #128 //bit_length -= 128
rev32 v30.16b, v30.16b
str q30, [x16] //store the updated counter
neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
and x1, x1, #127 //bit_length %= 128
lsr x7, x7, x1 //temp0_x is mask for top 64b of last block
cmp x1, #64
mvn x8, xzr //temp1_x = 0xffffffffffffffff
csel x14, x7, xzr, lt
csel x13, x8, x7, lt
mov v0.d[0], x13 //ctr0b is mask for last block
mov v0.d[1], x14
and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits
ldr q20, [x6] //load h1l | h1h
bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing
rev64 v8.16b, v9.16b //GHASH final block
eor v8.16b, v8.16b, v16.16b //feed in partial tag
ins v16.d[0], v8.d[1] //GHASH final block - mid
pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high
eor v16.8b, v16.8b, v8.8b //GHASH final block - mid
pmull v26.1q, v8.1d, v20.1d //GHASH final block - low
eor v17.16b, v17.16b, v28.16b //GHASH final block - high
pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid
eor v18.16b, v18.16b, v16.16b //GHASH final block - mid
ldr d16, [x10] //MODULO - load modulo constant
eor v19.16b, v19.16b, v26.16b //GHASH final block - low
pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid
eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up
ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment
st1 { v12.16b}, [x2] //store all 16B
eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up
eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid
eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid
pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low
ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment
eor v19.16b, v19.16b, v17.16b //MODULO - fold into low
eor v19.16b, v19.16b, v18.16b //MODULO - fold into low
ext v19.16b, v19.16b, v19.16b, #8
rev64 v19.16b, v19.16b
st1 { v19.16b }, [x3]
mov x0, x9
ldp d10, d11, [sp, #16]
ldp d12, d13, [sp, #32]
ldp d14, d15, [sp, #48]
ldp d8, d9, [sp], #80
ret
.L256_dec_ret:
mov w0, #0x0
ret
.size aesv8_gcm_8x_dec_256,.-aesv8_gcm_8x_dec_256
.byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)