1542 lines
103 KiB
Raku
1542 lines
103 KiB
Raku
#! /usr/bin/env perl
|
|
|
|
# Copyright (c) 2022, ARM Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# https://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
#========================================================================
|
|
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
|
|
# derived from https://github.com/ARM-software/AArch64cryptolib, original
|
|
# author Samuel Lee <Samuel.Lee@arm.com>.
|
|
#========================================================================
|
|
#
|
|
# Approach - assume we don't want to reload constants, so reserve ~half of
|
|
# vector register file for constants
|
|
#
|
|
# main loop to act on 4 16B blocks per iteration, and then do modulo of the
|
|
# accumulated intermediate hashes from the 4 blocks
|
|
#
|
|
# ____________________________________________________
|
|
# | |
|
|
# | PRE |
|
|
# |____________________________________________________|
|
|
# | | | |
|
|
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
|
|
# |________________|________________|__________________|
|
|
# | | | |
|
|
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
|
|
# |________________|____(mostly)____|__________________|
|
|
# | |
|
|
# | MODULO |
|
|
# |____________________________________________________|
|
|
#
|
|
# PRE: Ensure previous generated intermediate hash is aligned and merged with
|
|
# result for GHASH 4k+0
|
|
#
|
|
# EXT low_acc, low_acc, low_acc, #8
|
|
# EOR res_curr (4k+0), res_curr (4k+0), low_acc
|
|
#
|
|
# CTR block: Increment and byte reverse counter in scalar registers and transfer
|
|
# to SIMD registers
|
|
#
|
|
# REV ctr32, rev_ctr32
|
|
# ORR ctr64, constctr96_top32, ctr32, LSL #32
|
|
# // Keeping this in scalar registers to free up space in SIMD RF
|
|
# INS ctr_next.d[0], constctr96_bottom64
|
|
# INS ctr_next.d[1], ctr64X
|
|
# ADD rev_ctr32, #1
|
|
#
|
|
# AES block:
|
|
#
|
|
# Do AES encryption/decryption on CTR block X and EOR it with input block X.
|
|
# Take a 256-bit key below for example. Doing a small trick here of loading input
|
|
# in scalar registers, EORing with last key and then transferring. Given we are
|
|
# very constrained in our ASIMD registers this is quite important
|
|
#
|
|
# Encrypt:
|
|
# LDR input_low, [ input_ptr ], #8
|
|
# LDR input_high, [ input_ptr ], #8
|
|
# EOR input_low, k14_low
|
|
# EOR input_high, k14_high
|
|
# INS res_curr.d[0], input_low
|
|
# INS res_curr.d[1], input_high
|
|
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k13
|
|
# EOR res_curr, res_curr, ctr_curr
|
|
# ST1 { res_curr.16b }, [ output_ptr ], #16
|
|
#
|
|
# Decrypt:
|
|
# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
|
|
# AESE ctr_curr, k13
|
|
# LDR res_curr, [ input_ptr ], #16
|
|
# EOR res_curr, res_curr, ctr_curr
|
|
# MOV output_low, res_curr.d[0]
|
|
# MOV output_high, res_curr.d[1]
|
|
# EOR output_low, k14_low
|
|
# EOR output_high, k14_high
|
|
# STP output_low, output_high, [ output_ptr ], #16
|
|
#
|
|
# GHASH block X:
|
|
# Do 128b karatsuba polynomial multiplication on block. We only have
|
|
# 64b->128b polynomial multipliers, naively that means we need to do 4 64b
|
|
# multiplies to generate a 128b.
|
|
#
|
|
# multiplication:
|
|
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
|
|
# (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
|
|
#
|
|
# The idea behind Karatsuba multiplication is that we can do just 3 64b
|
|
# multiplies:
|
|
# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^
|
|
# (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^
|
|
# Pmull(Al,Bl))<<64
|
|
#
|
|
# There is some complication here because the bit order of GHASH's PMULL is
|
|
# reversed compared to elsewhere, so we are multiplying with "twisted"
|
|
# powers of H
|
|
#
|
|
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
|
|
#
|
|
# Note: For scheduling big cores we want to split the processing to happen over
|
|
# two loop iterations - otherwise the critical path latency dominates the
|
|
# performance.
|
|
#
|
|
# This has a knock on effect on register pressure, so we have to be a bit
|
|
# more clever with our temporary registers than indicated here
|
|
#
|
|
# REV64 res_curr, res_curr
|
|
# INS t_m.d[0], res_curr.d[1]
|
|
# EOR t_m.8B, t_m.8B, res_curr.8B
|
|
# PMULL2 t_h, res_curr, HX
|
|
# PMULL t_l, res_curr, HX
|
|
# PMULL t_m, t_m, HX_k
|
|
# EOR acc_h, acc_h, t_h
|
|
# EOR acc_l, acc_l, t_l
|
|
# EOR acc_m, acc_m, t_m
|
|
#
|
|
# MODULO: take the partial accumulators (~representing sum of 256b
|
|
# multiplication results), from GHASH and do modulo reduction on them
|
|
# There is some complication here because the bit order of GHASH's
|
|
# PMULL is reversed compared to elsewhere, so we are doing modulo with
|
|
# a reversed constant
|
|
#
|
|
# EOR acc_m, acc_m, acc_h
|
|
# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
|
|
# PMULL t_mod, acc_h, mod_constant
|
|
# EXT acc_h, acc_h, acc_h, #8
|
|
# EOR acc_m, acc_m, acc_h
|
|
# EOR acc_m, acc_m, t_mod
|
|
# PMULL acc_h, acc_m, mod_constant
|
|
# EXT acc_m, acc_m, acc_m, #8
|
|
# EOR acc_l, acc_l, acc_h
|
|
# EOR acc_l, acc_l, acc_m
|
|
#
|
|
# This code was then modified to merge the AES-128-GCM, AES-192-GCM, and
|
|
# AES-256-GCM implementations into a single function to reduce size. We move the
|
|
# last two round keys into consistent registers across all sizes, as they're
|
|
# treated special. Then, after rounds 0 through 8, we added some branches to
|
|
# conditionally run rounds 9-10 (AES-192 + AES-256) and 11-12 (AES-256), before
|
|
# merging back into code which finishes up the last two rounds.
|
|
#
|
|
# There is a scheduling decision to be made around how much parallel work goes
|
|
# before or after the conditional part. We attempted to preserve the original
|
|
# scheduling where possible, but it's possible other schedulings are more
|
|
# optimal with the current ordering.
|
|
|
|
# Command-line arguments, both forwarded unchanged to arm-xlate.pl:
#   $flavour - perlasm flavour string selecting the output dialect
#   $output  - path of the assembly file to generate
$flavour = shift;
$output = shift;

# Find arm-xlate.pl: first in this script's own directory, then in the shared
# perlasm directory three levels up. $dir keeps its trailing path separator.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Route everything printed to STDOUT through the translator. The translator
# and output paths are quoted so that paths containing spaces survive the
# shell, and a failure to start the translator is fatal rather than silently
# ignored. $flavour is deliberately left unquoted, as in the original command.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
|
|
|
# Begin accumulating the generated assembly text in $code. This first chunk
# opens a preprocessor guard on __ARM_MAX_ARCH__ >= 8, tells the assembler to
# accept the armv8-a crypto extension, and switches to the .text section.
# NOTE(review): the matching #endif and the statement that finally prints
# $code are not in this part of the file - confirm they exist further down.
$code=<<___;
|
|
#if __ARM_MAX_ARCH__ >= 8
|
|
|
|
.arch armv8-a+crypto
|
|
.text
|
|
___
|
|
|
|
# Registers used for the kernel arguments. Going by the C prototype documented
# above aes_gcm_enc_kernel, the arguments arrive in x0..x6 in the order
# (in, len_bits, out, Xi, ivec, key, Htable).
$input_ptr="x0"; # argument block: pointer to the input data (in)
|
|
$bit_length="x1"; # length of the input in bits (len_bits)
|
|
$output_ptr="x2"; # pointer to the output buffer (out)
|
|
$current_tag="x3"; # pointer to the current GHASH value (Xi)
|
|
$Htable="x6"; # pointer to the table of powers of H, used in place
|
|
$counter="x16"; # counter block pointer; the prologue copies ivec here from x4
|
|
$cc="x8"; # key schedule pointer; the prologue copies key here from x5
|
|
|
|
{
|
|
# ---------------------------------------------------------------------------
# Symbolic names for the registers used by the generated code. Several names
# intentionally map to the same physical register; they are distinct names for
# values whose lifetimes are not meant to overlap.
# ---------------------------------------------------------------------------

# General-purpose scratch in x4..x7. x4 and x5 arrive holding the ivec and key
# arguments; aes_gcm_enc_kernel copies them to $counter and $cc before it first
# writes $end_input_ptr / $main_end_input_ptr. $input_l0 (x6) is the same
# register as $Htable.
# NOTE(review): in the visible encrypt path all $Htable loads precede the first
# write to $input_l0 - confirm the same holds for code outside this view.
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
|
|
# Low/high 64-bit halves of input blocks 1..3, in x19..x24 (saved on the stack
# by the aes_gcm_enc_kernel prologue).
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
|
|
# Output halves for blocks 1..3 reuse exactly the same registers as the input
# halves above (x19..x24).
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
|
|
# Output halves for block 0 share x6/x7 with $input_l0/$input_h0.
my ($output_l0,$output_h0)=map("x$_",(6..7));
|
|
|
|
# rkN_l and rkN_h store the final round key, which is handled slightly
|
|
# differently because it is EORed through general-purpose registers.
|
|
# $ctr32w is the 32-bit view (w9) of $ctr32x (x9).
my $ctr32w="w9";
|
|
# x9..x15 in order: ctr32x=x9, ctr96_b64x=x10, ctr96_t32x=x11, rctr32x=x12,
# rkN_l=x13, rkN_h=x14, len=x15.
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rkN_l,$rkN_h,$len)=map("x$_",(9..15));
|
|
# 32-bit views (w11, w12) of $ctr96_t32x and $rctr32x.
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
|
|
|
|
# Value loaded from offset 240 of the key schedule and compared against 12 to
# choose between the AES-128/192/256 paths; x17 and its 32-bit view w17.
my $rounds="x17";
|
|
my $roundsw="w17";
|
|
|
|
# v0..v3 hold the four counter blocks and v4..v7 the four result blocks. The
# next four lines name the same eight registers as .16b vectors, bare vN, dN
# (low 64 bits) and qN (results only).
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
|
|
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
|
|
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
|
|
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
|
|
|
|
# GHASH accumulators (high, mid, low) in v9..v11, as .16b, bare vN and dN.
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
|
|
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
|
|
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
|
|
|
|
# Powers of H in v12..v15 (h1..h4); h12k=v16 and h34k=v17 hold the values the
# code labels "h2k | h1k" and "h4k | h3k" (used for the middle products).
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
|
|
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
|
|
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
|
|
|
|
# GHASH temporaries t0..t9. They are only names: every one of them maps onto
# v4, v5, v6 or v8, so they overlap each other, the result/ctr_t registers
# (v4..v6) and $mod_constant (v8).
#   v8: t0, t2, t6      v4: t1, t3, t4, t9      v5: t5, t7      v6: t8
my $t0="v8";
|
|
my $t0d="d8";
|
|
my $t1="v4";
|
|
my $t1d="d4";
|
|
my $t2="v8";
|
|
my $t2d="d8";
|
|
my $t3="v4";
|
|
my $t3d="d4";
|
|
my $t4="v4";
|
|
my $t4d="d4";
|
|
my $t5="v5";
|
|
my $t5d="d5";
|
|
my $t6="v8";
|
|
my $t6d="d8";
|
|
my $t7="v5";
|
|
my $t7d="d5";
|
|
my $t8="v6";
|
|
my $t8d="d6";
|
|
my $t9="v4";
|
|
my $t9d="d4";
|
|
|
|
# Staging registers for input blocks moved in from general-purpose registers.
# These are v4..v7, i.e. the same registers as $res0..$res3.
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
|
|
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
|
|
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
|
|
|
|
# Reduction constant (v8, shared with t0/t2/t6) and the reduction temporary
# (v7, shared with $res3/$ctr_t3).
my $mod_constantd="d8";
|
|
my $mod_constant="v8";
|
|
my $mod_t="v7";
|
|
|
|
# rkNm1 stores the second-to-last round key, which is handled slightly
|
|
# differently because it uses plain AESE instead of an AESE + AESMC macro-op.
|
|
# Round keys rk0..rk12 live in v18..v30 and rkNm1 in v31, named as .16b
# vectors and as qN for loads.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rkNm1)=map("v$_.16b",(18..31));
|
|
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rkNm1q)=map("q$_",(18..31));
|
|
# Alternate views of the registers that hold rk2 (v20), rk3 (v21) and rk4 (v22).
# NOTE(review): these four names are not used in the part of the file visible
# here - confirm they are still needed further down.
my $rk2q1="v20.1q";
|
|
my $rk3q1="v21.1q";
|
|
my $rk4v="v22";
|
|
my $rk4d="d22";
|
|
|
|
################################################################################
|
|
# size_t aes_gcm_enc_kernel(const uint8_t *in,
|
|
# size_t len_bits,
|
|
# uint8_t *out,
|
|
# u64 *Xi,
|
|
# uint8_t ivec[16],
|
|
# const void *key,
|
|
# const void *Htable);
|
|
#
|
|
$code.=<<___;
|
|
.global aes_gcm_enc_kernel
|
|
.type aes_gcm_enc_kernel,%function
|
|
.align 4
|
|
aes_gcm_enc_kernel:
|
|
AARCH64_SIGN_LINK_REGISTER
|
|
stp x29, x30, [sp, #-128]!
|
|
mov x29, sp
|
|
stp x19, x20, [sp, #16]
|
|
mov $counter, x4
|
|
mov $cc, x5
|
|
stp x21, x22, [sp, #32]
|
|
stp x23, x24, [sp, #48]
|
|
stp d8, d9, [sp, #64]
|
|
stp d10, d11, [sp, #80]
|
|
stp d12, d13, [sp, #96]
|
|
stp d14, d15, [sp, #112]
|
|
ldr $roundsw, [$cc, #240]
|
|
add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key
|
|
ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys
|
|
ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr
|
|
lsr $main_end_input_ptr, $bit_length, #3 // byte_len
|
|
mov $len, $main_end_input_ptr
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32
|
|
ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1
|
|
ldr $rk0q, [$cc, #0] // load rk0
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
ldr $rk7q, [$cc, #112] // load rk7
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
|
|
lsr $rctr32x, $ctr96_t32x, #32
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 2
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
|
|
rev $rctr32w, $rctr32w // rev_ctr32
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 1
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0
|
|
add $rctr32w, $rctr32w, #1 // increment rev_ctr32
|
|
rev $ctr32w, $rctr32w // CTR block 1
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 3
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1
|
|
add $rctr32w, $rctr32w, #1 // CTR block 1
|
|
ldr $rk1q, [$cc, #16] // load rk1
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 1
|
|
rev $ctr32w, $rctr32w // CTR block 2
|
|
add $rctr32w, $rctr32w, #1 // CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2
|
|
ldr $rk2q, [$cc, #32] // load rk2
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 2
|
|
rev $ctr32w, $rctr32w // CTR block 3
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 3
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0
|
|
ldr $rk3q, [$cc, #48] // load rk3
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2
|
|
ldr $rk6q, [$cc, #96] // load rk6
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0
|
|
ldr $rk5q, [$cc, #80] // load rk5
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1
|
|
ldr $h3q, [$Htable, #48] // load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1
|
|
ldr $rk4q, [$cc, #64] // load rk4
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2
|
|
ldr $h2q, [$Htable, #32] // load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1
|
|
ldr $rk12q, [$cc, #192] // load rk12
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2
|
|
ldr $h4q, [$Htable, #80] // load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3
|
|
ldr $rk11q, [$cc, #176] // load rk11
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2
|
|
ldr $rk8q, [$cc, #128] // load rk8
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3
|
|
add $rctr32w, $rctr32w, #1 // CTR block 3
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3
|
|
ld1 { $acc_lb}, [$current_tag]
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6
|
|
ldr $rk9q, [$cc, #144] // load rk9
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6
|
|
ldr $h1q, [$Htable] // load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6
|
|
ldr $rk10q, [$cc, #160] // load rk10
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8
|
|
b.lt .Lenc_finish_first_blocks // branch if AES-128
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10
|
|
b.eq .Lenc_finish_first_blocks // branch if AES-192
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12
|
|
|
|
.Lenc_finish_first_blocks:
|
|
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k
|
|
aese $ctr2b, $rkNm1 // AES block 2 - round N-1
|
|
trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h
|
|
aese $ctr1b, $rkNm1 // AES block 1 - round N-1
|
|
aese $ctr0b, $rkNm1 // AES block 0 - round N-1
|
|
aese $ctr3b, $rkNm1 // AES block 3 - round N-1
|
|
eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k
|
|
b.ge .Lenc_tail // handle tail
|
|
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 1 - load plaintext
|
|
rev $ctr32w, $rctr32w // CTR block 4
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 0 - load plaintext
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 3 - load plaintext
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 2 - load plaintext
|
|
add $input_ptr, $input_ptr, #64 // AES input_ptr update
|
|
eor $input_l1, $input_l1, $rkN_l // AES block 1 - round N low
|
|
eor $input_h1, $input_h1, $rkN_h // AES block 1 - round N high
|
|
fmov $ctr_t1d, $input_l1 // AES block 1 - mov low
|
|
eor $input_l0, $input_l0, $rkN_l // AES block 0 - round N low
|
|
eor $input_h0, $input_h0, $rkN_h // AES block 0 - round N high
|
|
eor $input_h3, $input_h3, $rkN_h // AES block 3 - round N high
|
|
fmov $ctr_t0d, $input_l0 // AES block 0 - mov low
|
|
cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks
|
|
fmov $ctr_t0.d[1], $input_h0 // AES block 0 - mov high
|
|
eor $input_l3, $input_l3, $rkN_l // AES block 3 - round N low
|
|
eor $input_l2, $input_l2, $rkN_l // AES block 2 - round N low
|
|
fmov $ctr_t1.d[1], $input_h1 // AES block 1 - mov high
|
|
fmov $ctr_t2d, $input_l2 // AES block 2 - mov low
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4
|
|
fmov $ctr_t3d, $input_l3 // AES block 3 - mov low
|
|
eor $input_h2, $input_h2, $rkN_h // AES block 2 - round N high
|
|
fmov $ctr_t2.d[1], $input_h2 // AES block 2 - mov high
|
|
eor $res0b, $ctr_t0b, $ctr0b // AES block 0 - result
|
|
fmov $ctr0d, $ctr96_b64x // CTR block 4
|
|
fmov $ctr0.d[1], $ctr32x // CTR block 4
|
|
rev $ctr32w, $rctr32w // CTR block 5
|
|
add $rctr32w, $rctr32w, #1 // CTR block 5
|
|
eor $res1b, $ctr_t1b, $ctr1b // AES block 1 - result
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 5
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 5
|
|
rev $ctr32w, $rctr32w // CTR block 6
|
|
st1 { $res0b}, [$output_ptr], #16 // AES block 0 - store result
|
|
fmov $ctr_t3.d[1], $input_h3 // AES block 3 - mov high
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6
|
|
eor $res2b, $ctr_t2b, $ctr2b // AES block 2 - result
|
|
st1 { $res1b}, [$output_ptr], #16 // AES block 1 - store result
|
|
add $rctr32w, $rctr32w, #1 // CTR block 6
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 6
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 6
|
|
st1 { $res2b}, [$output_ptr], #16 // AES block 2 - store result
|
|
rev $ctr32w, $rctr32w // CTR block 7
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 7
|
|
eor $res3b, $ctr_t3b, $ctr3b // AES block 3 - result
|
|
st1 { $res3b}, [$output_ptr], #16 // AES block 3 - store result
|
|
b.ge .Lenc_prepretail // do prepretail
|
|
|
|
.Lenc_main_loop: // main loop start
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0
|
|
rev64 $res0b, $res0b // GHASH block 4k (only t0 is free)
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 4k+3
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 4k+3
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1
|
|
ldp $input_l3, $input_h3, [$input_ptr, #48] // AES block 4k+7 - load plaintext
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1
|
|
ldp $input_l2, $input_h2, [$input_ptr, #32] // AES block 4k+6 - load plaintext
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2
|
|
eor $res0b, $res0b, $acc_lb // PRE 1
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0
|
|
eor $input_l3, $input_l3, $rkN_l // AES block 4k+7 - round N low
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3
|
|
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high
|
|
eor $input_h2, $input_h2, $rkN_h // AES block 4k+6 - round N high
|
|
mov $t0d, $res0.d[1] // GHASH block 4k - mid
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1
|
|
rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free)
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5
|
|
rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid
|
|
rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free)
|
|
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
|
|
eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high
|
|
mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2
|
|
eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4
|
|
mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3
|
|
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6
|
|
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4
|
|
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5
|
|
ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6
|
|
eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
|
|
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7
|
|
pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
|
|
eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6
|
|
ldp $input_l1, $input_h1, [$input_ptr, #16] // AES block 4k+5 - load plaintext
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8
|
|
mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6
|
|
eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
|
|
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7
|
|
eor $input_l1, $input_l1, $rkN_l // AES block 4k+5 - round N low
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8
|
|
eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7
|
|
eor $input_l2, $input_l2, $rkN_l // AES block 4k+6 - round N low
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8
|
|
movi $mod_constant.8b, #0xc2
|
|
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid
|
|
eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
fmov $ctr_t1d, $input_l1 // AES block 4k+5 - mov low
|
|
ldp $input_l0, $input_h0, [$input_ptr, #0] // AES block 4k+4 - load plaintext
|
|
b.lt .Lenc_main_loop_continue // branch if AES-128
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10
|
|
b.eq .Lenc_main_loop_continue // branch if AES-192
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12
|
|
|
|
.Lenc_main_loop_continue:
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant
|
|
eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low
|
|
eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+3
|
|
eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up
|
|
add $input_ptr, $input_ptr, #64 // AES input_ptr update
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
|
|
rev $ctr32w, $rctr32w // CTR block 4k+8
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low
|
|
eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up
|
|
eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high
|
|
fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8
|
|
eor $mod_t.16b, $acc_hb, $mod_t.16b // MODULO - fold into mid
|
|
eor $input_h1, $input_h1, $rkN_h // AES block 4k+5 - round N high
|
|
eor $input_h3, $input_h3, $rkN_h // AES block 4k+7 - round N high
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+8
|
|
aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1
|
|
fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high
|
|
eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid
|
|
fmov $ctr_t3d, $input_l3 // AES block 4k+7 - mov low
|
|
aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1
|
|
fmov $ctr_t1.d[1], $input_h1 // AES block 4k+5 - mov high
|
|
fmov $ctr_t2d, $input_l2 // AES block 4k+6 - mov low
|
|
cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL
|
|
fmov $ctr_t2.d[1], $input_h2 // AES block 4k+6 - mov high
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
eor $res0b, $ctr_t0b, $ctr0b // AES block 4k+4 - result
|
|
fmov $ctr0d, $ctr96_b64x // CTR block 4k+8
|
|
fmov $ctr0.d[1], $ctr32x // CTR block 4k+8
|
|
rev $ctr32w, $rctr32w // CTR block 4k+9
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+9
|
|
eor $res1b, $ctr_t1b, $ctr1b // AES block 4k+5 - result
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 4k+9
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 4k+9
|
|
aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1
|
|
rev $ctr32w, $rctr32w // CTR block 4k+10
|
|
st1 { $res0b}, [$output_ptr], #16 // AES block 4k+4 - store result
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10
|
|
eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low
|
|
fmov $ctr_t3.d[1], $input_h3 // AES block 4k+7 - mov high
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
st1 { $res1b}, [$output_ptr], #16 // AES block 4k+5 - store result
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+10
|
|
aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1
|
|
eor $res2b, $ctr_t2b, $ctr2b // AES block 4k+6 - result
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 4k+10
|
|
st1 { $res2b}, [$output_ptr], #16 // AES block 4k+6 - store result
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 4k+10
|
|
rev $ctr32w, $rctr32w // CTR block 4k+11
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+11
|
|
eor $res3b, $ctr_t3b, $ctr3b // AES block 4k+7 - result
|
|
st1 { $res3b}, [$output_ptr], #16 // AES block 4k+7 - store result
|
|
b.lt .Lenc_main_loop
|
|
|
|
.Lenc_prepretail: // PREPRETAIL - GHASH the four blocks held in res0-res3 while running the AES rounds before round N-1 on ctr0-ctr3 (no input is loaded and no output is stored in this section)
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0
|
|
rev64 $res2b, $res2b // GHASH block 4k+2 (t0, t1, and t2 free)
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 4k+3
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0
|
|
rev64 $res0b, $res0b // GHASH block 4k (only t0 is free)
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 4k+3
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1
|
|
eor $res0b, $res0b, $acc_lb // PRE 1
|
|
rev64 $res1b, $res1b // GHASH block 4k+1 (t0 and t1 free)
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0
|
|
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] // GHASH block 4k - mid
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
|
|
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2
|
|
eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high
|
|
mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3
|
|
eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3
|
|
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid
|
|
mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4
|
|
rev64 $res3b, $res3b // GHASH block 4k+3 (t0, t1, t2 and t3 free)
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4
|
|
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid
|
|
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+3
|
|
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4
|
|
eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
|
|
eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low
|
|
ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5
|
|
eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid
|
|
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5
|
|
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid
|
|
eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6
|
|
movi $mod_constant.8b, #0xc2 // mod_constant - 0xc2 in each of the low 8 bytes (shifted into place below)
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7
|
|
eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant = 0xc200000000000000 in the low 64b lane
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8
|
|
eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid
|
|
pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8
|
|
eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7
|
|
eor $acc_mb, $acc_mb, $acc_hb // karatsuba tidy up
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8
|
|
pmull $t1.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid (t1 is consumed after the key-size branches)
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $acc_mb, $acc_mb, $acc_lb // MODULO - karatsuba tidy up
|
|
b.lt .Lenc_finish_prepretail // branch if AES-128
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10
|
|
b.eq .Lenc_finish_prepretail // branch if AES-192
|
|
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12
|
|
|
|
.Lenc_finish_prepretail: // finish the GHASH reduction begun before the key-size branches (t1 holds the acc_h product) and apply AES round N-1 to ctr0-ctr3
|
|
eor $acc_mb, $acc_mb, $t1.16b // MODULO - fold into mid
|
|
eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid
|
|
pmull $t1.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1
|
|
eor $acc_lb, $acc_lb, $t1.16b // MODULO - fold into low
|
|
aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1
|
|
aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1
|
|
aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
|
|
.Lenc_tail: // TAIL - encrypt the first remaining block with ctr0, then dispatch on the bytes left to the 4, 3, 2 or 1 block path
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES block 4k+4 - load plaintext
|
|
eor $input_l0, $input_l0, $rkN_l // AES block 4k+4 - round N low
|
|
eor $input_h0, $input_h0, $rkN_h // AES block 4k+4 - round N high
|
|
cmp $main_end_input_ptr, #48 // more than 3 blocks (48 bytes) left?
|
|
fmov $ctr_t0d, $input_l0 // AES block 4k+4 - mov low
|
|
fmov $ctr_t0.d[1], $input_h0 // AES block 4k+4 - mov high
|
|
eor $res1b, $ctr_t0b, $ctr0b // AES block 4k+4 - result
|
|
b.gt .Lenc_blocks_more_than_3 // yes - take the full 4 block path
|
|
cmp $main_end_input_ptr, #32 // more than 2 blocks (32 bytes) left?
|
|
mov $ctr3b, $ctr2b // at most 3 blocks - shift keystream down one slot (ctr3 = ctr2) so the later paths use consecutive blocks
|
|
movi $acc_l.8b, #0 // zero GHASH low accumulator (the final-3 path that would initialise it is skipped)
|
|
movi $acc_h.8b, #0 // zero GHASH high accumulator
|
|
sub $rctr32w, $rctr32w, #1 // one counter block goes unused - step rev_ctr32 back by one
|
|
mov $ctr2b, $ctr1b // keystream shift continued (ctr2 = ctr1)
|
|
movi $acc_m.8b, #0 // zero GHASH mid accumulator
|
|
b.gt .Lenc_blocks_more_than_2 // 3 blocks left
|
|
mov $ctr3b, $ctr1b // at most 2 blocks - the final block path reads ctr3, so give it ctr1
|
|
sub $rctr32w, $rctr32w, #1 // a second counter block goes unused
|
|
cmp $main_end_input_ptr, #16 // more than 1 block (16 bytes) left?
|
|
b.gt .Lenc_blocks_more_than_1 // 2 blocks left
|
|
sub $rctr32w, $rctr32w, #1 // a third counter block goes unused
|
|
b .Lenc_blocks_less_than_1 // single (possibly partial) block - res1 already holds its ciphertext
|
|
.Lenc_blocks_more_than_3: // blocks left > 3 - store and hash the final-3 block, then form the final-2 ciphertext in res1 (rk4d/rk4v serve as GHASH scratch here)
|
|
st1 { $res1b}, [$output_ptr], #16 // AES final-3 block - store result
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-2 block - load input low & high
|
|
rev64 $res0b, $res1b // GHASH final-3 block
|
|
eor $input_l0, $input_l0, $rkN_l // AES final-2 block - round N low
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
eor $input_h0, $input_h0, $rkN_h // AES final-2 block - round N high
|
|
mov $rk4d, $res0.d[1] // GHASH final-3 block - mid
|
|
fmov $res1d, $input_l0 // AES final-2 block - mov low
|
|
fmov $res1.d[1], $input_h0 // AES final-2 block - mov high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low (overwrites acc_l)
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high (overwrites acc_h)
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid (overwrites acc_m)
|
|
eor $res1b, $res1b, $ctr1b // AES final-2 block - result
|
|
.Lenc_blocks_more_than_2: // blocks left > 2 - store and hash the final-2 block, then form the final-1 ciphertext in res1 (rk2q1, rk3q1 and rk4v receive the GHASH partial products)
|
|
st1 { $res1b}, [$output_ptr], #16 // AES final-2 block - store result
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final-1 block - load input low & high
|
|
rev64 $res0b, $res1b // GHASH final-2 block
|
|
eor $input_l0, $input_l0, $rkN_l // AES final-1 block - round N low
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
fmov $res1d, $input_l0 // AES final-1 block - mov low
|
|
eor $input_h0, $input_h0, $rkN_h // AES final-1 block - round N high
|
|
fmov $res1.d[1], $input_h0 // AES final-1 block - mov high
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high
|
|
mov $rk4d, $res0.d[1] // GHASH final-2 block - mid
|
|
pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid
|
|
eor $res1b, $res1b, $ctr2b // AES final-1 block - result
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high (accumulate)
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low (accumulate)
|
|
eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid (accumulate)
|
|
.Lenc_blocks_more_than_1: // blocks left > 1 - store and hash the final-1 block, then form the final block ciphertext in res1 using ctr3
|
|
st1 { $res1b}, [$output_ptr], #16 // AES final-1 block - store result
|
|
rev64 $res0b, $res1b // GHASH final-1 block
|
|
ldp $input_l0, $input_h0, [$input_ptr], #16 // AES final block - load input low & high
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
eor $input_l0, $input_l0, $rkN_l // AES final block - round N low
|
|
mov $rk4d, $res0.d[1] // GHASH final-1 block - mid
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high
|
|
eor $input_h0, $input_h0, $rkN_h // AES final block - round N high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high (accumulate)
|
|
ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid (copy to upper lane for the pmull2 below)
|
|
fmov $res1d, $input_l0 // AES final block - mov low
|
|
fmov $res1.d[1], $input_h0 // AES final block - mov high
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid
|
|
pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low
|
|
eor $res1b, $res1b, $ctr3b // AES final block - result
|
|
eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid (accumulate)
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low (accumulate)
|
|
.Lenc_blocks_less_than_1: // blocks left <= 1 - mask, hash and store the last (possibly partial) block, reduce the GHASH accumulators, write back counter and tag, then return
|
|
and $bit_length, $bit_length, #127 // bit_length %= 128
|
|
mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff
|
|
sub $bit_length, $bit_length, #128 // bit_length -= 128
|
|
neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128])
|
|
ld1 { $rk0}, [$output_ptr] // load existing bytes where the possibly partial last block is to be stored
|
|
mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff
|
|
and $bit_length, $bit_length, #127 // bit_length %= 128
|
|
lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
csel $input_l0, $rkN_l, $rkN_h, lt // low 64b of mask - all ones if bit_length < 64, otherwise the shifted mask
|
|
csel $input_h0, $rkN_h, xzr, lt // high 64b of mask - the shifted mask if bit_length < 64, otherwise zero
|
|
fmov $ctr0d, $input_l0 // ctr0b is mask for last block
|
|
fmov $ctr0.d[1], $input_h0 // upper half of the mask
|
|
and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits
|
|
rev64 $res0b, $res1b // GHASH final block
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
bif $res1b, $rk0, $ctr0b // insert existing bytes in top end of result before storing
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high
|
|
mov $t0d, $res0.d[1] // GHASH final block - mid
|
|
rev $ctr32w, $rctr32w // byte-reverse rev_ctr32 ahead of the counter store below
|
|
pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid
|
|
pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low
|
|
eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2 // mod_constant - 0xc2 in each of the low 8 bytes (shifted into place below)
|
|
eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant
|
|
eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid
|
|
eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid
|
|
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
str $ctr32w, [$counter, #12] // store the updated counter
|
|
st1 { $res1b}, [$output_ptr] // store all 16B
|
|
eor $acc_lb, $acc_lb, $acc_hb // MODULO - fold into low
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // swap the 64b halves of the reduced hash
|
|
rev64 $acc_lb, $acc_lb // byte-reverse each half - with the ext above this reverses all 16 bytes before the store
|
|
mov x0, $len // return value in x0 = len
|
|
st1 { $acc_l.16b }, [$current_tag] // write the updated hash back through current_tag
|
|
ldp x19, x20, [sp, #16] // restore callee-saved x19, x20
|
|
ldp x21, x22, [sp, #32] // restore callee-saved x21, x22
|
|
ldp x23, x24, [sp, #48] // restore callee-saved x23, x24
|
|
ldp d8, d9, [sp, #64] // restore callee-saved d8, d9
|
|
ldp d10, d11, [sp, #80] // restore callee-saved d10, d11
|
|
ldp d12, d13, [sp, #96] // restore callee-saved d12, d13
|
|
ldp d14, d15, [sp, #112] // restore callee-saved d14, d15
|
|
ldp x29, x30, [sp], #128 // restore frame pointer and link register, release the 128-byte frame
|
|
AARCH64_VALIDATE_LINK_REGISTER
|
|
ret
|
|
.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel
|
|
___
|
|
|
|
{
|
|
my $t8="v4";
|
|
my $t8d="d4";
|
|
my $t9="v6";
|
|
my $t9d="d6";
|
|
################################################################################
|
|
# size_t aes_gcm_dec_kernel(const uint8_t *in,
|
|
# size_t len_bits,
|
|
# uint8_t *out,
|
|
# u64 *Xi,
|
|
# uint8_t ivec[16],
|
|
# const void *key);
|
|
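# NOTE(review): the body below also reads hash-key powers through $Htable, which this prototype does not list - presumably a further argument, confirm against the C declaration.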
|
|
$code.=<<___;
|
|
.global aes_gcm_dec_kernel
|
|
.type aes_gcm_dec_kernel,%function
|
|
.align 4
|
|
aes_gcm_dec_kernel: // entry - save registers, load round keys, counter, hash keys and current tag, build counter blocks 0-3 and run every AES round before round N-1 on them
|
|
AARCH64_SIGN_LINK_REGISTER
|
|
stp x29, x30, [sp, #-128]! // allocate a 128-byte frame and save frame pointer and link register
|
|
mov x29, sp // establish the frame pointer
|
|
stp x19, x20, [sp, #16] // save callee-saved x19, x20
|
|
mov $counter, x4 // counter = ivec pointer (5th argument)
|
|
mov $cc, x5 // cc = key pointer (6th argument)
|
|
stp x21, x22, [sp, #32] // save callee-saved x21, x22
|
|
stp x23, x24, [sp, #48] // save callee-saved x23, x24
|
|
stp d8, d9, [sp, #64] // save callee-saved d8, d9
|
|
stp d10, d11, [sp, #80] // save callee-saved d10, d11
|
|
stp d12, d13, [sp, #96] // save callee-saved d12, d13
|
|
stp d14, d15, [sp, #112] // save callee-saved d14, d15
|
|
ldr $roundsw, [$cc, #240] // rounds = 32b value at byte offset 240 of the key structure
|
|
add $input_l1, $cc, $rounds, lsl #4 // borrow input_l1 for last key
|
|
ldp $rkN_l, $rkN_h, [$input_l1] // load round N keys
|
|
ldr $rkNm1q, [$input_l1, #-16] // load round N-1 keys
|
|
lsr $main_end_input_ptr, $bit_length, #3 // byte_len
|
|
mov $len, $main_end_input_ptr // len = byte_len
|
|
ldp $ctr96_b64x, $ctr96_t32x, [$counter] // ctr96_b64, ctr96_t32
|
|
ldr $rk8q, [$cc, #128] // load rk8
|
|
sub $main_end_input_ptr, $main_end_input_ptr, #1 // byte_len - 1
|
|
ldr $rk7q, [$cc, #112] // load rk7
|
|
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
|
|
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 // end_input_ptr
|
|
ldr $rk6q, [$cc, #96] // load rk6
|
|
lsr $rctr32x, $ctr96_t32x, #32 // rctr32 = counter bytes 12-15 (byte-reversed just below)
|
|
ldr $rk5q, [$cc, #80] // load rk5
|
|
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w // 32b write clears the upper 32b, leaving only counter bytes 8-11
|
|
ldr $rk3q, [$cc, #48] // load rk3
|
|
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr // main_end_input_ptr = input_ptr + main loop byte count
|
|
rev $rctr32w, $rctr32w // rev_ctr32
|
|
add $rctr32w, $rctr32w, #1 // increment rev_ctr32
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 3
|
|
rev $ctr32w, $rctr32w // CTR block 1
|
|
add $rctr32w, $rctr32w, #1 // CTR block 1
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 1
|
|
ld1 { $ctr0b}, [$counter] // special case vector load initial counter so we can start first AES block as quickly as possible
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 1
|
|
rev $ctr32w, $rctr32w // CTR block 2
|
|
add $rctr32w, $rctr32w, #1 // CTR block 2
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 2
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 2
|
|
rev $ctr32w, $rctr32w // CTR block 3
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 3
|
|
ldr $rk0q, [$cc, #0] // load rk0
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 3
|
|
add $rctr32w, $rctr32w, #1 // CTR block 3
|
|
ldr $rk4q, [$cc, #64] // load rk4
|
|
ldr $rk1q, [$cc, #16] // load rk1
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 0
|
|
ldr $h3q, [$Htable, #48] // load h3l | h3h
|
|
ext $h3b, $h3b, $h3b, #8 // swap the 64b halves of h3
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 0
|
|
ldr $h4q, [$Htable, #80] // load h4l | h4h
|
|
ext $h4b, $h4b, $h4b, #8 // swap the 64b halves of h4
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 0
|
|
ldr $h2q, [$Htable, #32] // load h2l | h2h
|
|
ext $h2b, $h2b, $h2b, #8 // swap the 64b halves of h2
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 0
|
|
ldr $rk2q, [$cc, #32] // load rk2
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 1
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 1
|
|
ld1 { $acc_lb}, [$current_tag] // load the current tag into the low GHASH accumulator
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // swap the 64b halves of the tag
|
|
rev64 $acc_lb, $acc_lb // byte-reverse each half - with the ext above this reverses all 16 bytes
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 1
|
|
ldr $rk9q, [$cc, #144] // load rk9
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 1
|
|
ldr $rk12q, [$cc, #192] // load rk12
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 2
|
|
ldr $h1q, [$Htable] // load h1l | h1h
|
|
ext $h1b, $h1b, $h1b, #8 // swap the 64b halves of h1
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 2
|
|
ldr $rk10q, [$cc, #160] // load rk10
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 2
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 3
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 2
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 3
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 4
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 3
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 3
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 4
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 4
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 4
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 5
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 5
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 5
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 5
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 6
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 6
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 6
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 6
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 7
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 7
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 7
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 8
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 7
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 8
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 8
|
|
ldr $rk11q, [$cc, #176] // load rk11
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 8
|
|
b.lt .Ldec_finish_first_blocks // branch if AES-128
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 9
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 9
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 10
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 10
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 10
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 10
|
|
b.eq .Ldec_finish_first_blocks // branch if AES-192
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 11
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 11
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 11
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 11
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 1 - round 12
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 0 - round 12
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 2 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 3 - round 12
|
|
|
|
.Ldec_finish_first_blocks: // apply AES round N-1 to blocks 0-3 and derive h12k and h34k, then go to the tail if no main loop bytes exist, otherwise decrypt and store blocks 0 and 1 and prepare the next counter blocks
|
|
cmp $input_ptr, $main_end_input_ptr // check if we have <= 4 blocks
|
|
trn1 $acc_h.2d, $h3.2d, $h4.2d // h4h | h3h
|
|
trn2 $h34k.2d, $h3.2d, $h4.2d // h4l | h3l
|
|
trn1 $t0.2d, $h1.2d, $h2.2d // h2h | h1h
|
|
trn2 $h12k.2d, $h1.2d, $h2.2d // h2l | h1l
|
|
eor $h34k.16b, $h34k.16b, $acc_h.16b // h4k | h3k
|
|
aese $ctr1b, $rkNm1 // AES block 1 - round N-1
|
|
aese $ctr2b, $rkNm1 // AES block 2 - round N-1
|
|
eor $h12k.16b, $h12k.16b, $t0.16b // h2k | h1k
|
|
aese $ctr3b, $rkNm1 // AES block 3 - round N-1
|
|
aese $ctr0b, $rkNm1 // AES block 0 - round N-1
|
|
b.ge .Ldec_tail // handle tail
|
|
|
|
ldr $res0q, [$input_ptr, #0] // AES block 0 - load ciphertext
|
|
ldr $res1q, [$input_ptr, #16] // AES block 1 - load ciphertext
|
|
rev $ctr32w, $rctr32w // CTR block 4
|
|
eor $ctr0b, $res0b, $ctr0b // AES block 0 - result
|
|
eor $ctr1b, $res1b, $ctr1b // AES block 1 - result
|
|
rev64 $res1b, $res1b // GHASH block 1
|
|
ldr $res3q, [$input_ptr, #48] // AES block 3 - load ciphertext
|
|
mov $output_h0, $ctr0.d[1] // AES block 0 - mov high
|
|
mov $output_l0, $ctr0.d[0] // AES block 0 - mov low
|
|
rev64 $res0b, $res0b // GHASH block 0
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4
|
|
fmov $ctr0d, $ctr96_b64x // CTR block 4
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4
|
|
fmov $ctr0.d[1], $ctr32x // CTR block 4
|
|
rev $ctr32w, $rctr32w // CTR block 5
|
|
add $rctr32w, $rctr32w, #1 // CTR block 5
|
|
mov $output_l1, $ctr1.d[0] // AES block 1 - mov low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 5
|
|
mov $output_h1, $ctr1.d[1] // AES block 1 - mov high
|
|
eor $output_h0, $output_h0, $rkN_h // AES block 0 - round N high
|
|
eor $output_l0, $output_l0, $rkN_l // AES block 0 - round N low
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 0 - store result
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 5
|
|
ldr $res2q, [$input_ptr, #32] // AES block 2 - load ciphertext
|
|
add $input_ptr, $input_ptr, #64 // AES input_ptr update
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 5
|
|
rev $ctr32w, $rctr32w // CTR block 6
|
|
add $rctr32w, $rctr32w, #1 // CTR block 6
|
|
eor $output_l1, $output_l1, $rkN_l // AES block 1 - round N low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 6
|
|
eor $output_h1, $output_h1, $rkN_h // AES block 1 - round N high
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 1 - store result
|
|
eor $ctr2b, $res2b, $ctr2b // AES block 2 - result
|
|
cmp $input_ptr, $main_end_input_ptr // check if we have <= 8 blocks
|
|
b.ge .Ldec_prepretail // do prepretail
|
|
|
|
.Ldec_main_loop: // main loop start
|
|
mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0
|
|
eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0
|
|
mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 4k+6
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 4k+6
|
|
eor $res0b, $res0b, $acc_lb // PRE 1
|
|
rev $ctr32w, $rctr32w // CTR block 4k+7
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1
|
|
mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1
|
|
mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high
|
|
mov $t0d, $res0.d[1] // GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 4k+7
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 4k+7
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3
|
|
eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1
|
|
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3
|
|
rev64 $res2b, $res2b // GHASH block 4k+2
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0
|
|
eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3
|
|
rev64 $res3b, $res3b // GHASH block 4k+3
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid
|
|
eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low
|
|
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
|
|
eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high
|
|
eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1
|
|
mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4
|
|
eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+7
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2
|
|
mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4
|
|
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid
|
|
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3
|
|
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5
|
|
eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low
|
|
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid
|
|
rev $ctr32w, $rctr32w // CTR block 4k+8
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6
|
|
ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+8
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7
|
|
eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
|
|
mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8
|
|
eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6
|
|
pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+8
|
|
eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6
|
|
eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high
|
|
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7
|
|
eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8
|
|
eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8
|
|
b.lt .Ldec_main_loop_continue // branch if AES-128
|
|
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10
|
|
b.eq .Ldec_main_loop_continue // branch if AES-192
|
|
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12
|
|
|
|
.Ldec_main_loop_continue:
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
|
|
eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up
|
|
ldr $res0q, [$input_ptr, #0] // AES block 4k+4 - load ciphertext
|
|
aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up
|
|
ldr $res1q, [$input_ptr, #16] // AES block 4k+5 - load ciphertext
|
|
eor $ctr0b, $res0b, $ctr0b // AES block 4k+4 - result
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result
|
|
eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid
|
|
ldr $res3q, [$input_ptr, #48] // AES block 4k+7 - load ciphertext
|
|
ldr $res2q, [$input_ptr, #32] // AES block 4k+6 - load ciphertext
|
|
mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high
|
|
eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid
|
|
aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1
|
|
add $input_ptr, $input_ptr, #64 // AES input_ptr update
|
|
mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low
|
|
fmov $ctr0d, $ctr96_b64x // CTR block 4k+8
|
|
fmov $ctr0.d[1], $ctr32x // CTR block 4k+8
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
eor $ctr1b, $res1b, $ctr1b // AES block 4k+5 - result
|
|
rev $ctr32w, $rctr32w // CTR block 4k+9
|
|
aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+9
|
|
cmp $input_ptr, $main_end_input_ptr // LOOP CONTROL
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+9
|
|
eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low
|
|
eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high
|
|
mov $output_h1, $ctr1.d[1] // AES block 4k+5 - mov high
|
|
eor $ctr2b, $res2b, $ctr2b // AES block 4k+6 - result
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low
|
|
mov $output_l1, $ctr1.d[0] // AES block 4k+5 - mov low
|
|
fmov $ctr1d, $ctr96_b64x // CTR block 4k+9
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
fmov $ctr1.d[1], $ctr32x // CTR block 4k+9
|
|
rev $ctr32w, $rctr32w // CTR block 4k+10
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+10
|
|
aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+10
|
|
rev64 $res1b, $res1b // GHASH block 4k+5
|
|
eor $output_h1, $output_h1, $rkN_h // AES block 4k+5 - round N high
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 // AES block 4k+4 - store result
|
|
eor $output_l1, $output_l1, $rkN_l // AES block 4k+5 - round N low
|
|
stp $output_l1, $output_h1, [$output_ptr], #16 // AES block 4k+5 - store result
|
|
rev64 $res0b, $res0b // GHASH block 4k+4
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
b.lt .Ldec_main_loop
|
|
|
|
.Ldec_prepretail: // PREPRETAIL
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8 // PRE 0
|
|
mov $output_l2, $ctr2.d[0] // AES block 4k+2 - mov low
|
|
eor $ctr3b, $res3b, $ctr3b // AES block 4k+3 - result
|
|
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 0
|
|
mov $output_h2, $ctr2.d[1] // AES block 4k+2 - mov high
|
|
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 0
|
|
fmov $ctr2d, $ctr96_b64x // CTR block 4k+6
|
|
fmov $ctr2.d[1], $ctr32x // CTR block 4k+6
|
|
rev $ctr32w, $rctr32w // CTR block 4k+7
|
|
eor $res0b, $res0b, $acc_lb // PRE 1
|
|
rev64 $res2b, $res2b // GHASH block 4k+2
|
|
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 // CTR block 4k+7
|
|
mov $output_l3, $ctr3.d[0] // AES block 4k+3 - mov low
|
|
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 1
|
|
mov $output_h3, $ctr3.d[1] // AES block 4k+3 - mov high
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH block 4k - low
|
|
mov $t0d, $res0.d[1] // GHASH block 4k - mid
|
|
fmov $ctr3d, $ctr96_b64x // CTR block 4k+7
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH block 4k - high
|
|
fmov $ctr3.d[1], $ctr32x // CTR block 4k+7
|
|
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 0
|
|
mov $acc_md, $h34k.d[1] // GHASH block 4k - mid
|
|
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 1
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH block 4k - mid
|
|
pmull2 $t1.1q, $res1.2d, $h3.2d // GHASH block 4k+1 - high
|
|
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 1
|
|
rev64 $res3b, $res3b // GHASH block 4k+3
|
|
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 0
|
|
pmull $acc_m.1q, $t0.1d, $acc_m.1d // GHASH block 4k - mid
|
|
eor $acc_hb, $acc_hb, $t1.16b // GHASH block 4k+1 - high
|
|
pmull $t2.1q, $res1.1d, $h3.1d // GHASH block 4k+1 - low
|
|
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 1
|
|
mov $t3d, $res1.d[1] // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 2
|
|
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 2
|
|
eor $acc_lb, $acc_lb, $t2.16b // GHASH block 4k+1 - low
|
|
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 2
|
|
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 3
|
|
mov $t6d, $res2.d[1] // GHASH block 4k+2 - mid
|
|
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 2
|
|
eor $t3.8b, $t3.8b, $res1.8b // GHASH block 4k+1 - mid
|
|
pmull $t5.1q, $res2.1d, $h2.1d // GHASH block 4k+2 - low
|
|
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 4
|
|
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 3
|
|
eor $t6.8b, $t6.8b, $res2.8b // GHASH block 4k+2 - mid
|
|
pmull $t3.1q, $t3.1d, $h34k.1d // GHASH block 4k+1 - mid
|
|
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 5
|
|
eor $acc_lb, $acc_lb, $t5.16b // GHASH block 4k+2 - low
|
|
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 4
|
|
pmull2 $t7.1q, $res3.2d, $h1.2d // GHASH block 4k+3 - high
|
|
eor $acc_mb, $acc_mb, $t3.16b // GHASH block 4k+1 - mid
|
|
pmull2 $t4.1q, $res2.2d, $h2.2d // GHASH block 4k+2 - high
|
|
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 5
|
|
ins $t6.d[1], $t6.d[0] // GHASH block 4k+2 - mid
|
|
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 3
|
|
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 3
|
|
eor $acc_hb, $acc_hb, $t4.16b // GHASH block 4k+2 - high
|
|
pmull $t8.1q, $res3.1d, $h1.1d // GHASH block 4k+3 - low
|
|
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 4
|
|
mov $t9d, $res3.d[1] // GHASH block 4k+3 - mid
|
|
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 4
|
|
pmull2 $t6.1q, $t6.2d, $h12k.2d // GHASH block 4k+2 - mid
|
|
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 5
|
|
eor $t9.8b, $t9.8b, $res3.8b // GHASH block 4k+3 - mid
|
|
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 5
|
|
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 6
|
|
eor $acc_mb, $acc_mb, $t6.16b // GHASH block 4k+2 - mid
|
|
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 6
|
|
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 6
|
|
movi $mod_constant.8b, #0xc2
|
|
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 6
|
|
eor $acc_lb, $acc_lb, $t8.16b // GHASH block 4k+3 - low
|
|
pmull $t9.1q, $t9.1d, $h12k.1d // GHASH block 4k+3 - mid
|
|
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 7
|
|
cmp $rounds, #12 // setup flags for AES-128/192/256 check
|
|
eor $acc_hb, $acc_hb, $t7.16b // GHASH block 4k+3 - high
|
|
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 7
|
|
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 7
|
|
eor $acc_mb, $acc_mb, $t9.16b // GHASH block 4k+3 - mid
|
|
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 8
|
|
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 7
|
|
eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up
|
|
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 8
|
|
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 8
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant
|
|
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 8
|
|
b.lt .Ldec_finish_prepretail // branch if AES-128
|
|
|
|
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 9
|
|
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 9
|
|
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 9
|
|
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 9
|
|
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 10
|
|
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 10
|
|
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 10
|
|
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 10
|
|
b.eq .Ldec_finish_prepretail // branch if AES-192
|
|
|
|
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 11
|
|
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 11
|
|
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 11
|
|
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b // AES block 4k+6 - round 12
|
|
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 11
|
|
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b // AES block 4k+5 - round 12
|
|
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b // AES block 4k+4 - round 12
|
|
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b // AES block 4k+7 - round 12
|
|
|
|
.Ldec_finish_prepretail:
|
|
eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid
|
|
eor $output_h2, $output_h2, $rkN_h // AES block 4k+2 - round N high
|
|
eor $output_l3, $output_l3, $rkN_l // AES block 4k+3 - round N low
|
|
eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid
|
|
add $rctr32w, $rctr32w, #1 // CTR block 4k+7
|
|
eor $output_l2, $output_l2, $rkN_l // AES block 4k+2 - round N low
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
eor $output_h3, $output_h3, $rkN_h // AES block 4k+3 - round N high
|
|
stp $output_l2, $output_h2, [$output_ptr], #16 // AES block 4k+2 - store result
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
stp $output_l3, $output_h3, [$output_ptr], #16 // AES block 4k+3 - store result
|
|
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low
|
|
aese $ctr1b, $rkNm1 // AES block 4k+5 - round N-1
|
|
aese $ctr0b, $rkNm1 // AES block 4k+4 - round N-1
|
|
aese $ctr3b, $rkNm1 // AES block 4k+7 - round N-1
|
|
aese $ctr2b, $rkNm1 // AES block 4k+6 - round N-1
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
|
|
.Ldec_tail: // TAIL
|
|
sub $main_end_input_ptr, $end_input_ptr, $input_ptr // main_end_input_ptr is number of bytes left to process
|
|
ld1 { $res1b}, [$input_ptr], #16 // AES block 4k+4 - load ciphertext
|
|
eor $ctr0b, $res1b, $ctr0b // AES block 4k+4 - result
|
|
mov $output_l0, $ctr0.d[0] // AES block 4k+4 - mov low
|
|
mov $output_h0, $ctr0.d[1] // AES block 4k+4 - mov high
|
|
ext $t0.16b, $acc_lb, $acc_lb, #8 // prepare final partial tag
|
|
cmp $main_end_input_ptr, #48
|
|
eor $output_l0, $output_l0, $rkN_l // AES block 4k+4 - round N low
|
|
eor $output_h0, $output_h0, $rkN_h // AES block 4k+4 - round N high
|
|
b.gt .Ldec_blocks_more_than_3
|
|
sub $rctr32w, $rctr32w, #1
|
|
mov $ctr3b, $ctr2b
|
|
movi $acc_m.8b, #0
|
|
movi $acc_l.8b, #0
|
|
cmp $main_end_input_ptr, #32
|
|
movi $acc_h.8b, #0
|
|
mov $ctr2b, $ctr1b
|
|
b.gt .Ldec_blocks_more_than_2
|
|
sub $rctr32w, $rctr32w, #1
|
|
mov $ctr3b, $ctr1b
|
|
cmp $main_end_input_ptr, #16
|
|
b.gt .Ldec_blocks_more_than_1
|
|
sub $rctr32w, $rctr32w, #1
|
|
b .Ldec_blocks_less_than_1
|
|
.Ldec_blocks_more_than_3: // blocks left > 3
|
|
rev64 $res0b, $res1b // GHASH final-3 block
|
|
ld1 { $res1b}, [$input_ptr], #16 // AES final-2 block - load ciphertext
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-3 block - store result
|
|
mov $acc_md, $h34k.d[1] // GHASH final-3 block - mid
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
eor $ctr0b, $res1b, $ctr1b // AES final-2 block - result
|
|
mov $rk4d, $res0.d[1] // GHASH final-3 block - mid
|
|
mov $output_l0, $ctr0.d[0] // AES final-2 block - mov low
|
|
mov $output_h0, $ctr0.d[1] // AES final-2 block - mov high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-3 block - mid
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
pmull2 $acc_h.1q, $res0.2d, $h4.2d // GHASH final-3 block - high
|
|
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d // GHASH final-3 block - mid
|
|
eor $output_l0, $output_l0, $rkN_l // AES final-2 block - round N low
|
|
pmull $acc_l.1q, $res0.1d, $h4.1d // GHASH final-3 block - low
|
|
eor $output_h0, $output_h0, $rkN_h // AES final-2 block - round N high
|
|
.Ldec_blocks_more_than_2: // blocks left > 2
|
|
rev64 $res0b, $res1b // GHASH final-2 block
|
|
ld1 { $res1b}, [$input_ptr], #16 // AES final-1 block - load ciphertext
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-2 block - store result
|
|
eor $ctr0b, $res1b, $ctr2b // AES final-1 block - result
|
|
mov $rk4d, $res0.d[1] // GHASH final-2 block - mid
|
|
pmull $rk3q1, $res0.1d, $h3.1d // GHASH final-2 block - low
|
|
pmull2 $rk2q1, $res0.2d, $h3.2d // GHASH final-2 block - high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-2 block - mid
|
|
mov $output_l0, $ctr0.d[0] // AES final-1 block - mov low
|
|
mov $output_h0, $ctr0.d[1] // AES final-1 block - mov high
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final-2 block - low
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
pmull $rk4v.1q, $rk4v.1d, $h34k.1d // GHASH final-2 block - mid
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final-2 block - high
|
|
eor $output_l0, $output_l0, $rkN_l // AES final-1 block - round N low
|
|
eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-2 block - mid
|
|
eor $output_h0, $output_h0, $rkN_h // AES final-1 block - round N high
|
|
.Ldec_blocks_more_than_1: // blocks left > 1
|
|
stp $output_l0, $output_h0, [$output_ptr], #16 // AES final-1 block - store result
|
|
rev64 $res0b, $res1b // GHASH final-1 block
|
|
ld1 { $res1b}, [$input_ptr], #16 // AES final block - load ciphertext
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
movi $t0.8b, #0 // suppress further partial tag feed in
|
|
mov $rk4d, $res0.d[1] // GHASH final-1 block - mid
|
|
eor $ctr0b, $res1b, $ctr3b // AES final block - result
|
|
pmull2 $rk2q1, $res0.2d, $h2.2d // GHASH final-1 block - high
|
|
eor $rk4v.8b, $rk4v.8b, $res0.8b // GHASH final-1 block - mid
|
|
pmull $rk3q1, $res0.1d, $h2.1d // GHASH final-1 block - low
|
|
mov $output_l0, $ctr0.d[0] // AES final block - mov low
|
|
ins $rk4v.d[1], $rk4v.d[0] // GHASH final-1 block - mid
|
|
mov $output_h0, $ctr0.d[1] // AES final block - mov high
|
|
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d // GHASH final-1 block - mid
|
|
eor $output_l0, $output_l0, $rkN_l // AES final block - round N low
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final-1 block - low
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final-1 block - high
|
|
eor $acc_mb, $acc_mb, $rk4v.16b // GHASH final-1 block - mid
|
|
eor $output_h0, $output_h0, $rkN_h // AES final block - round N high
|
|
.Ldec_blocks_less_than_1: // blocks left <= 1
|
|
and $bit_length, $bit_length, #127 // bit_length %= 128
|
|
mvn $rkN_h, xzr // rkN_h = 0xffffffffffffffff
|
|
sub $bit_length, $bit_length, #128 // bit_length -= 128
|
|
mvn $rkN_l, xzr // rkN_l = 0xffffffffffffffff
|
|
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] // load existing bytes we need to not overwrite
|
|
neg $bit_length, $bit_length // bit_length = 128 - #bits in input (in range [1,128])
|
|
and $bit_length, $bit_length, #127 // bit_length %= 128
|
|
lsr $rkN_h, $rkN_h, $bit_length // rkN_h is mask for top 64b of last block
|
|
cmp $bit_length, #64
|
|
csel $ctr32x, $rkN_l, $rkN_h, lt
|
|
csel $ctr96_b64x, $rkN_h, xzr, lt
|
|
fmov $ctr0d, $ctr32x // ctr0b is mask for last block
|
|
and $output_l0, $output_l0, $ctr32x
|
|
mov $ctr0.d[1], $ctr96_b64x
|
|
bic $end_input_ptr, $end_input_ptr, $ctr32x // mask out low existing bytes
|
|
rev $ctr32w, $rctr32w
|
|
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x // mask out high existing bytes
|
|
orr $output_l0, $output_l0, $end_input_ptr
|
|
and $output_h0, $output_h0, $ctr96_b64x
|
|
orr $output_h0, $output_h0, $main_end_input_ptr
|
|
and $res1b, $res1b, $ctr0b // possibly partial last block has zeroes in highest bits
|
|
rev64 $res0b, $res1b // GHASH final block
|
|
eor $res0b, $res0b, $t0.16b // feed in partial tag
|
|
pmull $rk3q1, $res0.1d, $h1.1d // GHASH final block - low
|
|
mov $t0d, $res0.d[1] // GHASH final block - mid
|
|
eor $t0.8b, $t0.8b, $res0.8b // GHASH final block - mid
|
|
pmull2 $rk2q1, $res0.2d, $h1.2d // GHASH final block - high
|
|
pmull $t0.1q, $t0.1d, $h12k.1d // GHASH final block - mid
|
|
eor $acc_hb, $acc_hb, $rk2 // GHASH final block - high
|
|
eor $acc_lb, $acc_lb, $rk3 // GHASH final block - low
|
|
eor $acc_mb, $acc_mb, $t0.16b // GHASH final block - mid
|
|
movi $mod_constant.8b, #0xc2
|
|
eor $t9.16b, $acc_lb, $acc_hb // MODULO - karatsuba tidy up
|
|
shl $mod_constantd, $mod_constantd, #56 // mod_constant
|
|
eor $acc_mb, $acc_mb, $t9.16b // MODULO - karatsuba tidy up
|
|
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d // MODULO - top 64b align with mid
|
|
ext $acc_hb, $acc_hb, $acc_hb, #8 // MODULO - other top alignment
|
|
eor $acc_mb, $acc_mb, $mod_t.16b // MODULO - fold into mid
|
|
eor $acc_mb, $acc_mb, $acc_hb // MODULO - fold into mid
|
|
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d // MODULO - mid 64b align with low
|
|
ext $acc_mb, $acc_mb, $acc_mb, #8 // MODULO - other mid alignment
|
|
eor $acc_lb, $acc_lb, $mod_constant.16b // MODULO - fold into low
|
|
stp $output_l0, $output_h0, [$output_ptr]
|
|
str $ctr32w, [$counter, #12] // store the updated counter
|
|
eor $acc_lb, $acc_lb, $acc_mb // MODULO - fold into low
|
|
ext $acc_lb, $acc_lb, $acc_lb, #8
|
|
rev64 $acc_lb, $acc_lb
|
|
mov x0, $len
|
|
st1 { $acc_l.16b }, [$current_tag]
|
|
ldp x19, x20, [sp, #16]
|
|
ldp x21, x22, [sp, #32]
|
|
ldp x23, x24, [sp, #48]
|
|
ldp d8, d9, [sp, #64]
|
|
ldp d10, d11, [sp, #80]
|
|
ldp d12, d13, [sp, #96]
|
|
ldp d14, d15, [sp, #112]
|
|
ldp x29, x30, [sp], #128
|
|
AARCH64_VALIDATE_LINK_REGISTER
|
|
ret
|
|
.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel
|
|
___
|
|
}
|
|
}
|
|
|
|
# Script epilogue: finish the generated text held in $code and write it out.
#
# Append a final "#endif" line to $code.
# NOTE(review): the matching "#if"/"#ifdef" is emitted earlier in the file and
# is not visible from here - confirm the pairing when editing either end.
$code.=<<___;
|
|
#endif
|
|
___
|
|
|
|
# Write the accumulated text in $code to STDOUT.
print $code;
|
|
# Close STDOUT explicitly and die with the OS error ($!) if the close fails,
# rather than ignoring the failure.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
|