chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
# bit shift operation is neatly fused with 128-bit xor here, and
# "528B" variant would eliminate only 4-5 instructions out of 32
# in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
# consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
$Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$code=<<___;
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.arch armv7-a
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
___
{
# NEON register allocation: accumulator/input live in q0-q3, temporaries in
# q8-q11 (note: the list 8..12 supplies one more name than is consumed),
# and the twisted-H halves plus bit-mask constants in d26-d31.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
# clmul64x64 appends to $code a 64x64->128-bit carry-less multiplication
# $r = $a * $b built from 8-bit polynomial multiplies (vmull.p8), following
# the Câmara-Gouvêa-López-Dahab paper cited in the header. $a and $b are
# 64-bit d-registers; $r is a 128-bit q-register. $t0-$t3 are clobbered.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
___
}
# Emit gcm_init_neon (computes "twisted H"), gcm_gmult_neon (single-block
# multiply) and the header of gcm_ghash_neon, which share the .Lgmult_neon
# tail emitted further below.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
vshr.u64 $Hlo,$IN#lo,#63
vshr.s8 $t1,#7 @ broadcast carry bit
vshl.i64 $IN,$IN,#1
vand $t0,$t0,$t1
vorr $IN#hi,$Hlo @ H<<<=1
veor $IN,$IN,$t0 @ twisted H
vstmia r0,{$IN}
ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
vld1.64 $IN#hi,[$Xi]! @ load Xi
vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
mov $len,#16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 $Xl#hi,[$Xi]! @ load Xi
vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
vmov.i64 $k16,#0x000000000000ffff
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
.Loop_neon:
vld1.64 $IN#hi,[$inp]! @ load inp
vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
# 128x128-bit multiply via Karatsuba: three 64x64 carry-less multiplies.
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
# Karatsuba recombination followed by the two-phase GF(2^128) reduction
# (same scheme as reduction_avx in ghash-x86_64.pl), loop control, and the
# write-back of Xi.
$code.=<<___;
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
veor $Xm,$Xm,$Xh
veor $Xl#hi,$Xl#hi,$Xm#lo
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 $t1,$Xl,#57 @ 1st phase
vshl.i64 $t2,$Xl,#62
veor $t2,$t2,$t1 @
vshl.i64 $t1,$Xl,#63
veor $t2, $t2, $t1 @
veor $Xl#hi,$Xl#hi,$t2#lo @
veor $Xh#lo,$Xh#lo,$t2#hi
vshr.u64 $t2,$Xl,#1 @ 2nd phase
veor $Xh,$Xh,$Xl
veor $Xl,$Xl,$t2 @
vshr.u64 $t2,$t2,#6
vshr.u64 $Xl,$Xl,#1 @
veor $Xl,$Xl,$Xh @
veor $Xl,$Xl,$t2 @
subs $len,#16
bne .Loop_neon
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Post-process the accumulated perlasm line by line. The substitutions are
# `or`-chained, so each line receives at most one of the rewrites below:
# q-register half references (qN#lo/qN#hi) become d-registers; failing that,
# "ret" becomes "bx lr"; failing that, a pre-existing literal "bx lr" is
# emitted as its raw encoding so the file assembles with -march=armv4.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,290 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It
# implements the multiplication algorithm described in:
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
#
# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is
# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit
# NEON, the low and high halves of the 128-bit register q0 are accessible as
# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of
# vN. Where the 32-bit version would use the upper half, this file must keep
# halves in separate registers.
#
# The other distinction is in syntax. 32-bit NEON embeds lane information in the
# instruction name, while AArch64 uses suffixes on the registers. For instance,
# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written:
#
# vshl.i64 q0, q0, #1
#
# in 64-bit, it would be written:
#
# shl v0.2d, v0.2d, #1
#
# See Programmer's Guide for ARMv8-A, section 7 for details.
# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf
#
# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ
# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials
# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit
# polynomial and is conditioned on the PMULL extension. This file emulates the
# latter with the former.
use strict;
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
my $flavour = shift;
my $output = shift;
# For a real flavour, pipe our output through arm-xlate.pl so the perlasm
# below is translated into flavour-specific assembly; a "void"/empty flavour
# writes the untranslated perlasm directly to $output.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# AArch64 argument registers x0-x3 and the SIMD register assignments used
# throughout the generated code.
my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block
my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4));
my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7));
# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers
# to spare.
my ($t0, $t1, $t2, $t3) = map("v$_", (16..19));
my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23));
my ($k48_k32, $k16_k0) = map("v$_", (24..25));
my $code = "";
# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b
# must be distinct from $t* and $k*. $t* are clobbered by the emitted code.
# Arguments are SIMD register names (strings); the code is appended to $code.
# The "//" comments inside the heredoc are emitted into the assembly output.
sub clmul64x64 {
my ($r, $a, $b) = @_;
$code .= <<___;
ext $t0.8b, $a.8b, $a.8b, #1 // A1
pmull $t0.8h, $t0.8b, $b.8b // F = A1*B
ext $r.8b, $b.8b, $b.8b, #1 // B1
pmull $r.8h, $a.8b, $r.8b // E = A*B1
ext $t1.8b, $a.8b, $a.8b, #2 // A2
pmull $t1.8h, $t1.8b, $b.8b // H = A2*B
ext $t3.8b, $b.8b, $b.8b, #2 // B2
pmull $t3.8h, $a.8b, $t3.8b // G = A*B2
ext $t2.8b, $a.8b, $a.8b, #3 // A3
eor $t0.16b, $t0.16b, $r.16b // L = E + F
pmull $t2.8h, $t2.8b, $b.8b // J = A3*B
ext $r.8b, $b.8b, $b.8b, #3 // B3
eor $t1.16b, $t1.16b, $t3.16b // M = G + H
pmull $r.8h, $a.8b, $r.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L)
// vand \$t0#hi, \$t0#hi, \$k48
// veor \$t0#lo, \$t0#lo, \$t0#hi
//
// veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M)
// vand \$t1#hi, \$t1#hi, \$k32
// veor \$t1#lo, \$t1#lo, \$t1#hi
//
// veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N)
// vand \$t2#hi, \$t2#hi, \$k16
// veor \$t2#lo, \$t2#lo, \$t2#hi
//
// veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 \$t3#hi, #0
//
// \$kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext $t3.8b, $b.8b, $b.8b, #4 // B4
eor $t2.16b, $t2.16b, $r.16b // N = I + J
pmull $t3.8h, $a.8b, $t3.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 $t0l_t1l.2d, $t0.2d, $t1.2d
zip1 $t2l_t3l.2d, $t2.2d, $t3.2d
zip2 $t0h_t1h.2d, $t0.2d, $t1.2d
zip2 $t2h_t3h.2d, $t2.2d, $t3.2d
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b
and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b
eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b
eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b
zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d
zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d
zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d
ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8
ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16
pmull $r.8h, $a.8b, $b.8b // D = A*B
ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32
ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24
eor $t0.16b, $t0.16b, $t1.16b
eor $t2.16b, $t2.16b, $t3.16b
eor $r.16b, $r.16b, $t0.16b
eor $r.16b, $r.16b, $t2.16b
___
}
# Emit gcm_init_neon (computes "twisted H"), gcm_gmult_neon (single-block
# multiply), and the header of gcm_ghash_neon, whose shared multiply tail
# (.Lgmult_neon) is emitted further below.
$code .= <<___;
#include <openssl/arm_arch.h>
.text
.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
ld1 {$t1.2d}, [x1] // load H
movi $t3.16b, #0xe1
shl $t3.2d, $t3.2d, #57 // 0xc2.0
ext $INlo.16b, $t1.16b, $t1.16b, #8
ushr $t2.2d, $t3.2d, #63
dup $t1.4s, $t1.s[1]
ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01
ushr $t2.2d, $INlo.2d, #63
sshr $t1.4s, $t1.4s, #31 // broadcast carry bit
and $t2.16b, $t2.16b, $t0.16b
shl $INlo.2d, $INlo.2d, #1
ext $t2.16b, $t2.16b, $t2.16b, #8
and $t0.16b, $t0.16b, $t1.16b
orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1
eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H
st1 {$Hlo.2d}, [x0] // store Htable[0]
ret
.size gcm_init_neon,.-gcm_init_neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$INlo.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $INlo.16b, $INlo.16b // byteswap Xi
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
mov $len, #16
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {$Xl.16b}, [$Xi] // load Xi
ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H
ld1 {$Hhi.1d}, [$Htbl]
adrp x9, :pg_hi21:.Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {$k48_k32.2d, $k16_k0.2d}, [x9]
rev64 $Xl.16b, $Xl.16b // byteswap Xi
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing
.Loop_neon:
ld1 {$INlo.16b}, [$inp], #16 // load inp
rev64 $INlo.16b, $INlo.16b // byteswap inp
ext $INlo.16b, $INlo.16b, $INlo.16b, #8
eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi
.Lgmult_neon:
// Split the input into $INlo and $INhi. (The upper halves are unused,
// so it is okay to leave them alone.)
ins $INhi.d[0], $INlo.d[1]
___
# 128x128-bit multiply via Karatsuba: three emulated 64x64 carry-less
# multiplies over the low, combined, and high halves.
&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo
$code .= <<___;
eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing
___
&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi
# Karatsuba recombination, the two-phase GF(2^128) reduction (equivalent of
# reduction_avx from ghash-x86_64.pl), loop control, write-back of Xi, and
# the constant pool (.Lmasks) referenced above.
$code .= <<___;
ext $t0.16b, $Xl.16b, $Xh.16b, #8
eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing
eor $Xm.16b, $Xm.16b, $Xh.16b
eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi
ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins $Xh.d[0], $Xm.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
shl $t1.2d, $Xl.2d, #57 // 1st phase
shl $t2.2d, $Xl.2d, #62
eor $t2.16b, $t2.16b, $t1.16b //
shl $t1.2d, $Xl.2d, #63
eor $t2.16b, $t2.16b, $t1.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor $t2.16b, $t2.16b, $Xm.16b
ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0]
ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1]
ushr $t2.2d, $Xl.2d, #1 // 2nd phase
eor $Xh.16b, $Xh.16b,$Xl.16b
eor $Xl.16b, $Xl.16b,$t2.16b //
ushr $t2.2d, $t2.2d, #6
ushr $Xl.2d, $Xl.2d, #1 //
eor $Xl.16b, $Xl.16b, $Xh.16b //
eor $Xl.16b, $Xl.16b, $t2.16b //
subs $len, $len, #16
bne .Loop_neon
rev64 $Xl.16b, $Xl.16b // byteswap Xi and write
ext $Xl.16b, $Xl.16b, $Xl.16b, #8
st1 {$Xl.16b}, [$Xi]
ret
.size gcm_ghash_neon,.-gcm_ghash_neon
.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.asciz "GHASH for ARMv8, derived from ARMv4 version by <appro\@openssl.org>"
.align 2
___
# Post-process the accumulated perlasm: expand any `...` escapes via eval
# and emit each finished line to the (possibly redirected) STDOUT.
for my $line (split /\n/, $code) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;
    print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
# SPDX-License-Identifier: ISC
# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and indexes them up into the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
# hhhhhh
# * xxxxxx
# ============
# 666666
# 555555
# 444444
# 333333
# 222222
# 111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
# hhhhhh
# * xxxxxx
# ============
# 224466
# 113355
# 224466
# 113355
# 224466
# 113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a byte
# as first. Bytes are little-endian, and bits are big-endian. We reverse the
# bytes in XMM registers for a consistent bit and byte ordering, but this means
# the least significant bit is the most significant coefficient and vice versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move from MSB to the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Locate the shared x86 perlasm framework relative to this script and pull
# in x86asm.pl, which provides the &function_begin/&mov/... emitters below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT, ">$output";
&asm_init($ARGV[0]);
# 32-bit register assignments for the (Xi, Htable, in, len) arguments.
my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx");
# Labels for the constant pool emitted at the bottom of the file.
&static_label("reverse_bytes");
&static_label("low4_mask");
# Counter used to generate unique loop labels per process_rows invocation.
my $call_counter = 0;
# process_rows emits assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the
# low and high halves of the input. The result so far is passed in xmm2. xmm3
# must be zero. On output, $Htable is advanced to the next row and xmm2 is
# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6.
#
# (A stray "____" bareword line — apparently a leftover heredoc terminator
# from the x86_64 version of this script — used to sit at the end of this
# sub; it has been removed. It served no purpose and would be a compile
# error under "use strict".)
sub process_rows {
my ($rows) = @_;
$call_counter++;
# Shifting whole XMM registers by bits is complex. psrldq shifts by
# bytes, and psrlq shifts the two 64-bit halves separately. Each row
# produces 8 bits of carry, and the reduction needs an additional 7-bit
# shift. This must fit in 64 bits so reduction can use psrlq. This
# allows up to 7 rows at a time.
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
&mov("eax", $rows);
&set_label("loop_row_$call_counter");
&movdqa("xmm4", &QWP(0, $Htable));
&lea($Htable, &DWP(16, $Htable));
# Right-shift xmm2 and xmm3 by 8 bytes.
&movdqa("xmm6", "xmm2");
&palignr("xmm6", "xmm3", 1);
&movdqa("xmm3", "xmm6");
&psrldq("xmm2", 1);
# Load the next table row and index the low and high bits of the input.
# Note the low (respectively, high) half corresponds to more
# (respectively, less) significant coefficients.
&movdqa("xmm5", "xmm4");
&pshufb("xmm4", "xmm0");
&pshufb("xmm5", "xmm1");
# Add the high half (xmm5) without shifting.
&pxor("xmm2", "xmm5");
# Add the low half (xmm4). This must be right-shifted by 4 bits. First,
# add into the carry register (xmm3).
&movdqa("xmm5", "xmm4");
&psllq("xmm5", 60);
&movdqa("xmm6", "xmm5");
&pslldq("xmm6", 8);
&pxor("xmm3", "xmm6");
# Next, add into xmm2.
&psrldq("xmm5", 8);
&pxor("xmm2", "xmm5");
&psrlq("xmm4", 4);
&pxor("xmm2", "xmm4");
&sub("eax", 1);
&jnz(&label("loop_row_$call_counter"));
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
# x^7, so we shift and XOR four times.
&pxor("xmm2", "xmm3"); # x^0 = 0
&psrlq("xmm3", 1);
&pxor("xmm2", "xmm3"); # x^1 = x
&psrlq("xmm3", 1);
&pxor("xmm2", "xmm3"); # x^(1+1) = x^2
&psrlq("xmm3", 5);
&pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7
&pxor("xmm3", "xmm3");
}
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
&function_begin("gcm_gmult_ssse3");
&mov($Xi, &wparam(0));
&mov($Htable, &wparam(1));
&movdqu("xmm0", &QWP(0, $Xi));
# Materialize a PC-relative base in eax (32-bit PIC idiom) so the constant
# pool labels below can be addressed position-independently.
&call(&label("pic_point"));
&set_label("pic_point");
&blindpop("eax");
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax"));
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax"));
# Reverse input bytes to deserialize.
&pshufb("xmm0", "xmm7");
# Split each byte into low (xmm0) and high (xmm1) halves.
&movdqa("xmm1", "xmm2");
&pandn("xmm1", "xmm0");
&psrld("xmm1", 4);
&pand("xmm0", "xmm2");
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
# that, due to bit reversal, xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
# We must reduce at least once every 7 rows, so divide into three
# chunks.
&process_rows(5);
&process_rows(5);
&process_rows(6);
# Store the result. Reverse bytes to serialize.
&pshufb("xmm2", "xmm7");
&movdqu(&QWP(0, $Xi), "xmm2");
# Zero any registers which contain secrets.
&pxor("xmm0", "xmm0");
&pxor("xmm1", "xmm1");
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
&pxor("xmm4", "xmm4");
&pxor("xmm5", "xmm5");
&pxor("xmm6", "xmm6");
&function_end("gcm_gmult_ssse3");
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
# size_t len);
&function_begin("gcm_ghash_ssse3");
&mov($Xi, &wparam(0));
&mov($Htable, &wparam(1));
&mov($in, &wparam(2));
&mov($len, &wparam(3));
&movdqu("xmm0", &QWP(0, $Xi));
# Materialize a PC-relative base in ebx (32-bit PIC idiom) so the constant
# pool labels below can be addressed position-independently.
&call(&label("pic_point"));
&set_label("pic_point");
&blindpop("ebx");
&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx"));
# This function only processes whole blocks.
&and($len, -16);
# Reverse input bytes to deserialize. We maintain the running
# total in xmm0.
&pshufb("xmm0", "xmm7");
# Iterate over each block. On entry to each iteration, xmm3 is zero.
&pxor("xmm3", "xmm3");
&set_label("loop_ghash");
&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx"));
# Incorporate the next block of input.
&movdqu("xmm1", &QWP(0, $in));
&pshufb("xmm1", "xmm7"); # Reverse bytes.
&pxor("xmm0", "xmm1");
# Split each byte into low (xmm0) and high (xmm1) halves.
&movdqa("xmm1", "xmm2");
&pandn("xmm1", "xmm0");
&psrld("xmm1", 4);
&pand("xmm0", "xmm2");
# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note
# that, due to bit reversal, xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
&pxor("xmm2", "xmm2");
# xmm3 is already zero at this point.
# We must reduce at least once every 7 rows, so divide into three
# chunks.
&process_rows(5);
&process_rows(5);
&process_rows(6);
&movdqa("xmm0", "xmm2");
# Rewind $Htable for the next iteration.
&lea($Htable, &DWP(-256, $Htable));
# Advance input and continue.
&lea($in, &DWP(16, $in));
&sub($len, 16);
&jnz(&label("loop_ghash"));
# Reverse bytes and store the result.
&pshufb("xmm0", "xmm7");
&movdqu(&QWP(0, $Xi), "xmm0");
# Zero any registers which contain secrets.
&pxor("xmm0", "xmm0");
&pxor("xmm1", "xmm1");
&pxor("xmm2", "xmm2");
&pxor("xmm3", "xmm3");
&pxor("xmm4", "xmm4");
&pxor("xmm5", "xmm5");
&pxor("xmm6", "xmm6");
&function_end("gcm_ghash_ssse3");
# Constant pool referenced (PC-relative) by both functions above.
# reverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
&set_label("reverse_bytes", 16);
&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
# low4_mask is an XMM mask which selects the low four bits of each byte.
&set_label("low4_mask", 16);
&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f);
# Flush the accumulated assembly to the output file opened above.
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,342 @@
#!/usr/bin/env perl
# Copyright (c) 2019, Google Inc.
#
# SPDX-License-Identifier: ISC
# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and indexes them up into the table. Visually,
# it multiplies as in the schoolbook multiplication diagram below, but with
# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First
# it incorporates the terms labeled '1' by indexing the most significant term
# of X into the table. Then it shifts and repeats for '2' and so on.
#
# hhhhhh
# * xxxxxx
# ============
# 666666
# 555555
# 444444
# 333333
# 222222
# 111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
# hhhhhh
# * xxxxxx
# ============
# 224466
# 113355
# 224466
# 113355
# 224466
# 113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a byte
# as first. Bytes are little-endian, and bits are big-endian. We reverse the
# bytes in XMM registers for a consistent bit and byte ordering, but this means
# the least significant bit is the most significant coefficient and vice versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move from MSB to the LSB and correspond to
# increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
use strict;
# This is a perlasm source file: it emits x86_64 GHASH (SSSE3) assembly by
# piping generated text through the x86_64-xlate.pl translator.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
my $flavour = shift;
my $output = shift;
# Windows x64 uses a different calling convention and SEH unwind directives.
my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the translator relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Check that the pipe to the translator actually opened; previously a failed
# open was silently ignored and all output went to a dead handle.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT = *OUT;
# Argument registers: Win64 passes the first four integer arguments in
# rcx/rdx/r8/r9, SysV in rdi/rsi/rdx/rcx.
my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
("%rdi", "%rsi", "%rdx", "%rcx");
# Function header and prologue for gcm_gmult_ssse3.
my $code = <<____;
.text
# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type gcm_gmult_ssse3, \@abi-omnipotent
.globl gcm_gmult_ssse3
.align 16
gcm_gmult_ssse3:
.cfi_startproc
.seh_startproc
_CET_ENDBR
____
# On Windows, xmm6-xmm15 are callee-saved; save the two we clobber.
$code .= <<____ if ($win64);
subq \$40, %rsp
.seh_allocstack 40
movdqa %xmm6, (%rsp)
.seh_savexmm128 %xmm6, 0
movdqa %xmm10, 16(%rsp)
.seh_savexmm128 %xmm10, 16
____
# Load Xi and the shared constants, then split the input into 4-bit halves
# ready for the table lookups emitted by process_rows below.
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm2
# Reverse input bytes to deserialize.
pshufb %xmm10, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm2, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm2, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
____
# Each call to process_rows emits a uniquely-labelled loop; the counter keeps
# the assembler labels distinct across calls.
my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
sub process_rows {
my ($rows) = @_;
$call_counter++;
# Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
# and psrlq shifts the two 64-bit halves separately. Each row produces 8
# bits of carry, and the reduction needs an additional 7-bit shift. This
# must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
# at a time.
die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);
return <<____;
movq \$$rows, %rax
.Loop_row_$call_counter:
movdqa ($Htable), %xmm4
leaq 16($Htable), $Htable
# Right-shift %xmm2 and %xmm3 by 8 bits, i.e. one byte. (The palignr and
# psrldq immediates count bytes, so the shift amount here is 1.)
movdqa %xmm2, %xmm6
palignr \$1, %xmm3, %xmm6
movdqa %xmm6, %xmm3
psrldq \$1, %xmm2
# Load the next table row and index the low and high bits of the input.
# Note the low (respectively, high) half corresponds to more
# (respectively, less) significant coefficients.
movdqa %xmm4, %xmm5
pshufb %xmm0, %xmm4
pshufb %xmm1, %xmm5
# Add the high half (%xmm5) without shifting.
pxor %xmm5, %xmm2
# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
# add into the carry register (%xmm3).
movdqa %xmm4, %xmm5
psllq \$60, %xmm5
movdqa %xmm5, %xmm6
pslldq \$8, %xmm6
pxor %xmm6, %xmm3
# Next, add into %xmm2.
psrldq \$8, %xmm5
pxor %xmm5, %xmm2
psrlq \$4, %xmm4
pxor %xmm4, %xmm2
subq \$1, %rax
jnz .Loop_row_$call_counter
# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
# x^7, so we shift and XOR four times.
pxor %xmm3, %xmm2 # x^0 = 0
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^1 = x
psrlq \$1, %xmm3
pxor %xmm3, %xmm2 # x^(1+1) = x^2
psrlq \$5, %xmm3
pxor %xmm3, %xmm2 # x^(1+1+5) = x^7
pxor %xmm3, %xmm3
____
}
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
# Serialize the result and clear registers that held key/input material.
$code .= <<____;
# Store the result. Reverse bytes to serialize.
pshufb %xmm10, %xmm2
movdqu %xmm2, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
# Restore the Win64 callee-saved XMM registers spilled in the prologue.
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
addq \$40, %rsp
____
$code .= <<____;
ret
.cfi_endproc
.seh_endproc
.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
____
# gcm_ghash_ssse3: the streaming variant. Same per-block core as gmult, but
# iterates over |len| bytes of |in|, XOR-ing each block into the running Xi.
$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
# size_t len);
.type gcm_ghash_ssse3, \@abi-omnipotent
.globl gcm_ghash_ssse3
.align 16
gcm_ghash_ssse3:
.cfi_startproc
.seh_startproc
_CET_ENDBR
____
# Win64 prologue: this function additionally clobbers xmm11, so three
# callee-saved XMM registers are spilled.
$code .= <<____ if ($win64);
subq \$56, %rsp
.seh_allocstack 56
movdqa %xmm6, (%rsp)
.seh_savexmm128 %xmm6, 0
movdqa %xmm10, 16(%rsp)
.seh_savexmm128 %xmm10, 16
movdqa %xmm11, 32(%rsp)
.seh_savexmm128 %xmm11, 32
____
# Per-call setup and the top of the per-block loop.
$code .= <<____;
movdqu ($Xi), %xmm0
movdqa .Lreverse_bytes(%rip), %xmm10
movdqa .Llow4_mask(%rip), %xmm11
# This function only processes whole blocks.
andq \$-16, $len
# Reverse input bytes to deserialize. We maintain the running
# total in %xmm0.
pshufb %xmm10, %xmm0
# Iterate over each block. On entry to each iteration, %xmm3 is zero.
pxor %xmm3, %xmm3
.Loop_ghash:
# Incorporate the next block of input.
movdqu ($in), %xmm1
pshufb %xmm10, %xmm1 # Reverse bytes.
pxor %xmm1, %xmm0
# Split each byte into low (%xmm0) and high (%xmm1) halves.
movdqa %xmm11, %xmm1
pandn %xmm0, %xmm1
psrld \$4, %xmm1
pand %xmm11, %xmm0
# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
# that, due to bit reversal, %xmm3 contains bits that fall off when
# right-shifting, not left-shifting.
pxor %xmm2, %xmm2
# %xmm3 is already zero at this point.
____
# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);
# Loop bottom: fold the product back into the running total, rewind the
# table pointer, and advance through the input.
$code .= <<____;
movdqa %xmm2, %xmm0
# Rewind $Htable for the next iteration.
leaq -256($Htable), $Htable
# Advance input and continue.
leaq 16($in), $in
subq \$16, $len
jnz .Loop_ghash
# Reverse bytes and store the result.
pshufb %xmm10, %xmm0
movdqu %xmm0, ($Xi)
# Zero any registers which contain secrets.
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
pxor %xmm4, %xmm4
pxor %xmm5, %xmm5
pxor %xmm6, %xmm6
____
# Win64 epilogue: restore spilled XMM registers.
$code .= <<____ if ($win64);
movdqa (%rsp), %xmm6
movdqa 16(%rsp), %xmm10
movdqa 32(%rsp), %xmm11
addq \$56, %rsp
____
# Function trailer plus the two shared constants in .rodata.
$code .= <<____;
ret
.cfi_endproc
.seh_endproc
.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
.section .rodata
.align 16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.text
____
# Emit everything through the xlate pipe opened above; closing flushes it.
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,684 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# March, May, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
# code paths: vanilla x86 and vanilla SSE. Former will be executed on
# 486 and Pentium, latter on all others. SSE GHASH features so called
# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
# of per-key storage [+512 bytes shared table]. Performance results
# are for streamed GHASH subroutine and are expressed in cycles per
# processed byte, less is better:
#
# gcc 2.95.3(*) SSE assembler x86 assembler
#
# Pentium 105/111(**) - 50
# PIII 68 /75 12.2 24
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
# Atom 105/105 16.8 53
# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen,
# another reason is lack of 3.4.x results for older CPUs;
# comparison with SSE results is not completely fair, because C
# results are for vanilla "256B" implementation, while
# assembler results are for "528B";-)
# (**) second number is result for code compiled with -fPIC flag,
# which is actually more relevant, because assembler code is
# position-independent;
# (***) see comment in non-MMX routine for further details;
#
# To summarize, it's >2-5 times faster than gcc-generated code. To
# anchor it to something else SHA1 assembler processes one byte in
# ~7 cycles on contemporary x86 cores. As for choice of MMX/SSE
# in particular, see comment at the end of the file...
# May 2010
#
# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
# The question is how close is it to theoretical limit? The pclmulqdq
# instruction latency appears to be 14 cycles and there can't be more
# than 2 of them executing at any given time. This means that single
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
# pre- and post-processing. Then multiplication has to be followed by
# modulo-reduction. Given that aggregated reduction method [see
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
# white paper by Intel] allows you to perform reduction only once in
# a while we can assume that asymptotic performance can be estimated
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
# and Naggr is the aggregation factor.
#
# Before we proceed to this implementation let's have closer look at
# the best-performing code suggested by Intel in their white paper.
# By tracing inter-register dependencies Tmod is estimated as ~19
# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
# processed byte. As implied, this is quite optimistic estimate,
# because it does not account for Karatsuba pre- and post-processing,
# which for a single multiplication is ~5 cycles. Unfortunately Intel
# does not provide performance data for GHASH alone. But benchmarking
# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
# the result accounts even for pre-computing of degrees of the hash
# key H, but its portion is negligible at 16KB buffer size.
#
# Moving on to the implementation in question. Tmod is estimated as
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
# 2.16. How is it possible that measured performance is better than
# optimistic theoretical estimate? There is one thing Intel failed
# to recognize. By serializing GHASH with CTR in same subroutine
# former's performance is really limited to above (Tmul + Tmod/Naggr)
# equation. But if GHASH procedure is detached, the modulo-reduction
# can be interleaved with Naggr-1 multiplications at instruction level
# and under ideal conditions even disappear from the equation. So that
# optimistic theoretical estimate for this implementation is ...
# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
# where Tproc is time required for Karatsuba pre- and post-processing,
# is more realistic estimate. In this case it gives ... 1.91 cycles.
# Or in other words, depending on how well we can interleave reduction
# and one of the two multiplications the performance should be between
# 1.91 and 2.16. As already mentioned, this implementation processes
# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
# - in 2.02. x86_64 performance is better, because larger register
# bank allows to interleave reduction and multiplication better.
#
# Does it make sense to increase Naggr? To start with it's virtually
# impossible in 32-bit mode, because of limited register bank
# capacity. Otherwise improvement has to be weighed against slower
# setup, as well as code size and complexity increase. As even
# optimistic estimate doesn't promise 30% performance improvement,
# there are currently no plans to increase Naggr.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.
# January 2010
#
# Tweaked to optimize transitions between integer and FP operations
# on same XMM register, PCLMULQDQ subroutine was measured to process
# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
# The minor regression on Westmere is outweighed by ~15% improvement
# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
#####################################################################
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
# 32-bit mode and 1.89 in 64-bit.
# February 2013
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9. Resulting performance is 1.96 cycles per byte on
# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer.
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.
# Setup for the 32-bit x86 generator, written against the x86asm.pl DSL
# (hence the &-style instruction subs used throughout).
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=$ARGV[1];
# Three-arg open with an explicit mode and error check: the previous two-arg
# form ("open STDOUT,\">$output\"") interpolated the path into the mode string
# and ignored failure, silently discarding all generated assembly.
open STDOUT, '>', $output or die "can't open $output: $!";
&asm_init($ARGV[0]);
$x86only=0;
$sse2=1;
if (!$x86only) {{{
if ($sse2) {{
######################################################################
# PCLMULQDQ version.
# Integer registers used for the argument block.
$Xip="eax";
$Htbl="edx";
$const="ecx";
$inp="esi";
$len="ebx";
# XMM register roles: accumulator pair, hash key, scratch, and the
# lookahead pair used for 2x aggregation.
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
($Xn,$Xhn)=("xmm6","xmm7");
# Forward-declare the label holding the byte-swap mask and polynomial.
&static_label("bswap");
# Emit a 64x64 carry-less multiplication of $Xi by $Hkey into the 256-bit
# pair $Xhi:$Xi, using Karatsuba (three pclmulqdq instead of four). If $HK is
# given it already holds Hkey.lo^Hkey.hi; otherwise it is computed into $T2.
# Clobbers $T1 and (when $HK is undefined) $T2.
sub clmul64x64_T2 { # minimal "register" pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;
&movdqa ($Xhi,$Xi); #
&pshufd ($T1,$Xi,0b01001110);
&pshufd ($T2,$Hkey,0b01001110) if (!defined($HK));
&pxor ($T1,$Xi); # T1 = Xi.lo^Xi.hi (Karatsuba middle term input)
&pxor ($T2,$Hkey) if (!defined($HK));
$HK=$T2 if (!defined($HK));
&pclmulqdq ($Xi,$Hkey,0x00); ####### lo*lo
&pclmulqdq ($Xhi,$Hkey,0x11); ####### hi*hi
&pclmulqdq ($T1,$HK,0x00); ####### middle product
&xorps ($T1,$Xi); #
&xorps ($T1,$Xhi); # T1 = middle term
&movdqa ($T2,$T1); #
&psrldq ($T1,8);
&pslldq ($T2,8); # split middle term across the 128-bit halves
&pxor ($Xhi,$T1);
&pxor ($Xi,$T2); #
}
# Alternative Karatsuba 64x64 carry-less multiply with three temporaries.
# Same contract as clmul64x64_T2 (result in $Xhi:$Xi) but always derives the
# middle-term operands itself; clobbers $T1, $T2 and $T3.
sub clmul64x64_T3 {
# Even though this subroutine offers visually better ILP, it
# was empirically found to be a tad slower than above version.
# At least in gcm_ghash_clmul context. But it's just as well,
# because loop modulo-scheduling is possible only thanks to
# minimized "register" pressure...
my ($Xhi,$Xi,$Hkey)=@_;
&movdqa ($T1,$Xi); #
&movdqa ($Xhi,$Xi);
&pclmulqdq ($Xi,$Hkey,0x00); ####### lo*lo
&pclmulqdq ($Xhi,$Hkey,0x11); ####### hi*hi
&pshufd ($T2,$T1,0b01001110); #
&pshufd ($T3,$Hkey,0b01001110);
&pxor ($T2,$T1); # Xi.lo^Xi.hi
&pxor ($T3,$Hkey); # Hkey.lo^Hkey.hi
&pclmulqdq ($T2,$T3,0x00); ####### middle product
&pxor ($T2,$Xi); #
&pxor ($T2,$Xhi); # T2 = middle term
&movdqa ($T3,$T2); #
&psrldq ($T2,8);
&pslldq ($T3,8); # split middle term across the halves
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
}
if (1) { # Algorithm 9 with <<1 twist.
# Reduction is shorter and uses only two
# temporary registers, which makes it better
# candidate for interleaving with 64x64
# multiplication. Pre-modulo-scheduled loop
# was found to be ~20% faster than Algorithm 5
# below. Algorithm 9 was therefore chosen for
# further optimization...
# Emit the modulo-reduction of the 256-bit product $Xhi:$Xi back to 128 bits,
# folding the result into $Xi. Uses the <<1-twisted representation, in two
# shift/XOR phases. Clobbers $T1 and $T2.
sub reduction_alg9 { # 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;
# 1st phase
&movdqa ($T2,$Xi); #
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
# 2nd phase
&movdqa ($T2,$Xi);
&psrlq ($Xi,1);
&pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
&pxor ($Xi,$Xhi) #
}
# gcm_init_clmul(Htbl, Xip): compute the <<1-twisted H and H^2 from the raw
# hash key at Xip, plus the Karatsuba "salt", and store them at Htbl.
&function_begin_B("gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# <<1 twist
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
&movdqa ($T1,$Hkey);
&psllq ($Hkey,1);
&pxor ($T3,$T3); #
&psrlq ($T1,63);
&pcmpgtd ($T3,$T2); # broadcast carry bit
&pslldq ($T1,8);
&por ($Hkey,$T1); # H<<=1
# magic reduction
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
&pshufd ($T1,$Hkey,0b01001110);
&pshufd ($T2,$Xi,0b01001110);
&pxor ($T1,$Hkey); # Karatsuba pre-processing
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&pxor ($T2,$Xi); # Karatsuba pre-processing
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&palignr ($T2,$T1,8); # low part is H.lo^H.hi
&movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt"
&ret ();
&function_end_B("gcm_init_clmul");
# gcm_gmult_clmul(Xip, Htbl): multiply Xi by H once and write it back,
# byte-swapping on load/store via the bswap mask.
&function_begin_B("gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movups ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&movups ($T2,&QWP(32,$Htbl)); # reuse precomputed Karatsuba "salt"
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
&reduction_alg9 ($Xhi,$Xi);
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("gcm_gmult_clmul");
# gcm_ghash_clmul(Xip, Htbl, inp, len): streamed GHASH with 2x aggregated
# reduction. This is the prologue plus the software-pipelined loop preamble
# that processes the first two blocks.
&function_begin("gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail")); # exactly one block
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&movdqu ($T3,&QWP(32,$Htbl));
&pxor ($Xi,$T1); # Ii+Xi
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
&lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&pclmulqdq ($T1,$T3,0x00); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&nop ();
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
# Main loop: the reduction of the previous pair of blocks (reduction_alg9,
# inlined) is interleaved at instruction level with the H*Ii+1 multiplication
# of the next pair, hiding pclmulqdq latency.
&set_label("mod_loop",32);
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
&pshufb ($Xhn,$T3);
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&pshufb ($Xn,$T3);
&pxor ($Xhi,$Xhn); # "Ii+Xi", consume early
&movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
&movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
&movdqa ($T1,$Xi);
&psllq ($Xi,5);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
&pclmulqdq ($Xn,$Hkey,0x00); #######
&movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
&pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
&psrlq ($Xi,1); #
# Semicolon added: the original "&pxor ($Xi,$Xhi) #" without a terminator
# only worked because the following "&pclmulqdq" parsed as a binary
# bitwise-and between the two calls' return values.
&pxor ($Xi,$Xhi); #
&pclmulqdq ($T1,$T3,0x00); #######
&lea ($inp,&DWP(32,$inp));
&sub ($len,0x20);
&ja (&label("mod_loop"));
# even_tail: finish the final pair of blocks left in flight by the pipelined
# loop; odd_tail: handle a single trailing block.
&set_label("even_tail");
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
&pclmulqdq ($T2,$T3,0x10); #######
&movdqa ($T3,&QWP(0,$const));
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&xorps ($Xhi,$Xhn);
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&pxor ($T1,$Xhi); #
&pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pslldq ($T1,8); #
&pxor ($Xhi,$T2);
&pxor ($Xi,$T1); #
&reduction_alg9 ($Xhi,$Xi);
&test ($len,$len);
&jnz (&label("done"));
&movups ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
} else { # Algorithm 5. Kept for reference purposes.
# Emit the modulo-reduction for the untwisted representation (Algorithm 5):
# first shifts the 256-bit product $Xhi:$Xi left by one, then reduces in two
# phases. Folds the result into $Xi; clobbers $T1, $T2 and $T3.
sub reduction_alg5 { # 19/16 times faster than Intel version
my ($Xhi,$Xi)=@_;
# <<1
&movdqa ($T1,$Xi); #
&movdqa ($T2,$Xhi);
&pslld ($Xi,1);
&pslld ($Xhi,1); #
&psrld ($T1,31);
&psrld ($T2,31); #
&movdqa ($T3,$T1);
&pslldq ($T1,4);
&psrldq ($T3,12); #
&pslldq ($T2,4);
&por ($Xhi,$T3); #
&por ($Xi,$T1);
&por ($Xhi,$T2); #
# 1st phase
&movdqa ($T1,$Xi);
&movdqa ($T2,$Xi);
&movdqa ($T3,$Xi); #
&pslld ($T1,31);
&pslld ($T2,30);
&pslld ($Xi,25); #
&pxor ($T1,$T2);
&pxor ($T1,$Xi); #
&movdqa ($T2,$T1); #
&pslldq ($T1,12);
&psrldq ($T2,4); #
&pxor ($T3,$T1);
# 2nd phase
&pxor ($Xhi,$T3); #
&movdqa ($Xi,$T3);
&movdqa ($T1,$T3);
&psrld ($Xi,1); #
&psrld ($T1,2);
&psrld ($T3,7); #
&pxor ($Xi,$T1);
&pxor ($Xhi,$T2);
&pxor ($Xi,$T3); #
&pxor ($Xi,$Xhi); #
}
# Algorithm-5 variant of gcm_init_clmul (reference only; the if(1) branch
# above is the one actually emitted): stores untwisted H and H^2.
&function_begin_B("gcm_init_clmul");
&mov ($Htbl,&wparam(0));
&mov ($Xip,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Hkey,&QWP(0,$Xip));
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
# calculate H^2
&movdqa ($Xi,$Hkey);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
&ret ();
&function_end_B("gcm_init_clmul");
# Algorithm-5 variant of gcm_gmult_clmul (reference only): single multiply
# of Xi by H with byte-swap on load/store.
&function_begin_B("gcm_gmult_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($Xn,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$Xn);
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
&reduction_alg5 ($Xhi,$Xi);
&pshufb ($Xi,$Xn);
&movdqu (&QWP(0,$Xip),$Xi);
&ret ();
&function_end_B("gcm_gmult_clmul");
# Algorithm-5 variant of gcm_ghash_clmul (reference only): streamed GHASH
# with 2x aggregation but no instruction-level interleaving of the reduction.
&function_begin("gcm_ghash_clmul");
&mov ($Xip,&wparam(0));
&mov ($Htbl,&wparam(1));
&mov ($inp,&wparam(2));
&mov ($len,&wparam(3));
# PIC-safe address of the local constants.
&call (&label("pic"));
&set_label("pic");
&blindpop ($const);
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
&movdqu ($Xi,&QWP(0,$Xip));
&movdqa ($T3,&QWP(0,$const));
&movdqu ($Hkey,&QWP(0,$Htbl));
&pshufb ($Xi,$T3);
&sub ($len,0x10);
&jz (&label("odd_tail")); # exactly one block
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp)); # i+=2
&jbe (&label("even_tail"));
&set_label("mod_loop");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
#######
&movdqa ($T3,&QWP(0,$const));
&movdqu ($T1,&QWP(0,$inp)); # Ii
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pshufb ($T1,$T3);
&pshufb ($Xn,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
&sub ($len,0x20);
&lea ($inp,&DWP(32,$inp));
&ja (&label("mod_loop"));
&set_label("even_tail");
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
&pxor ($Xhi,$Xhn);
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&test ($len,$len);
&jnz (&label("done"));
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
&set_label("odd_tail");
&movdqu ($T1,&QWP(0,$inp)); # Ii
&pshufb ($T1,$T3);
&pxor ($Xi,$T1); # Ii+Xi
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg5 ($Xhi,$Xi);
&movdqa ($T3,&QWP(0,$const));
&set_label("done");
&pshufb ($Xi,$T3);
&movdqu (&QWP(0,$Xip),$Xi);
&function_end("gcm_ghash_clmul");
}
# Shared constants: 16-byte byte-reversal mask for pshufb, followed at
# offset 16 by the 0x1c2 reduction polynomial constant.
&set_label("bswap",64);
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
}} # $sse2
}}} # !$x86only
# Emit the ident string, flush the generated assembly, and make sure a write
# failure on close is not silently ignored.
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# A question was raised about the choice of vanilla MMX. Or rather why wasn't
# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
# CPUs such as PIII, "4-bit" MMX version was observed to provide better
# performance than *corresponding* SSE2 one even on contemporary CPUs.
# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
# implementation featuring full range of lookup-table sizes, but with
# per-invocation lookup table setup. Latter means that table size is
# chosen depending on how much data is to be hashed in every given call,
# more data - larger table. Best reported result for Core2 is ~4 cycles
# per processed byte out of 64KB block. This number accounts even for
# 64KB table setup overhead. As discussed in gcm128.c we choose to be
# more conservative in respect to lookup table sizes, but how do the
# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
# on same platform. As also discussed in gcm128.c, next in line "8-bit
# Shoup's" or "4KB" method should deliver twice the performance of
# "256B" one, in other words not worse than ~6 cycles per byte. It
# should also be noted that in SSE2 case improvement can be "super-
# linear," i.e. more than twice, mostly because >>8 maps to single
# instruction on SSE2 register. This is unlike "4-bit" case when >>4
# maps to same amount of instructions in both MMX and SSE2 cases.
# Bottom line is that switch to SSE2 is considered to be justifiable
# only in case we choose to implement "8-bit" method...

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,669 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# GHASH for for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# Setup for the PowerISA v2.07 generator: select ABI-dependent mnemonics from
# the flavour, locate ppc-xlate.pl, and pipe the output through it.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
$UCMP="cmpld";
$SHRI="srdi";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
$UCMP="cmplw";
$SHRI="srwi";
} else { die "nonsense $flavour"; }
$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
# Use low-precedence "or" here: with "|| die" the || bound to the (always
# true) command string, so a failed open could never trigger the die.
open OUT,"| $^X \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Register assignments: argument block in r3-r6, working set in v0-v19.
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
# gcm_init_p8, first part: derive the twisted H from the raw hash key and
# store the reduction constant, H and H^2 (split into lo/full/hi forms) into
# the per-key table. Heredoc text is emitted verbatim as assembly.
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
.align 5
.gcm_init_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $IN,$H,$t1 # twisted H
vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
li r8,0x40
stvx_u $H, r9,r3
li r9,0x50
stvx_u $Hh,r10,r3
li r10,0x60
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $IN1,$Xl,$t1
vsldoi $H2,$IN1,$IN1,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $H2l,r8,r3 # save H^2
li r8,0x70
stvx_u $H2,r9,r3
li r9,0x80
stvx_u $H2h,r10,r3
li r10,0x90
___
{
# gcm_init_p8, second part: compute and store H^3 and H^4 (for the 4x loop)
# with two reductions carried out in parallel, then restore vrsave and return.
# $t4-$t6 alias vector temporaries whose values are dead at this point.
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vsldoi $t4,$Xm1,$zero,8
vsldoi $t5,$zero,$Xm1,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vxor $Xl1,$Xl1,$t4
vxor $Xh1,$Xh1,$t5
vsldoi $Xl,$Xl,$Xl,8
vsldoi $Xl1,$Xl1,$Xl1,8
vxor $Xl,$Xl,$t2
vxor $Xl1,$Xl1,$t6
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vpmsumd $Xl1,$Xl1,$xC2
vxor $t1,$t1,$Xh
vxor $t5,$t5,$Xh1
vxor $Xl,$Xl,$t1
vxor $Xl1,$Xl1,$t5
vsldoi $H,$Xl,$Xl,8
vsldoi $H2,$Xl1,$Xl1,8
vsldoi $Hl,$zero,$H,8
vsldoi $Hh,$H,$zero,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $Hl,r8,r3 # save H^3
li r8,0xa0
stvx_u $H,r9,r3
li r9,0xb0
stvx_u $Hh,r10,r3
li r10,0xc0
stvx_u $H2l,r8,r3 # save H^4
stvx_u $H2,r9,r3
stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
# gcm_gmult_p8 (single multiplication by H) and the scalar/2x path of
# gcm_ghash_p8; inputs >= 64 bytes branch to the 4x code emitted later.
# Heredoc text is emitted verbatim as assembly.
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
li r0,-4096
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
li r8,0x40
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
li r9,0x50
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
li r10,0x60
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
${UCMP}i $len,64
bge Lgcm_ghash_p8_4x
lvx_u $IN,0,$inp
addi $inp,$inp,16
subic. $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
beq Lshort
lvx_u $H2l,r8,$Htbl # load H^2
li r8,16
lvx_u $H2, r9,$Htbl
add r9,$inp,$len # end of input
lvx_u $H2h,r10,$Htbl
be?b Loop_2x
.align 5
Loop_2x:
lvx_u $IN1,0,$inp
le?vperm $IN1,$IN1,$IN1,$lemask
subic $len,$len,32
vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
subfe r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
add $inp,$inp,r0
vxor $Xl,$Xl,$Xl1
vxor $Xm,$Xm,$Xm1
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh1
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,r8,$inp
addi $inp,$inp,32
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
$UCMP r9,$inp
bgt Loop_2x # done yet?
cmplwi $len,0
bne Leven
Lshort:
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
Leven:
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
___
{
# 4x-aggregated GHASH inner routine for POWER8 (.gcm_ghash_p8_4x): consumes
# four input blocks per Loop_4x iteration using the precomputed powers
# H..H^4 from $Htbl, with 1..3 leftover blocks handled by Lone/Ltwo/Lthree.
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
$Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
# $IN0 aliases $IN from the 2x path above.
my $IN0=$IN;
# Packed H^2|H pair and the permute masks that build it reuse the registers
# the 2x path used for Hl/Hh/H2l/H2h (they are not live simultaneously).
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
li r10,0x60
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME-4`($sp) # save vrsave
mtspr 256,r0 # preserve all AltiVec registers
lvsl $t0,0,r8 # 0x0001..0e0f
#lvx_u $H2l,r8,$Htbl # load H^2
li r8,0x70
lvx_u $H2, r9,$Htbl
li r9,0x80
vspltisb $t1,8 # 0x0808..0808
#lvx_u $H2h,r10,$Htbl
li r10,0x90
lvx_u $H3l,r8,$Htbl # load H^3
li r8,0xa0
lvx_u $H3, r9,$Htbl
li r9,0xb0
lvx_u $H3h,r10,$Htbl
li r10,0xc0
lvx_u $H4l,r8,$Htbl # load H^4
li r8,0x10
lvx_u $H4, r9,$Htbl
li r9,0x20
lvx_u $H4h,r10,$Htbl
li r10,0x30
vsldoi $t2,$zero,$t1,8 # 0x0000..0808
vaddubm $hiperm,$t0,$t2 # 0x0001..1617
vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
$SHRI $len,$len,4 # this allows to use sign bit
# as carry
lvx_u $IN0,0,$inp # load input
lvx_u $IN1,r8,$inp
subic. $len,$len,8
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
vxor $Xh,$IN0,$Xl
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vperm $H21l,$H2,$H,$hiperm
vperm $t0,$IN2,$IN3,$loperm
vperm $H21h,$H2,$H,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
vxor $Xm2,$Xm2,$Xm1
vxor $Xl3,$Xl3,$Xl1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh3,$Xh3,$Xh1
blt Ltail_4x
Loop_4x:
lvx_u $IN0,0,$inp
lvx_u $IN1,r8,$inp
subic. $len,$len,4
lvx_u $IN2,r9,$inp
lvx_u $IN3,r10,$inp
addi $inp,$inp,0x40
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
le?vperm $IN3,$IN3,$IN3,$lemask
le?vperm $IN0,$IN0,$IN0,$lemask
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vpmsumd $Xl1,$IN1,$H3l
vpmsumd $Xm1,$IN1,$H3
vpmsumd $Xh1,$IN1,$H3h
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vxor $Xh,$Xh,$Xh3
vperm $t0,$IN2,$IN3,$loperm
vperm $t1,$IN2,$IN3,$hiperm
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
vpmsumd $Xl,$Xl,$xC2
vxor $Xl3,$Xl3,$Xl1
vxor $Xh3,$Xh3,$Xh1
vxor $Xh,$Xh,$IN0
vxor $Xm2,$Xm2,$Xm1
vxor $Xh,$Xh,$t1
vxor $Xm3,$Xm3,$Xm2
vxor $Xh,$Xh,$Xl
bge Loop_4x
Ltail_4x:
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
vxor $Xl,$Xl,$Xl3
vxor $Xm,$Xm,$Xm3
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xh,$Xh,$Xh3
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
addic. $len,$len,4
beq Ldone_4x
lvx_u $IN0,0,$inp
${UCMP}i $len,2
li $len,-4
blt Lone
lvx_u $IN1,r8,$inp
beq Ltwo
Lthree:
lvx_u $IN2,r9,$inp
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
le?vperm $IN2,$IN2,$IN2,$lemask
vxor $Xh,$IN0,$Xl
vmr $H4l,$H3l
vmr $H4, $H3
vmr $H4h,$H3h
vperm $t0,$IN1,$IN2,$loperm
vperm $t1,$IN1,$IN2,$hiperm
vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
vxor $Xm3,$Xm3,$Xm2
b Ltail_4x
.align 4
Ltwo:
le?vperm $IN0,$IN0,$IN0,$lemask
le?vperm $IN1,$IN1,$IN1,$lemask
vxor $Xh,$IN0,$Xl
vperm $t0,$zero,$IN1,$loperm
vperm $t1,$zero,$IN1,$hiperm
vsldoi $H4l,$zero,$H2,8
vmr $H4, $H2
vsldoi $H4h,$H2,$zero,8
vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+2.hi
vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
b Ltail_4x
.align 4
Lone:
le?vperm $IN0,$IN0,$IN0,$lemask
vsldoi $H4l,$zero,$H,8
vmr $H4, $H
vsldoi $H4h,$H,$zero,8
vxor $Xh,$IN0,$Xl
vxor $Xl3,$Xl3,$Xl3
vxor $Xm3,$Xm3,$Xm3
vxor $Xh3,$Xh3,$Xh3
b Ltail_4x
Ldone_4x:
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtspr 256,$vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,0,0x80,0,4,0
.long 0
___
}
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Post-process the generated text: evaluate `...` constant expressions at
# script time, then resolve the le?/be? endian-conditional prefixes — for
# the selected endianness the prefix is stripped, for the other the whole
# instruction is commented out.  Note the s///-or chains are order-sensitive:
# a line carries at most one of the two prefixes.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o;
} else {
s/le\?/#le#/o or
s/be\?//o;
}
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,883 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
# Initial version was developed in tight cooperation with Ard Biesheuvel
# of Linaro from bits-n-pieces from other assembly modules. Just like
# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
#
# July 2014
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
#
# Current performance in cycles per processed byte:
#
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
# Apple A7 0.58 0.92 5.62
# Cortex-A53 0.85 1.01 8.39
# Cortex-A57 0.73 1.17 7.61
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
#
# (*) presented for reference/comparison purposes;
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Locate the perlasm ARM translator relative to this script and pipe all
# generated output through it; $flavour selects 32- vs 64-bit translation.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# Symbolic names for the argument registers of the generated functions.
$Xi="x0"; # argument block
$Htbl="x1";
$inp="x2";
$len="x3";
$inc="x12";
{
# Vector registers used by the common (non-4x) code paths.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
$code=<<___;
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit builds get the crypto-extension arch directive; 32-bit builds
# select NEON/ARM mode instead.
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
.code 32
#undef __thumb2__
___
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input: 128-bit H - secret parameter E(K,0^128)
# output: precomputed table filled with degrees of twisted H;
# H is twisted to handle reverse bitness of GHASH;
# only few of 16 slots of Htable[16] are used;
# data is opaque to outside world (which allows to
# optimize the code independently);
#
# The emitted code twists H (multiplies it by x modulo the GHASH
# polynomial, folding the carry with the 0xc2... constant), then squares
# the twisted H to obtain H^2 and stores H, (H.lo+H.hi | H^2.lo+H^2.hi)
# Karatsuba material, and H^2 in Htable[0..2].
$code.=<<___;
.global gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[x1] @ load input H
vmov.i8 $xC2,#0xe1
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
vext.8 $IN,$t1,$t1,#8
vshr.u64 $t2,$xC2,#63
vdup.32 $t1,${t1}[1]
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
vshr.u64 $t2,$IN,#63
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
vand $t2,$t2,$t0
vshl.i64 $IN,$IN,#1
vext.8 $t2,$t2,$t2,#8
vand $t0,$t0,$t1
vorr $IN,$IN,$t2 @ H<<<=1
veor $H,$IN,$t0 @ twisted H
vext.8 $H, $H, $H, #8
vst1.64 {$H},[x0],#16 @ store Htable[0]
@ calculate H^2
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
vpmull2.p64 $Xl,$H,$H
veor $t0,$t0,$H
vpmull.p64 $Xh,$H,$H
vpmull.p64 $Xm,$t0,$t0
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $t1,$Xl,$t2
vext.8 $H2,$t1,$t1,#8 @ Karatsuba pre-processing
veor $t1,$t1,$H2
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$Hhl},[x0],#16 @ store Htable[1..2]
vst1.64 {$H2},[x0],#16 @ store Htable[1..2]
___
if ($flavour =~ /64/) {
# 64-bit only: extend the table with H^3..H^8 (plus packed Karatsuba
# pre-processed values H34k/H56k/H78k) for the 4x-aggregated ghash path.
# Each step multiplies/squares previously computed powers and performs the
# same two-phase reduction as above, interleaved two results at a time
# (X* and Y* register sets).
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
$code.=<<___;
@ calculate H^3 and H^4
vpmull2.p64 $Xl,$H, $H2
vpmull2.p64 $Yl,$H2,$H2
vpmull.p64 $Xh,$H, $H2
vpmull.p64 $Yh,$H2,$H2
vpmull.p64 $Xm,$t0,$t1
vpmull.p64 $Ym,$t1,$t1
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0, $Xl,$t2 @ H^3
veor $t1, $Yl,$t3 @ H^4
vext.8 $H3,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H4,$t1,$t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H3
veor $t1,$t1,$H4
veor $t2,$t2,$H2
vext.8 $H34k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H3-$H4},[x0],#48 @ store Htable[3..5]
@ calculate H^5 and H^6
vpmull2.p64 $Xl,$H2, $H3
vpmull2.p64 $Yl,$H3,$H3
vpmull.p64 $Xh,$H2, $H3
vpmull.p64 $Yh,$H3,$H3
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t0,$t0
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0,$Xl,$t2 @ H^5
veor $t1,$Yl,$t3 @ H^6
vext.8 $H5, $t0, $t0,#8 @ Karatsuba pre-processing
vext.8 $H6, $t1, $t1,#8
vext.8 $t2,$H2,$H2,#8
veor $t0,$t0,$H5
veor $t1,$t1,$H6
veor $t2,$t2,$H2
vext.8 $H56k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H5-$H6},[x0],#48 @ store Htable[6..8]
@ calculate H^7 and H^8
vpmull2.p64 $Xl,$H2,$H5
vpmull2.p64 $Yl,$H2,$H6
vpmull.p64 $Xh,$H2,$H5
vpmull.p64 $Yh,$H2,$H6
vpmull.p64 $Xm,$t0,$t2
vpmull.p64 $Ym,$t1,$t2
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
vext.8 $t1,$Yl,$Yh,#8
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t0
veor $t3,$Yl,$Yh
veor $Ym,$Ym,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
veor $Ym,$Ym,$t3
vpmull.p64 $t3,$Yl,$xC2
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Yh#lo,$Ym#hi
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vmov $Ym#hi,$Yl#lo
veor $Xl,$Xm,$t2
veor $Yl,$Ym,$t3
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
vext.8 $t3,$Yl,$Yl,#8
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $Yl,$Yl,$xC2
veor $t2,$t2,$Xh
veor $t3,$t3,$Yh
veor $t0,$Xl,$t2 @ H^7
veor $t1,$Yl,$t3 @ H^8
vext.8 $H7,$t0,$t0,#8 @ Karatsuba pre-processing
vext.8 $H8,$t1,$t1,#8
veor $t0,$t0,$H7
veor $t1,$t1,$H8
vext.8 $H78k,$t0,$t1,#8 @ pack Karatsuba pre-processed
vst1.64 {$H7-$H8},[x0] @ store Htable[9..11]
___
}
$code.=<<___;
ret
.size gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input: Xi - current hash value;
# Htable - table precomputed in gcm_init_v8;
# output: Xi - next hash value Xi;
#
# Single-block GHASH multiplication: Xi = (Xi * H) mod P, implemented as a
# Karatsuba 128x128->256-bit carry-less multiply followed by the two-phase
# reduction with the 0xc2... constant.
$code.=<<___;
.global gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
vld1.64 {$t1},[$Xi] @ load Xi
vmov.i8 $xC2,#0xe1
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
vext.8 $H,$H,$H,#8
vshl.u64 $xC2,$xC2,#57
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $IN,$t1,$t1,#8
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input: table precomputed in gcm_init_v8;
# current hash value Xi;
# pointer to input data;
# length of input data in bytes, but divisible by block size;
# output: next hash value Xi;
#
# The 64-bit build dispatches to the 4x-aggregated routine for inputs of
# 64 bytes or more; the 2x modulo-scheduled loop below handles the rest.
$code.=<<___;
.global gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
___
$code.=<<___ if ($flavour =~ /64/);
cmp $len,#64
b.hs .Lgcm_ghash_v8_4x
___
$code.=<<___ if ($flavour !~ /64/);
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ algorithm specification
subs $len,$len,#32 @ see if $len is 32 or larger
mov $inc,#16 @ $inc is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ $inc is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H2},[$Htbl]
vext.8 $H2,$H2,$H2,#8
cclr $inc,eq @ is it time to zero $inc?
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 $t0,$t0
vrev64.8 $Xl,$Xl
#endif
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
b.lo .Lodd_tail_v8 @ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
# [(H*Ii+1) + (H*Xi+1)] mod P =
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
# Modulo-scheduled 2x loop: the H*Ii+1 product of the next pair is started
# while the current pair's reduction completes; $inc is zeroed just before
# the final iteration so loads never step past inp[len].
$code.=<<___;
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vext.8 $In,$t1,$t1,#8
veor $IN,$IN,$Xl @ I[i]^=Xi
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $t1,$t1,$In @ Karatsuba pre-processing
vpmull2.p64 $Xhn,$H,$In
b .Loop_mod2x_v8
.align 4
.Loop_mod2x_v8:
vext.8 $t2,$IN,$IN,#8
subs $len,$len,#32 @ is there more data?
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
cclr $inc,lo @ is it time to zero $inc?
vpmull.p64 $Xmn,$Hhl,$t1
veor $t2,$t2,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
veor $Xl,$Xl,$Xln @ accumulate
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
veor $Xh,$Xh,$Xhn
cclr $inc,eq @ is it time to zero $inc?
veor $Xm,$Xm,$Xmn
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 $t0,$t0
#endif
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 $t1,$t1
#endif
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $In,$t1,$t1,#8
vext.8 $IN,$t0,$t0,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Xln,$H,$In @ H·Ii+1
veor $IN,$IN,$Xh @ accumulate $IN early
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $IN,$IN,$t2
veor $t1,$t1,$In @ Karatsuba pre-processing
veor $IN,$IN,$Xl
vpmull2.p64 $Xhn,$H,$In
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
veor $Xh,$Xh,$t2
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
adds $len,$len,#32 @ re-construct $len
veor $Xl,$Xl,$Xh @ re-construct $Xl
b.eq .Ldone_v8 @ is $len zero?
___
}
# Single-block tail (one leftover 16-byte block), then the common epilogue
# that byte-reverses and stores the updated Xi.
$code.=<<___;
.Lodd_tail_v8:
vext.8 $t2,$Xl,$Xl,#8
veor $IN,$IN,$Xl @ inp^=Xi
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
veor $t1,$t1,$IN @ Karatsuba pre-processing
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
.Ldone_v8:
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vext.8 $Xl,$Xl,$Xl,#8
vst1.64 {$Xl},[$Xi] @ write out Xi
___
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15} @ 32-bit ABI says so
___
$code.=<<___;
ret
.size gcm_ghash_v8,.-gcm_ghash_v8
___
if ($flavour =~ /64/) { # 4x subroutine
# 64-bit only: 4x-aggregated GHASH (gcm_ghash_v8_4x). Processes four input
# blocks per Loop4x iteration against H..H^4 from the extended table, with
# 1..3 leftover blocks handled by the .Lone/.Ltwo/.Lthree tails.
my ($I0,$j1,$j2,$j3,
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
$code.=<<___;
.type gcm_ghash_v8_4x,%function
.align 4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
vext.8 $H,$H,$H,#8
vext.8 $H2,$H2,$H2,#8
vmov.i8 $xC2,#0xe1
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
vext.8 $H3,$H3,$H3,#8
vext.8 $H4,$H4,$H4,#8
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
vld1.64 {$I0-$j3},[$inp],#64
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vext.8 $I3,$j3,$j3,#8
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
vpmull2.p64 $Yh,$H,$I3
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
subs $len,$len,#128
b.lo .Ltail4x
b .Loop4x
.align 4
.Loop4x:
veor $t0,$I0,$Xl
vld1.64 {$I0-$j3},[$inp],#64
vext.8 $IN,$t0,$t0,#8
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $j3,$j3
vrev64.8 $I0,$I0
#endif
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vext.8 $I3,$j3,$j3,#8
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
vext.8 $I2,$j2,$j2,#8
veor $Xm,$Xm,$Ym
vext.8 $I1,$j1,$j1,#8
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
veor $j3,$j3,$I3
veor $Xm,$Xm,$t1
vpmull2.p64 $Yh,$H,$I3
veor $Xm,$Xm,$t2
vpmull.p64 $Ym,$Hhl,$j3
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
veor $j2,$j2,$I2
vpmull2.p64 $I2,$H2,$I2
veor $Xl,$Xm,$t2
vpmull2.p64 $j2,$Hhl,$j2
veor $Yl,$Yl,$t0
veor $Yh,$Yh,$I2
veor $Ym,$Ym,$j2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
veor $j1,$j1,$I1
veor $t2,$t2,$Xh
vpmull2.p64 $I1,$H3,$I1
vpmull.p64 $j1,$H34,$j1
veor $Xl,$Xl,$t2
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
vext.8 $Xl,$Xl,$Xl,#8
veor $Ym,$Ym,$j1
subs $len,$len,#64
b.hs .Loop4x
.Ltail4x:
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H4,$IN
vpmull2.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
adds $len,$len,#64
b.eq .Ldone4x
cmp $len,#32
b.lo .Lone
b.eq .Ltwo
.Lthree:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j2},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $j2,$j2
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I2,$j2,$j2,#8
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
veor $j2,$j2,$I2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
vpmull2.p64 $Yh,$H,$I2
vpmull.p64 $Ym,$Hhl,$j2
veor $Xl,$Xl,$t2
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
veor $j1,$j1,$I1
vext.8 $Xl,$Xl,$Xl,#8
vpmull2.p64 $I1,$H2,$I1
veor $t0,$I0,$Xl
vpmull2.p64 $j1,$Hhl,$j1
vext.8 $IN,$t0,$t0,#8
veor $Yl,$Yl,$j3
veor $Yh,$Yh,$I1
veor $Ym,$Ym,$j1
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H3,$IN
vpmull.p64 $Xm,$H34,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Ltwo:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0-$j1},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $j1,$j1
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
vext.8 $I1,$j1,$j1,#8
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
veor $j1,$j1,$I1
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull2.p64 $Yh,$H,$I1
vpmull.p64 $Ym,$Hhl,$j1
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H2,$IN
vpmull2.p64 $Xm,$Hhl,$t0
veor $Xl,$Xl,$Yl
veor $Xh,$Xh,$Yh
veor $Xm,$Xm,$Ym
b .Ldone4x
.align 4
.Lone:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
vld1.64 {$I0},[$inp]
veor $Xm,$Xm,$t2
#ifndef __ARMEB__
vrev64.8 $I0,$I0
#endif
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
veor $t0,$I0,$Xl
vext.8 $IN,$t0,$t0,#8
vpmull.p64 $Xl,$H,$IN
veor $t0,$t0,$IN
vpmull2.p64 $Xh,$H,$IN
vpmull.p64 $Xm,$Hhl,$t0
.Ldone4x:
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
veor $t2,$Xl,$Xh
veor $Xm,$Xm,$t1
veor $Xm,$Xm,$t2
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
veor $Xl,$Xm,$t2
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
vpmull.p64 $Xl,$Xl,$xC2
veor $t2,$t2,$Xh
veor $Xl,$Xl,$t2
vext.8 $Xl,$Xl,$Xl,#8
#ifndef __ARMEB__
vrev64.8 $Xl,$Xl
#endif
vst1.64 {$Xl},[$Xi] @ write out Xi
ret
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
# Post-process $code: the module is written in (extended) 32-bit NEON
# syntax; the 64-bit branch mechanically rewrites it into AArch64 syntax.
if ($flavour =~ /64/) { ######## 64-bit code
# Translate a 32-bit "vmov qN#lo/hi,qM#lo/hi" operand pair into the AArch64
# "ins vD.d[i],vS.d[j]" form.  q0-q7 keep their number; q8 and up map to
# v16 and up (matching this module's 32->64-bit register assignment).
# Returns the rewritten instruction, or false if the operands don't match.
sub unvmov {
    my ($operands) = @_;
    if ($operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o) {
        my ($qd, $dhalf, $qs, $shalf) = ($1, $2, $3, $4);
        my $vd = ($qd < 8) ? $qd : $qd + 8;
        my $vs = ($qs < 8) ? $qs : $qs + 8;
        return sprintf "ins v%d.d[%d],v%d.d[%d]",
                       $vd, ($dhalf eq "lo") ? 0 : 1,
                       $vs, ($shalf eq "lo") ? 0 : 1;
    }
    return "";
}
# 64-bit output: rewrite each generated line from legacy NEON mnemonics to
# AArch64 ones.  The s///-or chain applies at most one mnemonic rewrite per
# line, then register names, commentary style and type suffixes are fixed
# up; order of the substitutions is significant.
foreach(split("\n",$code)) {
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
s/vmov\.i8/movi/o or # fix up legacy mnemonics
s/vmov\s+(.*)/unvmov($1)/geo or
s/vext\.8/ext/o or
s/vshr\.s/sshr\.s/o or
s/vshr/ushr/o or
s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
s/@\s/\/\//o; # old->new style commentary
# fix up remaining legacy suffixes
s/\.[ui]?8(\s)/$1/o;
s/\.[uis]?32//o and s/\.16b/\.4s/go;
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
# Switch preprocessor checks to aarch64 versions.
s/__ARME([BL])__/__AARCH64E$1__/go;
print $_,"\n";
}
} else { ######## 32-bit code
# Translate AArch64-style "vdup.32 qN,qM[lane]" operands into the 32-bit
# NEON form "vdup.32 qN,dD[l]": lane i of q-register M lives in
# d-register 2*M + i/2 at half-word index i%2.
# Returns the rewritten instruction, or false if the operands don't match.
sub unvdup32 {
    my ($operands) = @_;
    if ($operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o) {
        my ($qd, $qs, $lane) = ($1, $2, $3);
        return sprintf "vdup.32 q%d,d%d[%d]",
                       $qd, 2 * $qs + ($lane >> 1), $lane & 1;
    }
    return "";
}
# Hand-assemble pmull/pmull2.p64 for 32-bit builds: pre-ARMv8 assemblers
# don't know the mnemonic, so emit the raw instruction encoding as .byte
# data (little-endian, as ARMv7 instructions are always encoded), keeping
# the intended mnemonic as trailing commentary.
sub unvpmullp64 {
    my ($mnemonic, $operands) = @_;
    if ($operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
        my ($qd, $qn, $qm) = ($1, $2, $3);
        # Base opcode plus the split register fields (low 3 bits and the
        # "high bank" bit placed separately for each operand).
        my $word = 0xf2a00e00;
        $word |= ($qd & 7) << 13 | ($qd & 8) << 19;
        $word |= ($qn & 7) << 17 | ($qn & 8) << 4;
        $word |= ($qm & 7) << 1  | ($qm & 8) << 2;
        $word |= 0x00010001 if ($mnemonic =~ /2/);   # pmull2 variant
        return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                       map(($word >> $_) & 0xff, (0, 8, 16, 24)),
                       $mnemonic, $operands;
    }
    return "";
}
# 32-bit output: map AArch64-style register names and commentary back to
# the classic ARM/NEON forms, hand-assemble pmull via unvpmullp64(), and
# rewrite post-indexed addressing and branch syntax.  As above, the
# substitution order in the or-chain is significant.
foreach(split("\n",$code)) {
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
s/\/\/\s?/@ /o; # new->old style commentary
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT"; # enforce flush

View File

@@ -0,0 +1,115 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <assert.h>
#include <string.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// CBC-encrypt |len| bytes from |in| to |out| with |block| under |key|,
// chaining from and updating the IV in |ivec|.  A trailing partial block
// (len not a multiple of 16) is padded with bytes of the chaining value
// before encryption, matching the original OpenSSL semantics.
void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16],
                           block128_f block) {
  assert(key != NULL && ivec != NULL);
  if (len == 0) {
    // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
    return;
  }
  assert(in != NULL && out != NULL);

  const uint8_t *chain = ivec;
  // Full blocks: out[i] = E(in[i] ^ chain); ciphertext becomes the next
  // chaining value.
  for (; len >= 16; in += 16, out += 16, len -= 16) {
    CRYPTO_xor16(out, in, chain);
    (*block)(out, out, key);
    chain = out;
  }
  // Trailing partial block: xor the available bytes, carry the chaining
  // value through for the rest, then encrypt in place.
  if (len > 0) {
    size_t i = 0;
    for (; i < len && i < 16; ++i) {
      out[i] = in[i] ^ chain[i];
    }
    for (; i < 16; ++i) {
      out[i] = chain[i];
    }
    (*block)(out, out, key);
    chain = out;
  }
  OPENSSL_memcpy(ivec, chain, 16);
}
// CBC-decrypt |len| bytes from |in| to |out| with |block| under |key|,
// using and updating the IV in |ivec|.  |in| and |out| may alias only if
// |in| is not behind |out| (asserted below); two code paths are used
// depending on how far apart the buffers are.
void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block) {
assert(key != NULL && ivec != NULL);
if (len == 0) {
// Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C.
return;
}
assert(in != NULL && out != NULL);
const uintptr_t inptr = (uintptr_t) in;
const uintptr_t outptr = (uintptr_t) out;
// If |in| and |out| alias, |in| must be ahead.
assert(inptr >= outptr || inptr + len <= outptr);
size_t n;
alignas(16) uint8_t tmp[16];
if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) {
// If |out| is at least two blocks behind |in| or completely disjoint, there
// is no need to decrypt to a temporary block.
// Decrypt directly into |out|; the previous ciphertext block (still
// intact in |in|) serves as the next xor mask.
const uint8_t *iv = ivec;
while (len >= 16) {
(*block)(in, out, key);
CRYPTO_xor16(out, out, iv);
iv = in;
len -= 16;
in += 16;
out += 16;
}
OPENSSL_memcpy(ivec, iv, 16);
} else {
// Overlapping (or nearly overlapping) buffers: decrypt into |tmp| and
// save each ciphertext block into |ivec| word-by-word before the
// output write can clobber it.
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
block_cannot_be_evenly_divided_into_crypto_word_t)
while (len >= 16) {
(*block)(in, tmp, key);
for (n = 0; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t c = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(tmp + n) ^
CRYPTO_load_word_le(ivec + n));
CRYPTO_store_word_le(ivec + n, c);
}
len -= 16;
in += 16;
out += 16;
}
}
// Trailing partial block: only the first |len| bytes are produced; the
// remaining ciphertext bytes are still folded into |ivec|.
while (len) {
uint8_t c;
(*block)(in, tmp, key);
for (n = 0; n < 16 && n < len; ++n) {
c = in[n];
out[n] = tmp[n] ^ ivec[n];
ivec[n] = c;
}
if (len <= 16) {
for (; n < 16; ++n) {
ivec[n] = in[n];
}
break;
}
len -= 16;
in += 16;
out += 16;
}
}

View File

@@ -0,0 +1,158 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
cfb_block_cannot_be_divided_into_size_t)
// CFB-128 encrypt (|enc| non-zero) or decrypt (|enc| zero) |len| bytes.
// |*num| is the offset into the current keystream block (0..15) carried
// between calls; |ivec| holds the shift register.  Encryption feeds
// ciphertext back into |ivec|; decryption feeds the incoming ciphertext.
void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
int enc, block128_f block) {
assert(in && out && key && ivec && num);
unsigned n = *num;
if (enc) {
// Drain leftover keystream bytes from a previous partial block.
while (n && len) {
*(out++) = ivec[n] ^= *(in++);
--len;
n = (n + 1) % 16;
}
// Whole blocks, word at a time; |n| is 0 on entry to each iteration.
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t tmp =
CRYPTO_load_word_le(ivec + n) ^ CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(ivec + n, tmp);
CRYPTO_store_word_le(out + n, tmp);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
// Trailing partial block: generate keystream, consume |len| bytes of it.
if (len) {
(*block)(ivec, ivec, key);
while (len--) {
out[n] = ivec[n] ^= in[n];
++n;
}
}
*num = n;
return;
} else {
// Decrypt path mirrors the above, but the feedback value is the raw
// ciphertext byte/word, saved before the xor in case in == out.
while (n && len) {
uint8_t c;
*(out++) = ivec[n] ^ (c = *(in++));
ivec[n] = c;
--len;
n = (n + 1) % 16;
}
while (len >= 16) {
(*block)(ivec, ivec, key);
for (; n < 16; n += sizeof(crypto_word_t)) {
crypto_word_t t = CRYPTO_load_word_le(in + n);
CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(ivec + n) ^ t);
CRYPTO_store_word_le(ivec + n, t);
}
len -= 16;
out += 16;
in += 16;
n = 0;
}
if (len) {
(*block)(ivec, ivec, key);
while (len--) {
uint8_t c;
out[n] = ivec[n] ^ (c = in[n]);
ivec[n] = c;
++n;
}
}
*num = n;
return;
}
}
/* This expects a single block of size nbits for both in and out. Note that
it corrupts any extra bits in the last byte of out */
static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits,
const AES_KEY *key, uint8_t ivec[16], int enc,
block128_f block) {
int n, rem, num;
uint8_t ovec[16 * 2 + 1]; /* +1 because we dereference (but don't use) one
byte off the end */
if (nbits <= 0 || nbits > 128) {
return;
}
// fill in the first half of the new IV with the current IV
OPENSSL_memcpy(ovec, ivec, 16);
// construct the new IV
(*block)(ivec, ivec, key);
// number of whole/partial bytes covered by nbits
num = (nbits + 7) / 8;
if (enc) {
// encrypt the input; ciphertext bytes also go into ovec[16..] so they
// can be shifted into the IV below
for (n = 0; n < num; ++n) {
out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
}
} else {
// decrypt the input; the raw ciphertext is what feeds back
for (n = 0; n < num; ++n) {
out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
}
}
// shift ovec left...
// ... by nbits: whole bytes via memcpy when aligned, otherwise stitch
// each result byte from two adjacent source bytes (this is where the
// one-past-the-end dereference of ovec can occur).
rem = nbits % 8;
num = nbits / 8;
if (rem == 0) {
OPENSSL_memcpy(ivec, ovec + num, 16);
} else {
for (n = 0; n < 16; ++n) {
ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
}
}
// it is not necessary to cleanse ovec, since the IV is not secret
}
// CFB-1 mode: processes |bits| individual bits, packed MS-bit-first in the
// input.  Each bit is moved into the top of a one-byte scratch block, run
// through cfbr_encrypt_block(), and the resulting top bit is written back
// into the corresponding bit position of |out|.  |*num| must be zero.
void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits,
                             const AES_KEY *key, uint8_t ivec[16],
                             unsigned *num, int enc, block128_f block) {
  assert(in && out && key && ivec && num);
  assert(*num == 0);

  for (size_t i = 0; i < bits; ++i) {
    const unsigned shift = (unsigned)(7 - i % 8);
    uint8_t src[1], dst[1];
    // Bit i of the input, promoted to the MSB of a scratch byte.
    src[0] = (in[i / 8] & (1 << shift)) ? 0x80 : 0;
    cfbr_encrypt_block(src, dst, 1, key, ivec, enc, block);
    // Clear bit i of the output and replace it with the processed bit.
    out[i / 8] = (uint8_t)((out[i / 8] & ~(1u << shift)) |
                           ((dst[0] & 0x80) >> (unsigned)(i % 8)));
  }
}
// CFB-8 mode: encrypt/decrypt |length| bytes one byte at a time, running
// each byte through cfbr_encrypt_block() with an 8-bit feedback width.
// |*num| must be zero on entry.
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
                             size_t length, const AES_KEY *key,
                             unsigned char ivec[16], unsigned *num, int enc,
                             block128_f block) {
  assert(in && out && key && ivec && num);
  assert(*num == 0);

  for (size_t i = 0; i < length; ++i) {
    cfbr_encrypt_block(in + i, out + i, 8, key, ivec, enc, block);
  }
}

View File

@@ -0,0 +1,153 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
#include "../../internal.h"
// NOTE: the IV/counter CTR mode is big-endian. The code itself
// is endian-neutral.
// Increment the 128-bit big-endian counter in |counter| by one, with carry
// propagation from the last byte toward the first (wraps on overflow).
static void ctr128_inc(uint8_t *counter) {
  uint32_t carry = 1;
  for (int i = 15; i >= 0; --i) {
    carry += counter[i];
    counter[i] = (uint8_t)carry;
    carry >>= 8;
  }
}
OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
ctr_block_cannot_be_divided_into_crypto_word_t)
// The input encrypted as though 128bit counter mode is being used. The extra
// state information to record how much of the 128bit block we have used is
// contained in *num, and the encrypted counter is kept in ecount_buf. Both
// *num and ecount_buf must be initialised with zeros before the first call to
// CRYPTO_ctr128_encrypt().
//
// This algorithm assumes that the counter is in the x lower bits of the IV
// (ivec), and that the application has full control over overflow and the rest
// of the IV. This implementation takes NO responsibility for checking that
// the counter doesn't overflow into the rest of the IV when incremented.
void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16],
                           uint8_t ecount_buf[16], unsigned int *num,
                           block128_f block) {
  unsigned int n;
  assert(key && ecount_buf && num);
  assert(len == 0 || (in && out));
  assert(*num < 16);
  n = *num;
  // Consume any leftover keystream bytes from a previous partial block.
  while (n && len) {
    *(out++) = *(in++) ^ ecount_buf[n];
    --len;
    n = (n + 1) % 16;
  }
  // Full 16-byte blocks: encrypt the counter, bump it, XOR into the output.
  while (len >= 16) {
    (*block)(ivec, ecount_buf, key);
    ctr128_inc(ivec);
    CRYPTO_xor16(out, in, ecount_buf);
    len -= 16;
    out += 16;
    in += 16;
    n = 0;
  }
  // Trailing partial block: generate one more keystream block and use only
  // the first |len| bytes; the remainder stays in |ecount_buf| for the next
  // call (tracked via |*num|).
  if (len) {
    (*block)(ivec, ecount_buf, key);
    ctr128_inc(ivec);
    while (len--) {
      out[n] = in[n] ^ ecount_buf[n];
      ++n;
    }
  }
  *num = n;
}
// Increments the upper 96 bits (bytes 0..11, big-endian) of a 128-bit
// counter by one, leaving the low 32 bits untouched. Used to propagate a
// carry out of the 32-bit counter word handled by ctr32-style kernels.
static void ctr96_inc(uint8_t *counter) {
  uint32_t carry = 1;
  for (int i = 11; i >= 0; --i) {
    carry += counter[i];
    counter[i] = (uint8_t)carry;
    carry >>= 8;
  }
}
// CTR mode using a |ctr128_f| bulk kernel that only increments the low
// 32 bits of the counter. This wrapper detects 32-bit counter overflow,
// splits the input at the overflow point, and carries into the upper 96
// bits itself.
void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
                                 const AES_KEY *key, uint8_t ivec[16],
                                 uint8_t ecount_buf[16], unsigned int *num,
                                 ctr128_f func) {
  unsigned int n, ctr32;
  assert(key && ecount_buf && num);
  assert(len == 0 || (in && out));
  assert(*num < 16);
  n = *num;
  // Consume leftover keystream bytes from a previous partial block.
  while (n && len) {
    *(out++) = *(in++) ^ ecount_buf[n];
    --len;
    n = (n + 1) % 16;
  }
  ctr32 = CRYPTO_load_u32_be(ivec + 12);
  while (len >= 16) {
    size_t blocks = len / 16;
    // 1<<28 is just a not-so-small yet not-so-large number...
    // Below condition is practically never met, but it has to
    // be checked for code correctness.
    if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) {
      blocks = (1U << 28);
    }
    // As (*func) operates on 32-bit counter, caller
    // has to handle overflow. 'if' below detects the
    // overflow, which is then handled by limiting the
    // amount of blocks to the exact overflow point...
    ctr32 += (uint32_t)blocks;
    if (ctr32 < blocks) {
      blocks -= ctr32;
      ctr32 = 0;
    }
    (*func)(in, out, blocks, key, ivec);
    // (*func) does not update ivec, caller does:
    CRYPTO_store_u32_be(ivec + 12, ctr32);
    // ... overflow was detected, propagate carry.
    if (ctr32 == 0) {
      ctr96_inc(ivec);
    }
    blocks *= 16;
    len -= blocks;
    out += blocks;
    in += blocks;
  }
  // Trailing partial block: run the kernel once on a zeroed buffer to
  // produce a block of raw keystream, then XOR byte-by-byte.
  if (len) {
    OPENSSL_memset(ecount_buf, 0, 16);
    (*func)(ecount_buf, ecount_buf, 1, key, ivec);
    ++ctr32;
    CRYPTO_store_u32_be(ivec + 12, ctr32);
    if (ctr32 == 0) {
      ctr96_inc(ivec);
    }
    while (len--) {
      out[n] = in[n] ^ ecount_buf[n];
      ++n;
    }
  }
  *num = n;
}

View File

@@ -0,0 +1,850 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/base.h>
#include <assert.h>
#include <string.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
// bits of a |size_t|.
static const size_t kSizeTWithoutLower4Bits = (size_t) -16;
#define GCM_MUL(ctx, Xi) gcm_gmult_nohw((ctx)->Xi, (ctx)->gcm_key.Htable)
#define GHASH(ctx, in, len) \
gcm_ghash_nohw((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
// GHASH_CHUNK is "stride parameter" missioned to mitigate cache
// trashing effect. In other words idea is to hash data while it's
// still in L1 cache after encryption pass...
#define GHASH_CHUNK (3 * 1024)
#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86)
// Shifts the 128-bit field element |V| right by one bit and, when a bit
// falls off the low end, folds in the GHASH reduction constant (top byte
// 0xe1). The two branches are equivalent; the 32-bit variant exists so the
// mask computation stays within native word size on 32-bit targets — the
// compile-time sizeof() test selects one branch with no runtime cost.
static inline void gcm_reduce_1bit(u128 *V) {
  if (sizeof(crypto_word_t) == 8) {
    // 0 - (V->hi & 1) is an all-ones mask iff the dropped bit was set.
    uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->hi & 1));
    V->hi = (V->lo << 63) | (V->hi >> 1);
    V->lo = (V->lo >> 1) ^ T;
  } else {
    uint32_t T = 0xe1000000U & (0 - (uint32_t)(V->hi & 1));
    V->hi = (V->lo << 63) | (V->hi >> 1);
    V->lo = (V->lo >> 1) ^ ((uint64_t)T << 32);
  }
}
// Builds the 16-entry multiple table used by the SSSE3 GHASH implementation:
// Htable[i] holds i*H for i in 0..15, computed from H, H/2, H/4, H/8 (the
// gcm_reduce_1bit halvings) and XOR combinations. The table is then
// transposed byte-wise into the layout the assembly expects.
void gcm_init_ssse3(u128 Htable[16], const uint64_t H[2]) {
  Htable[0].hi = 0;
  Htable[0].lo = 0;
  u128 V;
  V.hi = H[1];
  V.lo = H[0];
  // Powers of two entries: 8*H, 4*H, 2*H, 1*H (each halving divides by x).
  Htable[8] = V;
  gcm_reduce_1bit(&V);
  Htable[4] = V;
  gcm_reduce_1bit(&V);
  Htable[2] = V;
  gcm_reduce_1bit(&V);
  Htable[1] = V;
  // Remaining entries are XOR (GF(2) addition) of the power-of-two entries.
  Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
  V = Htable[4];
  Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
  Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
  Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
  V = Htable[8];
  Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
  Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
  Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
  Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
  Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
  Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
  Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
  // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i]
  // contains the i'th byte of j*H for all j.
  uint8_t *Hbytes = (uint8_t *)Htable;
  for (int i = 0; i < 16; i++) {
    for (int j = 0; j < i; j++) {
      uint8_t tmp = Hbytes[16*i + j];
      Hbytes[16*i + j] = Hbytes[16*j + i];
      Hbytes[16*j + i] = tmp;
    }
  }
}
#endif // GHASH_ASM_X86_64 || GHASH_ASM_X86
#ifdef GCM_FUNCREF
#undef GCM_MUL
#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi, (ctx)->gcm_key.Htable)
#undef GHASH
#define GHASH(ctx, in, len) \
(*gcm_ghash_p)((ctx)->Xi, (ctx)->gcm_key.Htable, in, len)
#endif // GCM_FUNCREF
#if defined(HW_GCM) && defined(OPENSSL_X86_64)
// x86-64 bulk AES-GCM encrypt: thin adapter giving |aesni_gcm_encrypt| the
// common hw_gcm_* signature. Returns the number of bytes processed (the
// assembly routine may process fewer than |len|; callers handle the rest).
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi);
}
// x86-64 bulk AES-GCM decrypt counterpart of |hw_gcm_encrypt| above; same
// adapter pattern and same partial-processing contract.
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi);
}
#endif // HW_GCM && X86_64
#if defined(HW_GCM) && defined(OPENSSL_AARCH64)
// AArch64 bulk AES-GCM encrypt. Processes only whole 16-byte blocks
// (|len| rounded down) and returns the number of bytes consumed; the caller
// handles any tail. Dispatches between the 8x-unrolled kernels (per AES key
// size) and the generic 4x kernel. NOTE(review): the kernels are given
// |len_blocks * 8| — apparently a length in bits — confirm against the
// assembly interface before changing.
static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (!len_blocks) {
    return 0;
  }
  // The 8x-unrolled assembly implementation starts outperforming
  // the 4x-unrolled one starting around input length of 256 bytes
  // in the case of the EVP API.
  // In the case of the AEAD API, it can be used for all input lengths
  // but we are not identifying which API calls the code below.
  if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) {
    switch(key->rounds) {
      case 10:
        aesv8_gcm_8x_enc_128(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 12:
        aesv8_gcm_8x_enc_192(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 14:
        aesv8_gcm_8x_enc_256(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      default:
        // The subsequent logic after returning can process
        // the input or return an error.
        return 0;
        break;
    }
  } else {
    aes_gcm_enc_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable);
  }
  return len_blocks;
}
// AArch64 bulk AES-GCM decrypt; mirrors |hw_gcm_encrypt| above: whole
// blocks only, returns bytes consumed, 8x kernels for len >= 256 when the
// CPU supports them, otherwise the 4x kernel. See the encrypt-side note
// about the |len_blocks * 8| (bit-length) argument.
static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
                             const AES_KEY *key, uint8_t ivec[16],
                             uint8_t Xi[16], const u128 Htable[16]) {
  const size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (!len_blocks) {
    return 0;
  }
  // The 8x-unrolled assembly implementation starts outperforming
  // the 4x-unrolled one starting around input length of 256 bytes
  // in the case of the EVP API.
  // In the case of the AEAD API, it can be used for all input lengths
  // but we are not identifying which API calls the code below.
  if (CRYPTO_is_ARMv8_GCM_8x_capable() && len >= 256) {
    switch(key->rounds) {
      case 10:
        aesv8_gcm_8x_dec_128(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 12:
        aesv8_gcm_8x_dec_192(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      case 14:
        aesv8_gcm_8x_dec_256(in, len_blocks * 8, out, Xi, ivec, key, Htable);
        break;
      default:
        // The subsequent logic after returning can process
        // the input or return an error.
        return 0;
        break;
    }
  } else {
    aes_gcm_dec_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable);
  }
  return len_blocks;
}
#endif // HW_GCM && AARCH64
// Trampolines for GCM function pointers to avoid delocator issues with adr
// on AArch64. Without these wrappers, the function pointer calculations
// may require PC-relative offsets outside the addressable range.
#if defined(GHASH_ASM_ARM)
// Trampoline for |gcm_gmult_v8| (see the delocator note above).
static inline void gcm_gmult_v8_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
  gcm_gmult_v8(Xi, Htable);
}
// Trampoline for |gcm_ghash_v8|.
static inline void gcm_ghash_v8_wrapper(uint8_t Xi[16], const u128 Htable[16],
                                        const uint8_t *inp, size_t len) {
  gcm_ghash_v8(Xi, Htable, inp, len);
}
// Trampoline for |gcm_gmult_neon|.
static inline void gcm_gmult_neon_wrapper(uint8_t Xi[16], const u128 Htable[16]) {
  gcm_gmult_neon(Xi, Htable);
}
// Trampoline for |gcm_ghash_neon|.
static inline void gcm_ghash_neon_wrapper(uint8_t Xi[16], const u128 Htable[16],
                                          const uint8_t *inp, size_t len) {
  gcm_ghash_neon(Xi, Htable, inp, len);
}
#endif
// Selects and initializes the best available GHASH implementation for this
// CPU. Derives H = E(K, 0^128) from |gcm_key|, fills |out_table| via the
// chosen init routine, and sets |*out_mult|/|*out_hash| to the matching
// gmult/ghash functions. |*out_is_avx| is set to 1 only for the x86-64
// AVX/AVX-512 paths. Falls through to the portable no-asm implementation
// when no accelerated path applies.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
                       u128 out_table[16], int *out_is_avx,
                       const uint8_t gcm_key[16]) {
  *out_is_avx = 0;
  // H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values.
  uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key),
                   CRYPTO_load_u64_be(gcm_key + 8)};
#if defined(GHASH_ASM_X86_64)
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fastest x86-64 path: VAES/VPCLMULQDQ/AVX-512.
  if (crypto_gcm_avx512_enabled()) {
    gcm_init_avx512(out_table, H);
    *out_mult = gcm_gmult_avx512;
    *out_hash = gcm_ghash_avx512;
    *out_is_avx = 1;
    return;
  }
#endif
  if (crypto_gcm_clmul_enabled()) {
    // CLMUL with AVX+MOVBE gets the AVX-optimized variant.
    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
      gcm_init_avx(out_table, H);
      *out_mult = gcm_gmult_avx;
      *out_hash = gcm_ghash_avx;
      *out_is_avx = 1;
      return;
    }
    gcm_init_clmul(out_table, H);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
  if (CRYPTO_is_SSSE3_capable()) {
    gcm_init_ssse3(out_table, H);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_X86)
  if (crypto_gcm_clmul_enabled()) {
    gcm_init_clmul(out_table, H);
    *out_mult = gcm_gmult_clmul;
    *out_hash = gcm_ghash_clmul;
    return;
  }
  if (CRYPTO_is_SSSE3_capable()) {
    gcm_init_ssse3(out_table, H);
    *out_mult = gcm_gmult_ssse3;
    *out_hash = gcm_ghash_ssse3;
    return;
  }
#elif defined(GHASH_ASM_ARM)
  // ARM paths go through the wrapper trampolines (see note above them).
  if (gcm_pmull_capable()) {
    gcm_init_v8(out_table, H);
    *out_mult = gcm_gmult_v8_wrapper;
    *out_hash = gcm_ghash_v8_wrapper;
    return;
  }
  if (gcm_neon_capable()) {
    gcm_init_neon(out_table, H);
    *out_mult = gcm_gmult_neon_wrapper;
    *out_hash = gcm_ghash_neon_wrapper;
    return;
  }
#elif defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    gcm_init_p8(out_table, H);
    *out_mult = gcm_gmult_p8;
    *out_hash = gcm_ghash_p8;
    return;
  }
#endif
  // Portable constant-time fallback.
  gcm_init_nohw(out_table, H);
  *out_mult = gcm_gmult_nohw;
  *out_hash = gcm_ghash_nohw;
}
// Initializes |gcm_key| for use with |aes_key|: computes the GHASH key
// H = E(K, 0^128) by encrypting an all-zero block, then picks the GHASH
// implementation via |CRYPTO_ghash_init|. |use_hw_gcm_crypt| is enabled
// only when the block function is hardware AES and the platform's fused
// AES-GCM path is available (PMULL on AArch64, AVX on other builds).
void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key, const AES_KEY *aes_key,
                            block128_f block, int block_is_hwaes) {
  OPENSSL_memset(gcm_key, 0, sizeof(*gcm_key));
  gcm_key->block = block;
  uint8_t ghash_key[16];
  OPENSSL_memset(ghash_key, 0, sizeof(ghash_key));
  (*block)(ghash_key, ghash_key, aes_key);
  int is_avx;
  CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable, &is_avx,
                    ghash_key);
#if defined(OPENSSL_AARCH64) && defined(GHASH_ASM_ARM)
  gcm_key->use_hw_gcm_crypt = (gcm_pmull_capable() && block_is_hwaes) ? 1 : 0;
#else
  gcm_key->use_hw_gcm_crypt = (is_avx && block_is_hwaes) ? 1 : 0;
#endif
}
// Resets the GCM context and absorbs the IV. A 12-byte IV is used directly
// as Y0 with a counter of 1 (the standard GCM fast path); any other length
// is GHASHed together with a 64-bit bit-length block to derive Y0. Also
// precomputes EK0 = E(K, Y0), needed for the final tag, and leaves the
// counter pre-incremented for the first data block.
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
                         const uint8_t *iv, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif
  OPENSSL_memset(&ctx->Yi, 0, sizeof(ctx->Yi));
  OPENSSL_memset(&ctx->Xi, 0, sizeof(ctx->Xi));
  ctx->len.aad = 0;
  ctx->len.msg = 0;
  ctx->ares = 0;
  ctx->mres = 0;
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // AVX-512 builds have a dedicated IV setup routine.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled()) {
    gcm_setiv_avx512(key, ctx, iv, len);
    return;
  }
#endif
  uint32_t ctr;
  if (len == 12) {
    // 96-bit IV: Y0 = IV || 0^31 || 1.
    OPENSSL_memcpy(ctx->Yi, iv, 12);
    ctx->Yi[15] = 1;
    ctr = 1;
  } else {
    // Other lengths: Y0 = GHASH(IV padded to blocks || 0^64 || len(IV) in bits).
    uint64_t len0 = len;
    while (len >= 16) {
      CRYPTO_xor16(ctx->Yi, ctx->Yi, iv);
      GCM_MUL(ctx, Yi);
      iv += 16;
      len -= 16;
    }
    if (len) {
      for (size_t i = 0; i < len; ++i) {
        ctx->Yi[i] ^= iv[i];
      }
      GCM_MUL(ctx, Yi);
    }
    uint8_t len_block[16];
    OPENSSL_memset(len_block, 0, 8);
    CRYPTO_store_u64_be(len_block + 8, len0 << 3);
    CRYPTO_xor16(ctx->Yi, ctx->Yi, len_block);
    GCM_MUL(ctx, Yi);
    ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  }
  // EK0 is XORed into the tag at finish time.
  (*ctx->gcm_key.block)(ctx->Yi, ctx->EK0, key);
  ++ctr;
  CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
}
// Absorbs additional authenticated data into the GHASH state. May be called
// multiple times, but only before any message data. Returns 0 if message
// processing has already begun or the total AAD length exceeds 2^61 bytes,
// 1 on success. Partial-block residue is tracked in |ctx->ares|.
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  if (ctx->len.msg != 0) {
    // The caller must have finished the AAD before providing other input.
    return 0;
  }
  // Enforce GCM's AAD length bound and reject 64-bit overflow.
  uint64_t alen = ctx->len.aad + len;
  if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) {
    return 0;
  }
  ctx->len.aad = alen;
  unsigned n = ctx->ares;
  if (n) {
    // Fill out the partial block left by a previous call.
    while (n && len) {
      ctx->Xi[n] ^= *(aad++);
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      // Still mid-block; nothing more to hash yet.
      ctx->ares = n;
      return 1;
    }
  }
  // Process a whole number of blocks.
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, aad, len_blocks);
    aad += len_blocks;
    len -= len_blocks;
  }
  // Process the remainder.
  if (len != 0) {
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // .../aws-lc/crypto/fipsmodule/modes/gcm.c:428:18: error: writing 1 byte into
    // a region of size 0 [-Werror=stringop-overflow=]
    // 428 |       ctx->Xi[i] ^= aad[i];
    //     |       ~~~~~~~~~~~^~~~~~~~~
    if (len > 16) {
      abort();
      return 0;
    }
    n = (unsigned int)len;
    for (size_t i = 0; i < len; ++i) {
      ctx->Xi[i] ^= aad[i];
    }
  }
  ctx->ares = n;
  return 1;
}
// Encrypts |len| bytes in CTR mode and folds the ciphertext into the GHASH
// state, one block at a time via |ctx->gcm_key.block| (no stream/ctr32
// kernel). Returns 0 if the total message length would exceed GCM's
// 2^36 - 32 byte bound, 1 otherwise. Partial-block keystream is carried in
// |ctx->EKi| and indexed by |ctx->mres|.
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const uint8_t *in, uint8_t *out, size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream from the previous call's partial block.
    while (n && len) {
      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Encrypt in GHASH_CHUNK strides, hashing each stride while it is still
  // warm in cache (see the GHASH_CHUNK comment above).
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;
    while (j) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      j -= 16;
    }
    GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
    len -= GHASH_CHUNK;
  }
  // Remaining whole blocks, then a single GHASH over all of them.
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    while (len >= 16) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      len -= 16;
    }
    GHASH(ctx, out - len_blocks, len_blocks);
  }
  // Trailing partial block: keystream kept in EKi for the next call.
  if (len) {
    (*block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    while (len--) {
      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Decrypt counterpart of |CRYPTO_gcm128_encrypt|: the *ciphertext* is
// folded into GHASH (before/independent of the CTR XOR), so the hash is
// taken over the data as received. Same length bound and residue handling.
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key,
                          const unsigned char *in, unsigned char *out,
                          size_t len) {
  block128_f block = ctx->gcm_key.block;
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream; hash the ciphertext byte before decrypting.
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi[n];
      ctx->Xi[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Chunked processing: hash the incoming ciphertext first, then decrypt.
  while (len >= GHASH_CHUNK) {
    size_t j = GHASH_CHUNK;
    GHASH(ctx, in, GHASH_CHUNK);
    while (j) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      j -= 16;
    }
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    GHASH(ctx, in, len_blocks);
    while (len >= 16) {
      (*block)(ctx->Yi, ctx->EKi, key);
      ++ctr;
      CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
      CRYPTO_xor16(out, in, ctx->EKi);
      out += 16;
      in += 16;
      len -= 16;
    }
  }
  // Trailing partial block.
  if (len) {
    (*block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi[n] ^= c;
      out[n] = c ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Like |CRYPTO_gcm128_encrypt| but uses a |ctr128_f| bulk counter-mode
// kernel (|stream|) for whole blocks, with optional fused hardware AES-GCM
// fast paths (AVX-512, HW_GCM) taken first when available. Returns 0 only
// on exceeding GCM's message length bound.
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to encrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fused AVX-512 path handles everything, including residue, internally.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled() && len > 0) {
    aes_gcm_encrypt_avx512(key, ctx, &ctx->mres, in, len, out);
    return 1;
  }
#endif
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream from the previous call's partial block.
    while (n && len) {
      ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n];
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(HW_GCM)
  // Check |len| to work around a C language bug. See https://crbug.com/1019588.
  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
    // |hw_gcm_encrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = hw_gcm_encrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                 ctx->gcm_key.Htable);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Stream-encrypt in GHASH_CHUNK strides, hashing each stride afterwards.
  while (len >= GHASH_CHUNK) {
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
    ctr += GHASH_CHUNK / 16;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    GHASH(ctx, out, GHASH_CHUNK);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    size_t j = len_blocks / 16;
    (*stream)(in, out, j, key, ctx->Yi);
    ctr += (unsigned int)j;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    in += len_blocks;
    len -= len_blocks;
    GHASH(ctx, out, len_blocks);
    out += len_blocks;
  }
  // Trailing partial block handled with the single-block function.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // .../aws-lc/crypto/fipsmodule/modes/gcm.c:688:18: error: writing 1 byte into a region of size 0 [-Werror=stringop-overflow=]
    // 688 |       ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
    //     |       ^~
    if ((n + len) > 16) {
      abort();
      return 0;
    }
    while (len--) {
      ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Decrypt counterpart of |CRYPTO_gcm128_encrypt_ctr32|: ciphertext is
// GHASHed before it is decrypted by the |stream| kernel. Same fused
// hardware fast paths and length bound.
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key,
                                const uint8_t *in, uint8_t *out, size_t len,
                                ctr128_f stream) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
  void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                      size_t len) = ctx->gcm_key.ghash;
#endif
  uint64_t mlen = ctx->len.msg + len;
  if (mlen > ((UINT64_C(1) << 36) - 32) ||
      (sizeof(len) == 8 && mlen < len)) {
    return 0;
  }
  ctx->len.msg = mlen;
  if (ctx->ares) {
    // First call to decrypt finalizes GHASH(AAD)
    GCM_MUL(ctx, Xi);
    ctx->ares = 0;
  }
#if defined(GHASH_ASM_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Fused AVX-512 path handles everything, including residue, internally.
  if (ctx->gcm_key.use_hw_gcm_crypt && crypto_gcm_avx512_enabled() && len > 0) {
    aes_gcm_decrypt_avx512(key, ctx, &ctx->mres, in, len, out);
    return 1;
  }
#endif
  unsigned n = ctx->mres;
  if (n) {
    // Use up leftover keystream; hash the ciphertext byte before decrypting.
    while (n && len) {
      uint8_t c = *(in++);
      *(out++) = c ^ ctx->EKi[n];
      ctx->Xi[n] ^= c;
      --len;
      n = (n + 1) % 16;
    }
    if (n == 0) {
      GCM_MUL(ctx, Xi);
    } else {
      ctx->mres = n;
      return 1;
    }
  }
#if defined(HW_GCM)
  // Check |len| to work around a C language bug. See https://crbug.com/1019588.
  if (ctx->gcm_key.use_hw_gcm_crypt && len > 0) {
    // |hw_gcm_decrypt| may not process all the input given to it. It may
    // not process *any* of its input if it is deemed too small.
    size_t bulk = hw_gcm_decrypt(in, out, len, key, ctx->Yi, ctx->Xi,
                                 ctx->gcm_key.Htable);
    in += bulk;
    out += bulk;
    len -= bulk;
  }
#endif
  uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12);
  // Hash each chunk of ciphertext first, then stream-decrypt it.
  while (len >= GHASH_CHUNK) {
    GHASH(ctx, in, GHASH_CHUNK);
    (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi);
    ctr += GHASH_CHUNK / 16;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    out += GHASH_CHUNK;
    in += GHASH_CHUNK;
    len -= GHASH_CHUNK;
  }
  size_t len_blocks = len & kSizeTWithoutLower4Bits;
  if (len_blocks != 0) {
    size_t j = len_blocks / 16;
    GHASH(ctx, in, len_blocks);
    (*stream)(in, out, j, key, ctx->Yi);
    ctr += (unsigned int)j;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    out += len_blocks;
    in += len_blocks;
    len -= len_blocks;
  }
  // Trailing partial block handled with the single-block function.
  if (len) {
    (*ctx->gcm_key.block)(ctx->Yi, ctx->EKi, key);
    ++ctr;
    CRYPTO_store_u32_be(ctx->Yi + 12, ctr);
    // This is needed to avoid a compiler warning on powerpc64le using GCC 12.2:
    // aws-lc/crypto/fipsmodule/modes/gcm.c:785:18: error: writing 1 byte into a
    // region of size 0 [-Werror=stringop-overflow=]
    // 785 |       ctx->Xi[n] ^= c;
    //     |       ~~~~~~~~~~~^~~~
    if ((n + len) > 16) {
      abort();
      return 0;
    }
    while (len--) {
      uint8_t c = in[n];
      ctx->Xi[n] ^= c;
      out[n] = c ^ ctx->EKi[n];
      ++n;
    }
  }
  ctx->mres = n;
  return 1;
}
// Finalizes the GCM computation: flushes any partial-block GHASH residue,
// hashes the 128-bit length block (AAD bits || message bits), XORs in
// EK0 = E(K, Y0) to produce the tag in |ctx->Xi|, and compares it against
// |tag| in constant time. Returns 1 iff the first |len| tag bytes match
// (and |tag| is non-NULL with |len| <= 16).
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) {
#ifdef GCM_FUNCREF
  void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) =
      ctx->gcm_key.gmult;
#endif
  if (ctx->mres || ctx->ares) {
    GCM_MUL(ctx, Xi);
  }
  uint8_t len_block[16];
  // Lengths are stored in bits, hence the << 3.
  CRYPTO_store_u64_be(len_block, ctx->len.aad << 3);
  CRYPTO_store_u64_be(len_block + 8, ctx->len.msg << 3);
  CRYPTO_xor16(ctx->Xi, ctx->Xi, len_block);
  GCM_MUL(ctx, Xi);
  CRYPTO_xor16(ctx->Xi, ctx->Xi, ctx->EK0);
  if (tag && len <= sizeof(ctx->Xi)) {
    // Constant-time comparison to avoid leaking tag bytes via timing.
    return CRYPTO_memcmp(ctx->Xi, tag, len) == 0;
  } else {
    return 0;
  }
}
// Finalizes the context (with no tag comparison) and copies the computed
// tag out of |ctx->Xi|, truncated to at most 16 bytes.
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) {
  CRYPTO_gcm128_finish(ctx, NULL, 0);
  size_t copy_len = len;
  if (copy_len > sizeof(ctx->Xi)) {
    copy_len = sizeof(ctx->Xi);
  }
  OPENSSL_memcpy(tag, ctx->Xi, copy_len);
}
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// Returns 1 when the carry-less multiply (PCLMULQDQ) GHASH paths may be
// used: requires FXSR and PCLMUL support, and a build with the x86/x86-64
// GHASH assembly compiled in.
int crypto_gcm_clmul_enabled(void) {
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
  return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable();
#else
  return 0;
#endif
}
// Returns 1 when the VAES/AVX-512/VPCLMULQDQ fused AES-GCM implementation
// may be used. Requires an x86-64 build with a sufficiently new assembler
// and is currently disabled on Windows (see TODO below).
int crypto_gcm_avx512_enabled(void) {
  // This must align with ImplDispatchTest.AEAD_AES_GCM
#if defined(GHASH_ASM_X86_64) && \
    !defined(OPENSSL_WINDOWS) && \
    !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // TODO(awslc): remove the Windows guard once CryptoAlg-1701 is resolved.
  return (CRYPTO_is_VAES_capable() &&
          CRYPTO_is_AVX512_capable() &&
          CRYPTO_is_VPCLMULQDQ_capable());
#else
  return 0;
#endif
}
#endif

View File

@@ -0,0 +1,291 @@
// Copyright (c) 2019, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/base.h>
#include "../../internal.h"
#include "internal.h"
#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif
// This file contains a constant-time implementation of GHASH based on the notes
// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
// algorithm described in
// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
//
// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
// the 32-bit implementation, but we can use its intrinsics if necessary.
#if defined(BORINGSSL_HAS_UINT128)
// Constant-time 64x64 -> 128-bit carry-less (GF(2)[x]) multiplication using
// native uint128_t arithmetic. The 128-bit product of |a| and |b| is
// returned in |*out_lo| (low 64 bits) and |*out_hi| (high 64 bits).
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  // One term every four bits means the largest term is 64/4 = 16, which barely
  // overflows into the next term. Using one term every five bits would cost 25
  // multiplications instead of 16. It is faster to mask off the bottom four
  // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits
  // separately.
  uint64_t a0 = a & UINT64_C(0x1111111111111110);
  uint64_t a1 = a & UINT64_C(0x2222222222222220);
  uint64_t a2 = a & UINT64_C(0x4444444444444440);
  uint64_t a3 = a & UINT64_C(0x8888888888888880);
  uint64_t b0 = b & UINT64_C(0x1111111111111111);
  uint64_t b1 = b & UINT64_C(0x2222222222222222);
  uint64_t b2 = b & UINT64_C(0x4444444444444444);
  uint64_t b3 = b & UINT64_C(0x8888888888888888);
  uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^
                 (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1);
  uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^
                 (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2);
  uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^
                 (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3);
  uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^
                 (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0);
  // Multiply the bottom four bits of |a| with |b|.
  // Each mask is all-ones iff the corresponding low bit of |a| is set.
  uint64_t a0_mask = UINT64_C(0) - (a & 1);
  uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1);
  uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1);
  uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1);
  uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^
                    ((uint128_t)(a2_mask & b) << 2) ^
                    ((uint128_t)(a3_mask & b) << 3);
  // Each c_i is only valid in its own 4-bit stride; mask and merge.
  *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra);
  *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^
            (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^
            (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^
            (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^
            ((uint64_t)(extra >> 64));
}
#elif defined(OPENSSL_SSE2)
// SSE2 variant: computes two interleaved carry-less 32x32 partial products
// at once via _mm_mul_epu32 and returns the combined 64-bit carryless
// product of |a| and |b| in the low half of the result register.
static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does not
  // overflow into the next term.
  __m128i aa = _mm_setr_epi32(a, 0, a, 0);
  __m128i bb = _mm_setr_epi32(b, 0, b, 0);
  // Lanes hold (stride-0, stride-2) of |a| against paired strides of |b|,
  // producing two of the four stride accumulators per multiply.
  __m128i a0a0 =
      _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
  __m128i a2a2 =
      _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
  __m128i b0b1 =
      _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
  __m128i b2b3 =
      _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));
  __m128i c0c1 =
      _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
  __m128i c2c3 =
      _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));
  __m128i a1a1 =
      _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
  __m128i a3a3 =
      _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
  __m128i b3b0 =
      _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
  __m128i b1b2 =
      _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));
  // Keep each accumulator's own 4-bit stride, then fold all four together.
  c0c1 = _mm_and_si128(
      c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
  c2c3 = _mm_and_si128(
      c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));
  c0c1 = _mm_xor_si128(c0c1, c2c3);
  // c0 ^= c1
  c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
  return c0c1;
}
// SSE2 variant of the 64x64 carry-less multiply: Karatsuba over three
// 32x32 carryless products, with the middle term shifted into place by a
// 32-bit (4-byte) lane shift.
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
  __m128i lo = gcm_mul32_nohw(a0, b0);
  __m128i hi = gcm_mul32_nohw(a1, b1);
  __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
  mid = _mm_xor_si128(mid, lo);
  mid = _mm_xor_si128(mid, hi);
  __m128i ret = _mm_unpacklo_epi64(lo, hi);
  // Shift the middle term up by 32 bits and mask it to the middle 64 bits
  // of the 128-bit result before folding it in.
  mid = _mm_slli_si128(mid, 4);
  mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
  ret = _mm_xor_si128(ret, mid);
  memcpy(out_lo, &ret, 8);
  memcpy(out_hi, ((char*)&ret) + 8, 8);
}
#else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2
// Constant-time carry-less (GF(2)[x]) multiplication of two 32-bit values,
// returning the full 64-bit carryless product. Each operand is split into
// four strided bit groups so every partial product has at most 32/4 = 8
// contributing one-bit terms, which cannot carry across a 4-bit stride.
static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
  const uint32_t s0 = 0x11111111;
  const uint32_t s1 = 0x22222222;
  const uint32_t s2 = 0x44444444;
  const uint32_t s3 = 0x88888888;
  uint64_t acc0 = ((uint64_t)(a & s0) * (b & s0)) ^
                  ((uint64_t)(a & s1) * (b & s3)) ^
                  ((uint64_t)(a & s2) * (b & s2)) ^
                  ((uint64_t)(a & s3) * (b & s1));
  uint64_t acc1 = ((uint64_t)(a & s0) * (b & s1)) ^
                  ((uint64_t)(a & s1) * (b & s0)) ^
                  ((uint64_t)(a & s2) * (b & s3)) ^
                  ((uint64_t)(a & s3) * (b & s2));
  uint64_t acc2 = ((uint64_t)(a & s0) * (b & s2)) ^
                  ((uint64_t)(a & s1) * (b & s1)) ^
                  ((uint64_t)(a & s2) * (b & s0)) ^
                  ((uint64_t)(a & s3) * (b & s3));
  uint64_t acc3 = ((uint64_t)(a & s0) * (b & s3)) ^
                  ((uint64_t)(a & s1) * (b & s2)) ^
                  ((uint64_t)(a & s2) * (b & s1)) ^
                  ((uint64_t)(a & s3) * (b & s0));
  // Each accumulator is only valid in its own stride; mask and merge.
  return (acc0 & UINT64_C(0x1111111111111111)) |
         (acc1 & UINT64_C(0x2222222222222222)) |
         (acc2 & UINT64_C(0x4444444444444444)) |
         (acc3 & UINT64_C(0x8888888888888888));
}
// Portable 64x64 -> 128-bit carry-less multiply built from three 32x32
// carryless products via Karatsuba; in GF(2) the sums are XORs, so the
// middle term is mid ^ lo ^ hi, folded in at a 32-bit offset.
static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  const uint32_t a_lo = (uint32_t)a;
  const uint32_t a_hi = (uint32_t)(a >> 32);
  const uint32_t b_lo = (uint32_t)b;
  const uint32_t b_hi = (uint32_t)(b >> 32);
  // Karatsuba multiplication.
  const uint64_t lo = gcm_mul32_nohw(a_lo, b_lo);
  const uint64_t hi = gcm_mul32_nohw(a_hi, b_hi);
  uint64_t mid = gcm_mul32_nohw(a_lo ^ a_hi, b_lo ^ b_hi);
  mid ^= lo ^ hi;
  *out_lo = lo ^ (mid << 32);
  *out_hi = hi ^ (mid >> 32);
}
#endif // BORINGSSL_HAS_UINT128
// Prepares the no-assembly GHASH key: stores H (already byte-swapped in
// |Xi|) multiplied by x in POLYVAL's representation into Htable[0]. Only
// the first table entry is used by this implementation.
void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) {
  // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This
  // avoids a shift by 1 in the multiplication, needed to account for bit
  // reversal losing a bit after multiplication, that is,
  // rev128(X) * rev128(Y) = rev255(X*Y).
  //
  // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation
  // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped.
  //
  // See also slide 16 of
  // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf
  Htable[0].lo = Xi[1];
  Htable[0].hi = Xi[0];
  // |carry| is all-ones iff the top bit shifts out, selecting the reduction.
  uint64_t carry = Htable[0].hi >> 63;
  carry = 0u - carry;
  // Multiply by x: shift the 128-bit value left by one.
  Htable[0].hi <<= 1;
  Htable[0].hi |= Htable[0].lo >> 63;
  Htable[0].lo <<= 1;
  // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we
  // conditionally add 0xc200...0001.
  Htable[0].lo ^= carry & 1;
  Htable[0].hi ^= carry & UINT64_C(0xc200000000000000);
  // This implementation does not use the rest of |Htable|.
}
// One POLYVAL step: sets Xi = (Xi * H) * x^-128 mod the POLYVAL polynomial,
// all in constant time. |H| is Htable[0] as prepared by |gcm_init_nohw|.
static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) {
  // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0|
  // through |r3|. Note there is no byte or bit reversal because we are
  // evaluating POLYVAL.
  uint64_t r0, r1;
  gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo);
  uint64_t r2, r3;
  gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi);
  uint64_t mid0, mid1;
  gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo);
  mid0 ^= r0 ^ r2;
  mid1 ^= r1 ^ r3;
  // Fold the 128-bit middle term into the 256-bit product at a 64-bit offset.
  r2 ^= mid1;
  r1 ^= mid0;
  // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and
  // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. We
  // have:
  //
  //       1 = x^121 + x^126 + x^127 + x^128
  //  x^-128 = x^-7 + x^-2 + x^-1 + 1
  //
  // This is the GHASH reduction step, but with bits flowing in reverse.
  // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require
  // another reduction steps. Instead, we gather the excess bits, incorporate
  // them into |r0| and |r1| and reduce once. See slides 17-19
  // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
  r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57);
  // 1
  r2 ^= r0;
  r3 ^= r1;
  // x^-1
  r2 ^= r0 >> 1;
  r2 ^= r1 << 63;
  r3 ^= r1 >> 1;
  // x^-2
  r2 ^= r0 >> 2;
  r2 ^= r1 << 62;
  r3 ^= r1 >> 2;
  // x^-7
  r2 ^= r0 >> 7;
  r2 ^= r1 << 57;
  r3 ^= r1 >> 7;
  Xi[0] = r2;
  Xi[1] = r3;
}
void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) {
  // GHASH uses the opposite byte order from POLYVAL, so byte-swap |Xi| into
  // little-endian word order, multiply by H, and swap back.
  uint64_t state[2];
  state[1] = CRYPTO_load_u64_be(Xi);
  state[0] = CRYPTO_load_u64_be(Xi + 8);
  gcm_polyval_nohw(state, &Htable[0]);
  CRYPTO_store_u64_be(Xi + 8, state[0]);
  CRYPTO_store_u64_be(Xi, state[1]);
}
void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                    size_t len) {
  // Work on the byte-swapped state, folding in one 16-byte block per
  // iteration; |len| is a multiple of 16 per the |ghash_func| contract.
  uint64_t state[2];
  state[0] = CRYPTO_load_u64_be(Xi + 8);
  state[1] = CRYPTO_load_u64_be(Xi);
  for (; len >= 16; inp += 16, len -= 16) {
    state[0] ^= CRYPTO_load_u64_be(inp + 8);
    state[1] ^= CRYPTO_load_u64_be(inp);
    gcm_polyval_nohw(state, &Htable[0]);
  }
  CRYPTO_store_u64_be(Xi, state[1]);
  CRYPTO_store_u64_be(Xi + 8, state[0]);
}

View File

@@ -0,0 +1,221 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <stdio.h>
#include <string.h>
#include <vector>
#include <gtest/gtest.h>
#include <openssl/aes.h>
#include "../../internal.h"
#include "../../test/abi_test.h"
#include "../../test/file_test.h"
#include "../../test/test_util.h"
#include "../aes/internal.h"
#include "../cpucap/internal.h"
#include "internal.h"
// Runs the file-based GCM known-answer tests: each vector is encrypted and
// checked against the expected ciphertext and tag, then round-tripped through
// decryption and tag verification.
TEST(GCMTest, TestVectors) {
  FileTestGTest("crypto/fipsmodule/modes/gcm_tests.txt", [](FileTest *t) {
    std::vector<uint8_t> key, plaintext, additional_data, nonce, ciphertext,
        tag;
    ASSERT_TRUE(t->GetBytes(&key, "Key"));
    ASSERT_TRUE(t->GetBytes(&plaintext, "Plaintext"));
    ASSERT_TRUE(t->GetBytes(&additional_data, "AdditionalData"));
    ASSERT_TRUE(t->GetBytes(&nonce, "Nonce"));
    ASSERT_TRUE(t->GetBytes(&ciphertext, "Ciphertext"));
    ASSERT_TRUE(t->GetBytes(&tag, "Tag"));
    // GCM is length-preserving, only AES key sizes are accepted, and the tag
    // in these vectors is always a full 16 bytes.
    ASSERT_EQ(plaintext.size(), ciphertext.size());
    ASSERT_TRUE(key.size() == 16 || key.size() == 24 || key.size() == 32);
    ASSERT_EQ(16u, tag.size());
    std::vector<uint8_t> out(plaintext.size());
    AES_KEY aes_key;
    ASSERT_EQ(0, AES_set_encrypt_key(key.data(), key.size() * 8, &aes_key));
    // Encrypt and compare both the ciphertext and the computed tag.
    GCM128_CONTEXT ctx;
    OPENSSL_memset(&ctx, 0, sizeof(ctx));
    CRYPTO_gcm128_init_key(&ctx.gcm_key, &aes_key, AES_encrypt, 0);
    CRYPTO_gcm128_setiv(&ctx, &aes_key, nonce.data(), nonce.size());
    if (!additional_data.empty()) {
      CRYPTO_gcm128_aad(&ctx, additional_data.data(), additional_data.size());
    }
    if (!plaintext.empty()) {
      CRYPTO_gcm128_encrypt(&ctx, &aes_key, plaintext.data(), out.data(),
                            plaintext.size());
    }
    std::vector<uint8_t> got_tag(tag.size());
    CRYPTO_gcm128_tag(&ctx, got_tag.data(), got_tag.size());
    EXPECT_EQ(Bytes(tag), Bytes(got_tag));
    EXPECT_EQ(Bytes(ciphertext), Bytes(out));
    // Decrypt with the same context, re-running setiv to reset the state, and
    // verify the tag via |CRYPTO_gcm128_finish|. Note the same (encryption)
    // key schedule is used for both directions.
    CRYPTO_gcm128_setiv(&ctx, &aes_key, nonce.data(), nonce.size());
    OPENSSL_memset(out.data(), 0, out.size());
    if (!additional_data.empty()) {
      CRYPTO_gcm128_aad(&ctx, additional_data.data(), additional_data.size());
    }
    if (!ciphertext.empty()) {
      CRYPTO_gcm128_decrypt(&ctx, &aes_key, ciphertext.data(), out.data(),
                            ciphertext.size());
    }
    ASSERT_TRUE(CRYPTO_gcm128_finish(&ctx, tag.data(), tag.size()));
    EXPECT_EQ(Bytes(plaintext), Bytes(out));
  });
}
// Sanity-checks the 32- and 64-bit byte-swap helpers with a distinctive
// ascending-byte pattern.
TEST(GCMTest, ByteSwap) {
  const uint32_t swapped32 = CRYPTO_bswap4(0x01020304u);
  EXPECT_EQ(0x04030201u, swapped32);
  const uint64_t swapped64 = CRYPTO_bswap8(UINT64_C(0x0102030405060708));
  EXPECT_EQ(UINT64_C(0x0807060504030201), swapped64);
}
#if defined(SUPPORTS_ABI_TEST) && !defined(OPENSSL_NO_ASM) && \
    !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
// Exercises every enabled assembly GHASH/GCM entry point under the ABI
// checker, which validates calling-convention discipline (and SEH unwind data
// on Windows), not cryptographic correctness.
TEST(GCMTest, ABI) {
  static const uint64_t kH[2] = {
      UINT64_C(0x66e94bd4ef8a2c3b),
      UINT64_C(0x884cfa59ca342b2e),
  };
  // Assorted block counts, including values around likely unrolling
  // boundaries.
  static const size_t kBlockCounts[] = {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 31, 32};
  uint8_t buf[16 * 32];
  OPENSSL_memset(buf, 42, sizeof(buf));
  uint8_t X[16] = {0x92, 0xa3, 0xb3, 0x60, 0xce, 0xda, 0x88, 0x03,
                   0x78, 0xfe, 0xb2, 0x71, 0xb9, 0xc2, 0x28, 0xf3};
  // |gcm_*_ssse3| require a 16-byte-aligned |Htable|.
  alignas(16) u128 Htable[16];
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
  if (CRYPTO_is_SSSE3_capable()) {
    CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
    CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI_SEH(gcm_ghash_ssse3, X, Htable, buf, 16 * blocks);
    }
  }
  if (crypto_gcm_clmul_enabled()) {
    CHECK_ABI_SEH(gcm_init_clmul, Htable, kH);
    CHECK_ABI_SEH(gcm_gmult_clmul, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI_SEH(gcm_ghash_clmul, X, Htable, buf, 16 * blocks);
    }
#if defined(GHASH_ASM_X86_64)
    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
      CHECK_ABI_SEH(gcm_init_avx, Htable, kH);
      CHECK_ABI_SEH(gcm_gmult_avx, X, Htable);
      for (size_t blocks : kBlockCounts) {
        CHECK_ABI_SEH(gcm_ghash_avx, X, Htable, buf, 16 * blocks);
      }
      if (hwaes_capable()) {
        AES_KEY aes_key;
        static const uint8_t kKey[16] = {0};
        uint8_t iv[16] = {0};
        aes_hw_set_encrypt_key(kKey, 128, &aes_key);
        // Each length is checked both as a whole number of blocks and with a
        // 7-byte tail.
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16, &aes_key, iv,
                        Htable, X);
          CHECK_ABI_SEH(aesni_gcm_encrypt, buf, buf, blocks * 16 + 7, &aes_key,
                        iv, Htable, X);
        }
        aes_hw_set_decrypt_key(kKey, 128, &aes_key);
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16, &aes_key, iv,
                        Htable, X);
          CHECK_ABI_SEH(aesni_gcm_decrypt, buf, buf, blocks * 16 + 7, &aes_key,
                        iv, Htable, X);
        }
      }
    }
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
    if (crypto_gcm_avx512_enabled()) {
      CHECK_ABI_SEH(gcm_init_avx512, Htable, kH);
      CHECK_ABI_SEH(gcm_gmult_avx512, X, Htable);
      for (size_t blocks : kBlockCounts) {
        CHECK_ABI_SEH(gcm_ghash_avx512, X, Htable, buf, 16 * blocks);
      }
      if (hwaes_capable()) {
        AES_KEY aes_key;
        static const uint8_t kKey[16] = {0};
        // aes_gcm_*_avx512 makes assumptions about |GCM128_CONTEXT|'s layout.
        GCM128_CONTEXT gcm;
        memset(&gcm, 0, sizeof(gcm));
        memcpy(&gcm.gcm_key.Htable, Htable, sizeof(Htable));
        memcpy(&gcm.Xi, X, sizeof(X));
        uint8_t iv[16] = {0};
        aes_hw_set_encrypt_key(kKey, 128, &aes_key);
        CHECK_ABI_SEH(gcm_setiv_avx512, &aes_key, &gcm, iv, sizeof(iv));
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aes_gcm_encrypt_avx512, &aes_key, &gcm, &gcm.mres, buf,
                        blocks * 16, buf);
        }
        aes_hw_set_decrypt_key(kKey, 128, &aes_key);
        for (size_t blocks : kBlockCounts) {
          CHECK_ABI_SEH(aes_gcm_decrypt_avx512, &aes_key, &gcm, &gcm.mres, buf,
                        blocks * 16, buf);
        }
      }
    }
#endif  // !MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
#endif  // GHASH_ASM_X86_64
  }
#endif  // GHASH_ASM_X86 || GHASH_ASM_X86_64
#if defined(GHASH_ASM_ARM)
  if (gcm_neon_capable()) {
    CHECK_ABI(gcm_init_neon, Htable, kH);
    CHECK_ABI(gcm_gmult_neon, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_neon, X, Htable, buf, 16 * blocks);
    }
  }
  if (gcm_pmull_capable()) {
    CHECK_ABI(gcm_init_v8, Htable, kH);
    CHECK_ABI(gcm_gmult_v8, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_v8, X, Htable, buf, 16 * blocks);
    }
  }
#endif  // GHASH_ASM_ARM
#if defined(OPENSSL_AARCH64) && defined(HW_GCM)
  if (hwaes_capable() && gcm_pmull_capable()) {
    static const uint8_t kKey[256/8] = {0};
    uint8_t iv[16] = {0};
    for (size_t key_bits = 128; key_bits <= 256; key_bits += 64) {
      AES_KEY aes_key;
      aes_hw_set_encrypt_key(kKey, key_bits, &aes_key);
      // These kernels take the input length in bits, not bytes.
      CHECK_ABI(aes_gcm_enc_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key,
                Htable);
      CHECK_ABI(aes_gcm_dec_kernel, buf, sizeof(buf) * 8, buf, X, iv, &aes_key,
                Htable);
    }
  }
#endif
#if defined(GHASH_ASM_PPC64LE)
  if (CRYPTO_is_PPC64LE_vcrypto_capable()) {
    CHECK_ABI(gcm_init_p8, Htable, kH);
    CHECK_ABI(gcm_gmult_p8, X, Htable);
    for (size_t blocks : kBlockCounts) {
      CHECK_ABI(gcm_ghash_p8, X, Htable, buf, 16 * blocks);
    }
  }
#endif  // GHASH_ASM_PPC64LE
}
#endif  // SUPPORTS_ABI_TEST && !OPENSSL_NO_ASM && !MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX

View File

@@ -0,0 +1,456 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_MODES_INTERNAL_H
#define OPENSSL_HEADER_MODES_INTERNAL_H
#include <openssl/base.h>
#include <openssl/aes.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// The maximum permitted number of cipher blocks per data unit in XTS mode.
// Reference IEEE Std 1619-2018.
#define XTS_MAX_BLOCKS_PER_DATA_UNIT (1<<20)
// block128_f is the type of an AES block cipher implementation.
//
// Unlike upstream OpenSSL, it and the other functions in this file hard-code
// |AES_KEY|. It is undefined in C to call a function pointer with anything
// other than the original type. Thus we either must match |block128_f| to the
// type signature of |AES_encrypt| and friends or pass in |void*| wrapper
// functions.
//
// These functions are called exclusively with AES, so we use the former.
typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
const AES_KEY *key);
// CRYPTO_xor16 writes the bytewise XOR of |a| and |b| to |out|. Per the note
// below, the buffers may alias exactly but must not partially overlap.
OPENSSL_INLINE void CRYPTO_xor16(uint8_t out[16], const uint8_t a[16],
                                 const uint8_t b[16]) {
  // TODO(davidben): Ideally we'd leave this to the compiler, which could use
  // vector registers, etc. But the compiler doesn't know that |in| and |out|
  // cannot partially alias. |restrict| is slightly too strict (we allow exact
  // aliasing), but perhaps in-place could be a separate function?
  OPENSSL_STATIC_ASSERT(16 % sizeof(crypto_word_t) == 0,
                        block_cannot_be_evenly_divided_into_crypto_word_t)
  // Process one machine word at a time; the assert above guarantees 16 bytes
  // is a whole number of words.
  for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) {
    CRYPTO_store_word_le(
        out + i, CRYPTO_load_word_le(a + i) ^ CRYPTO_load_word_le(b + i));
  }
}
// CTR.
// ctr128_f is the type of a function that performs CTR-mode encryption.
typedef void (*ctr128_f)(const uint8_t *in, uint8_t *out, size_t blocks,
const AES_KEY *key, const uint8_t ivec[16]);
// CRYPTO_ctr128_encrypt encrypts (or decrypts, it's the same in CTR mode)
// |len| bytes from |in| to |out| using |block| in counter mode. There's no
// requirement that |len| be a multiple of any value and any partial blocks are
// stored in |ecount_buf| and |*num|, which must be zeroed before the initial
// call. The counter is a 128-bit, big-endian value in |ivec| and is
// incremented by this function.
void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
uint8_t ecount_buf[16], unsigned *num,
block128_f block);
// CRYPTO_ctr128_encrypt_ctr32 acts like |CRYPTO_ctr128_encrypt| but takes
// |ctr|, a function that performs CTR mode but only deals with the lower 32
// bits of the counter. This is useful when |ctr| can be an optimised
// function.
void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
uint8_t ecount_buf[16], unsigned *num,
ctr128_f ctr);
// GCM.
//
// This API differs from the upstream API slightly. The |GCM128_CONTEXT| does
// not have a |key| pointer that points to the key as upstream's version does.
// Instead, every function takes a |key| parameter. This way |GCM128_CONTEXT|
// can be safely copied. Additionally, |gcm_key| is split into a separate
// struct.
typedef struct { uint64_t hi,lo; } u128;
// gmult_func multiplies |Xi| by the GCM key and writes the result back to
// |Xi|.
typedef void (*gmult_func)(uint8_t Xi[16], const u128 Htable[16]);
// ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from
// |inp|. The result is written back to |Xi| and the |len| argument must be a
// multiple of 16.
typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);
// GCM128_KEY holds the precomputed GHASH state and function pointers for one
// GCM key. It contains no per-operation state and may be shared.
typedef struct gcm128_key_st {
  // |gcm_*_ssse3| require a 16-byte-aligned |Htable| when hashing data, but not
  // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in
  // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16.
  // TODO(crbug.com/boringssl/604): Revisit this.
  u128 Htable[16];   // Precomputed table derived from the hash key H.
  gmult_func gmult;  // Single-block GHASH multiply.
  ghash_func ghash;  // Multi-block GHASH over whole 16-byte blocks.
  block128_f block;  // Underlying block cipher (AES) function.
  // use_hw_gcm_crypt is true if this context should use platform-specific
  // assembly to process GCM data.
  unsigned use_hw_gcm_crypt:1;
} GCM128_KEY;
// GCM128_CONTEXT contains state for a single GCM operation. The structure
// should be zero-initialized before use.
typedef struct {
  // The following 5 names follow names in GCM specification
  uint8_t Yi[16];
  uint8_t EKi[16];
  uint8_t EK0[16];
  struct {
    uint64_t aad;  // Length of additional data processed so far.
    uint64_t msg;  // Length of plaintext/ciphertext processed so far.
  } len;
  uint8_t Xi[16];  // GHASH accumulator.
  // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
  // TODO(crbug.com/boringssl/604): Revisit this.
  alignas(16) GCM128_KEY gcm_key;
  // mres and ares track partial-block progress for message and AAD input,
  // respectively.
  unsigned mres, ares;
} GCM128_CONTEXT;
// XTS128_CONTEXT bundles the two keys and block functions XTS uses: the first
// pair processes data blocks and the second encrypts the tweak (see
// |CRYPTO_xts128_encrypt|).
typedef struct xts128_context {
  AES_KEY *key1, *key2;
  block128_f block1, block2;
} XTS128_CONTEXT;
// EVP_AES_XTS_CTX carries the two AES key schedules XTS requires (data key
// and tweak key) together with the generic XTS context.
typedef struct {
  union {
    double align;  // NOTE(review): union with double presumably for alignment — confirm.
    AES_KEY ks;
  } ks1, ks2;  // AES key schedules to use
  XTS128_CONTEXT xts;
} EVP_AES_XTS_CTX;
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
// crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is
// used.
int crypto_gcm_clmul_enabled(void);
// crypto_gcm_avx512_enabled returns one if the AVX512 VAES + VPCLMULQDQ
// implementation of GCM is used.
int crypto_gcm_avx512_enabled(void);
#endif
// CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to
// |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware
// accelerated) functions for performing operations in the GHASH field. If the
// AVX implementation was used |*out_is_avx| will be true.
void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash,
u128 out_table[16], int *out_is_avx,
const uint8_t gcm_key[16]);
// CRYPTO_gcm128_init_key initialises |gcm_key| to use |block| (typically AES)
// with the given key. |block_is_hwaes| is one if |block| is |aes_hw_encrypt|.
OPENSSL_EXPORT void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key,
const AES_KEY *key, block128_f block,
int block_is_hwaes);
// CRYPTO_gcm128_setiv sets the IV (nonce) for |ctx|. The |key| must be the
// same key that was passed to |CRYPTO_gcm128_init|.
OPENSSL_EXPORT void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key,
const uint8_t *iv, size_t iv_len);
// CRYPTO_gcm128_aad sets the authenticated data for an instance of GCM.
// This must be called before any data is encrypted. It returns one on success
// and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad,
size_t len);
// CRYPTO_gcm128_encrypt encrypts |len| bytes from |in| to |out|. The |key|
// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one
// on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
const AES_KEY *key, const uint8_t *in,
uint8_t *out, size_t len);
// CRYPTO_gcm128_decrypt decrypts |len| bytes from |in| to |out|. The |key|
// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one
// on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
const AES_KEY *key, const uint8_t *in,
uint8_t *out, size_t len);
// CRYPTO_gcm128_encrypt_ctr32 encrypts |len| bytes from |in| to |out| using
// a CTR function that only handles the bottom 32 bits of the nonce, like
// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was
// passed to |CRYPTO_gcm128_init|. It returns one on success and zero
// otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, ctr128_f stream);
// CRYPTO_gcm128_decrypt_ctr32 decrypts |len| bytes from |in| to |out| using
// a CTR function that only handles the bottom 32 bits of the nonce, like
// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was
// passed to |CRYPTO_gcm128_init|. It returns one on success and zero
// otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
const AES_KEY *key,
const uint8_t *in, uint8_t *out,
size_t len, ctr128_f stream);
// CRYPTO_gcm128_finish calculates the authenticator and compares it against
// |len| bytes of |tag|. It returns one on success and zero otherwise.
OPENSSL_EXPORT int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag,
size_t len);
// CRYPTO_gcm128_tag calculates the authenticator and copies it into |tag|.
// The minimum of |len| and 16 bytes are copied into |tag|.
OPENSSL_EXPORT void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t *tag,
size_t len);
// GCM assembly.
void gcm_init_nohw(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
#if !defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
#define GCM_FUNCREF
void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
// 16-byte-aligned, but |gcm_init_ssse3| does not.
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_ssse3(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_ssse3(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
size_t len);
#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
#define GHASH_ASM_X86_64
void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_avx(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
size_t len);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
void gcm_init_avx512(u128 Htable[16], const uint64_t Xi[2]);
// |Xi| is the 16-byte GHASH state, matching the other gcm_gmult_*/gcm_ghash_*
// prototypes in this header. (The previous [2] bound was misleading; array
// bounds in C parameters are documentation only, so this is not an ABI
// change.)
void gcm_gmult_avx512(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_avx512(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
                      size_t len);
#endif
#define HW_GCM
size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
const u128 Htable[16], uint8_t Xi[16]);
void gcm_setiv_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
const uint8_t *iv, size_t ivlen);
void aes_gcm_encrypt_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
unsigned *pblocklen, const uint8_t *in, size_t len,
uint8_t *out);
void aes_gcm_decrypt_avx512(const AES_KEY *key, const GCM128_CONTEXT *ctx,
unsigned *pblocklen, const uint8_t *in, size_t len,
uint8_t *out);
#endif // OPENSSL_X86_64 && !MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX
#if defined(OPENSSL_X86)
#define GHASH_ASM_X86
#endif // OPENSSL_X86
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#define GHASH_ASM_ARM
#define GCM_FUNCREF
OPENSSL_INLINE int gcm_pmull_capable(void) {
return CRYPTO_is_ARMv8_PMULL_capable();
}
void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); }
void gcm_init_neon(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);
#if defined(OPENSSL_AARCH64)
#define HW_GCM
// Note that in the argument list of the following functions,
// - the length is provided in bits (not bytes)
// - the order of arguments is different from that of |aesni_gcm_encrypt|.
// These functions are defined in aesv8-gcm-armv8.pl.
void aes_gcm_enc_kernel(const uint8_t *in, uint64_t in_bits, void *out,
void *Xi, uint8_t *ivec, const AES_KEY *key,
const u128 Htable[16]);
void aes_gcm_dec_kernel(const uint8_t *in, uint64_t in_bits, void *out,
void *Xi, uint8_t *ivec, const AES_KEY *key,
const u128 Htable[16]);
// These functions are defined in aesv8-gcm-armv8-unroll8.pl.
// They take input length in BITS and return number of BYTES processed.
size_t aesv8_gcm_8x_enc_128(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_128(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_enc_192(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_192(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_enc_256(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
size_t aesv8_gcm_8x_dec_256(const uint8_t *in, size_t bit_len, uint8_t *out,
uint8_t *Xi, uint8_t ivec[16], const AES_KEY *key,
const u128 Htable[16]);
#endif
#elif defined(OPENSSL_PPC64LE)
#define GHASH_ASM_PPC64LE
#define GCM_FUNCREF
void gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_p8(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_p8(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);
#endif
#endif // OPENSSL_NO_ASM
// CBC.
// cbc128_f is the type of a function that performs CBC-mode encryption.
typedef void (*cbc128_f)(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], int enc);
// CRYPTO_cbc128_encrypt encrypts |len| bytes from |in| to |out| using the
// given IV and block cipher in CBC mode. The input need not be a multiple of
// 128 bits long, but the output will round up to the nearest 128 bit multiple,
// zero padding the input if needed. The IV will be updated on return.
void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// CRYPTO_cbc128_decrypt decrypts |len| bytes from |in| to |out| using the
// given IV and block cipher in CBC mode. If |len| is not a multiple of 128
// bits then only that many bytes will be written, but a multiple of 128 bits
// is always read from |in|. The IV will be updated on return.
void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// OFB.
// CRYPTO_ofb128_encrypt encrypts (or decrypts, it's the same with OFB mode)
// |len| bytes from |in| to |out| using |block| in OFB mode. There's no
// requirement that |len| be a multiple of any value and any partial blocks are
// stored in |ivec| and |*num|, the latter must be zero before the initial
// call.
void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
block128_f block);
// CFB.
// CRYPTO_cfb128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB mode. There's no requirement that
// |len| be a multiple of any value and any partial blocks are stored in |ivec|
// and |*num|, the latter must be zero before the initial call.
void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16], unsigned *num,
int enc, block128_f block);
// CRYPTO_cfb128_8_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB-8 mode. Prior to the first call
// |num| should be set to zero.
void CRYPTO_cfb128_8_encrypt(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
unsigned *num, int enc, block128_f block);
// CRYPTO_cfb128_1_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using |block| in CFB-1 mode. Prior to the first call
// |num| should be set to zero.
void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits,
const AES_KEY *key, uint8_t ivec[16],
unsigned *num, int enc, block128_f block);
size_t CRYPTO_cts128_encrypt_block(const uint8_t *in, uint8_t *out, size_t len,
const AES_KEY *key, uint8_t ivec[16],
block128_f block);
// XTS.
// CRYPTO_xts128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |in| to |out| using the given IV in XTS mode. There's no requirement
// that |len| be a multiple of any value.
size_t CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
const uint8_t iv[16], const uint8_t *inp,
uint8_t *out, size_t len, int enc);
// POLYVAL.
//
// POLYVAL is a polynomial authenticator that operates over a field very
// similar to the one that GHASH uses. See
// https://www.rfc-editor.org/rfc/rfc8452.html#section-3.
struct polyval_ctx {
uint8_t S[16];
// |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
// TODO(crbug.com/boringssl/604): Revisit this.
alignas(16) u128 Htable[16];
gmult_func gmult;
ghash_func ghash;
};
// CRYPTO_POLYVAL_init initialises |ctx| using |key|.
void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]);
// CRYPTO_POLYVAL_update_blocks updates the accumulator in |ctx| given the
// blocks from |in|. Only a whole number of blocks can be processed so |in_len|
// must be a multiple of 16.
void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in,
size_t in_len);
// CRYPTO_POLYVAL_finish writes the accumulator from |ctx| to |out|.
void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_MODES_INTERNAL_H

View File

@@ -0,0 +1,45 @@
// Copyright (c) 2008 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/type_check.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
OPENSSL_STATIC_ASSERT(16 % sizeof(size_t) == 0,
ofb_block_cannot_be_divided_into_size_t)
void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                           const AES_KEY *key, uint8_t ivec[16], unsigned *num,
                           block128_f block) {
  assert(key != NULL && ivec != NULL && num != NULL);
  assert(len == 0 || (in != NULL && out != NULL));
  unsigned pos = *num;
  // First consume any keystream bytes left over in |ivec| from the previous
  // call.
  while (pos != 0 && len != 0) {
    *out++ = *in++ ^ ivec[pos];
    len--;
    pos = (pos + 1) % 16;
  }
  // Whole blocks: generate the next keystream block in place in |ivec| and
  // XOR it against the input.
  while (len >= 16) {
    (*block)(ivec, ivec, key);
    CRYPTO_xor16(out, in, ivec);
    in += 16;
    out += 16;
    len -= 16;
    pos = 0;
  }
  // Trailing partial block: generate one more keystream block and use only
  // its first |len| bytes; |pos| records how much was consumed for next time.
  if (len > 0) {
    (*block)(ivec, ivec, key);
    do {
      out[pos] = in[pos] ^ ivec[pos];
      pos++;
    } while (--len > 0);
  }
  *num = pos;
}

View File

@@ -0,0 +1,79 @@
// Copyright (c) 2016, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/base.h>
#include <assert.h>
#include <string.h>
#include "internal.h"
#include "../../internal.h"
// byte_reverse reverses the order of the 16 bytes at |b|.
static void byte_reverse(uint8_t b[16]) {
  // Loading big-endian and storing little-endian reverses eight bytes at a
  // time; writing the halves crosswise reverses the whole block.
  const uint64_t first_half = CRYPTO_load_u64_be(b);
  const uint64_t second_half = CRYPTO_load_u64_be(b + 8);
  CRYPTO_store_u64_le(b, second_half);
  CRYPTO_store_u64_le(b + 8, first_half);
}
// reverse_and_mulX_ghash interprets |b| as a reversed element of the GHASH
// field, multiplies that by 'x' and serialises the result back into |b|, but
// with GHASH's backwards bit ordering.
static void reverse_and_mulX_ghash(uint8_t b[16]) {
  uint64_t hi = CRYPTO_load_u64_le(b);
  uint64_t lo = CRYPTO_load_u64_le(b + 8);
  // If the bit about to be shifted out is set, the reduction term must be
  // added back in. The check and selection below are constant-time, so no
  // branch or mask depends on the (secret) key material.
  const crypto_word_t carry = constant_time_eq_w(hi & 1, 1);
  // Shift the 128-bit value right by one bit, which is multiplication by 'x'
  // in this reversed representation.
  hi >>= 1;
  hi |= lo << 63;
  lo >>= 1;
  // 0xe1 << 56 is the GHASH reduction constant in this representation.
  lo ^= ((uint64_t) constant_time_select_w(carry, 0xe1, 0)) << 56;
  CRYPTO_store_u64_le(b, CRYPTO_bswap8(lo));
  CRYPTO_store_u64_le(b + 8, CRYPTO_bswap8(hi));
}
// POLYVAL(H, X_1, ..., X_n) =
// ByteReverse(GHASH(mulX_GHASH(ByteReverse(H)), ByteReverse(X_1), ...,
// ByteReverse(X_n))).
//
// See https://www.rfc-editor.org/rfc/rfc8452.html#appendix-A.
// CRYPTO_POLYVAL_init initialises |ctx| to compute POLYVAL with |key| by
// converting the key into GHASH's representation and delegating to the
// platform GHASH implementation.
void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]) {
  alignas(8) uint8_t H[16];
  OPENSSL_memcpy(H, key, 16);
  reverse_and_mulX_ghash(H);
  // |is_avx| is required by |CRYPTO_ghash_init|'s signature but its value is
  // not needed here.
  int is_avx;
  CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, &is_avx, H);
  // The accumulator starts at zero.
  OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S));
}
void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in,
size_t in_len) {
assert((in_len & 15) == 0);
alignas(8) uint8_t buf[32 * 16];
while (in_len > 0) {
size_t todo = in_len;
if (todo > sizeof(buf)) {
todo = sizeof(buf);
}
OPENSSL_memcpy(buf, in, todo);
in += todo;
in_len -= todo;
size_t blocks = todo / 16;
for (size_t i = 0; i < blocks; i++) {
byte_reverse(buf + 16 * i);
}
ctx->ghash(ctx->S, ctx->Htable, buf, todo);
}
}
// CRYPTO_POLYVAL_finish serialises the accumulator into |out|, converting it
// back from GHASH's byte order to POLYVAL's.
void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]) {
  OPENSSL_memcpy(out, &ctx->S, 16);
  byte_reverse(out);
}

View File

@@ -0,0 +1,122 @@
// Copyright (c) 2011 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/evp.h>
#include <string.h>
#include <openssl/aes.h>
#include <openssl/cipher.h>
#include <openssl/err.h>
#include "internal.h"
#include "../../internal.h"
// CRYPTO_xts128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes
// from |inp| to |out| in XTS mode, using ciphertext stealing for lengths that
// are not a multiple of 16. Returns 1 on success and 0 if |len| is shorter
// than one block.
size_t CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
                             const uint8_t iv[16], const uint8_t *inp,
                             uint8_t *out, size_t len, int enc) {
  union {
    uint64_t u[2];
    uint8_t c[16];
  } tweak, scratch;
  unsigned int i;

  // XTS requires at least one full block.
  if (len < 16) return 0;

  // The initial tweak is the IV encrypted under the second key.
  OPENSSL_memcpy(tweak.c, iv, 16);
  (*ctx->block2)(tweak.c, tweak.c, ctx->key2);

  // When decrypting a non-block-multiple length, hold back the last full
  // block; it is processed together with the trailing partial block in the
  // ciphertext-stealing branch below.
  if (!enc && (len % 16)) len -= 16;

  while (len >= 16) {
    // One XTS block: XOR with the tweak, apply the cipher, XOR again.
    OPENSSL_memcpy(scratch.c, inp, 16);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out, scratch.c, 16);
    inp += 16;
    out += 16;
    len -= 16;

    if (len == 0) return 1;

    // Advance the tweak: multiply by x in GF(2^128), folding the carry back
    // with 0x87. The arithmetic right shift of the top word broadcasts its
    // sign bit, selecting the reduction without a branch. On big-endian
    // targets the union words must be loaded/stored explicitly.
    unsigned int carry, res;
#if defined(OPENSSL_BIG_ENDIAN)
    uint64_t tweak_u0, tweak_u1;
    tweak_u0 = CRYPTO_load_u64_le(&tweak.u[0]);
    tweak_u1 = CRYPTO_load_u64_le(&tweak.u[1]);
    res = 0x87 & (((int64_t)tweak_u1) >> 63);
    carry = (unsigned int)(tweak_u0 >> 63);
    tweak_u0 = (tweak_u0 << 1) ^ res;
    tweak_u1 = (tweak_u1 << 1) | carry;
    CRYPTO_store_u64_le(&tweak.u[0], tweak_u0);
    CRYPTO_store_u64_le(&tweak.u[1], tweak_u1);
#else
    res = 0x87 & (((int64_t)tweak.u[1]) >> 63);
    carry = (unsigned int)(tweak.u[0] >> 63);
    tweak.u[0] = (tweak.u[0] << 1) ^ res;
    tweak.u[1] = (tweak.u[1] << 1) | carry;
#endif
  }
  if (enc) {
    // Ciphertext stealing (encrypt): |scratch| still holds the previous
    // block's ciphertext. Emit its first |len| bytes as the final partial
    // output while padding the partial plaintext with the remainder, then
    // re-encrypt the combined block into the second-to-last position.
    for (i = 0; i < len; ++i) {
      uint8_t c = inp[i];
      out[i] = scratch.c[i];
      scratch.c[i] = c;
    }
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out - 16, scratch.c, 16);
  } else {
    // Ciphertext stealing (decrypt): the last full block must be processed
    // with the *next* tweak value (|tweak1|), computed the same way as above.
    union {
      uint64_t u[2];
      uint8_t c[16];
    } tweak1;
    unsigned int carry, res;
#if defined(OPENSSL_BIG_ENDIAN)
    uint64_t tweak_u0, tweak_u1;
    tweak_u0 = CRYPTO_load_u64_le(&tweak.u[0]);
    tweak_u1 = CRYPTO_load_u64_le(&tweak.u[1]);
    res = 0x87 & (((int64_t)tweak_u1) >> 63);
    carry = (unsigned int)(tweak_u0 >> 63);
    tweak_u0 = (tweak_u0 << 1) ^ res;
    tweak_u1 = (tweak_u1 << 1) | carry;
    CRYPTO_store_u64_le(&tweak1.u[0], tweak_u0);
    CRYPTO_store_u64_le(&tweak1.u[1], tweak_u1);
#else
    res = 0x87 & (((int64_t)tweak.u[1]) >> 63);
    carry = (unsigned int)(tweak.u[0] >> 63);
    tweak1.u[0] = (tweak.u[0] << 1) ^ res;
    tweak1.u[1] = (tweak.u[1] << 1) | carry;
#endif
    // Decrypt the held-back full block under |tweak1|.
    OPENSSL_memcpy(scratch.c, inp, 16);
    scratch.u[0] ^= tweak1.u[0];
    scratch.u[1] ^= tweak1.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak1.u[0];
    scratch.u[1] ^= tweak1.u[1];
    // Emit the tail of the plaintext and splice in the stolen ciphertext
    // bytes, then decrypt the reassembled block under the original tweak.
    for (i = 0; i < len; ++i) {
      uint8_t c = inp[16 + i];
      out[16 + i] = scratch.c[i];
      scratch.c[i] = c;
    }
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    (*ctx->block1)(scratch.c, scratch.c, ctx->key1);
    scratch.u[0] ^= tweak.u[0];
    scratch.u[1] ^= tweak.u[1];
    OPENSSL_memcpy(out, scratch.c, 16);
  }
  return 1;
}

File diff suppressed because it is too large Load Diff