chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,358 @@
#!/usr/bin/env perl
# Copyright 2017-2022 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
# July 2025
#
# Removed SHA3 variant, restricted assembly to core Keccak permutation.
#
######################################################################
# Numbers are cycles per processed byte.
#
# r=1088(*)
#
# Cortex-A53 13
# Cortex-A57 12
# X-Gene 14
# Mongoose 10
# Kryo 12
# Denver 7.8
# Apple A7 7.2
# ThunderX2 9.7
#
# (*) Corresponds to SHA3-256. No improvement coefficients are listed
# because they vary too much from compiler to compiler. Newer
# compiler does much better and improvement varies from 5% on
# Cortex-A57 to 25% on Cortex-A53. While in comparison to older
# compiler this code is at least 2x faster...
# File keccak1600-armv8.pl is imported from OpenSSL.
# https://github.com/openssl/openssl/blob/479b9adb88b9050186c1e9fc94879906f378b14b/crypto/sha/asm/keccak1600-armv8.pl
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# Two mandatory command-line arguments: the assembler "flavour"
# (target dialect understood by arm-xlate.pl) and the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Unless the flavour is "void", pipe everything we print through the
# arm-xlate.pl translator (looked up next to this script first, then in
# the shared perlasm directory); "void" writes the raw source directly.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Rotate amounts for the Rho step, indexed [y][x] and used with "ror"
# below. NOTE(review): these appear to be 64 minus the standard Keccak
# rho left-rotate offsets (hence "sub-rho-rotates") -- verify against
# FIPS 202 if modifying.
my @subrhotates = ([ 64, 63, 2, 36, 37 ],
[ 28, 20, 58, 9, 44 ],
[ 61, 54, 21, 39, 25 ],
[ 23, 19, 49, 43, 56 ],
[ 46, 62, 3, 8, 50 ]);
# Static data emitted ahead of the code: 64 bytes of zero padding for
# alignment (the code later relies on address bits for loop
# termination) followed by the 24 Keccak round constants ("iotas")
# consumed one per round by KeccakF1600_int.
$code.=<<___;
#include <openssl/arm_arch.h>
.text
.align 8 // strategic alignment and padding that allows to use
// address value as loop termination condition...
.quad 0,0,0,0,0,0,0,0
.type iotas_hw,%object
iotas_hw:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas_hw,.-iotas_hw
___
# Scalar code path: the whole 5x5x64-bit Keccak state is kept in
# general-purpose registers for the entire permutation.
{{{
# A[y][x] register map: x0..x24 in row-major order, except A[3][3]
# which is relocated to x25 because x18 is the reserved platform
# register.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
(0, 5, 10, 15, 20));
$A[3][3] = "x25"; # x18 is reserved
# Scratch registers for column parities (Theta) and Chi temporaries.
my @C = map("x$_", (26,27,28,30));
# KeccakF1600_int: inner permutation. Expects the state in @A and 32
# bytes of usable stack above sp; [sp,#16] holds the iotas pointer and
# the saved link register.
$code.=<<___;
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
AARCH64_SIGN_LINK_REGISTER
adr $C[2],iotas_hw
stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
.Loop:
////////////////////////////////////////// Theta
eor $C[0],$A[0][0],$A[1][0]
stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
eor $C[1],$A[0][1],$A[1][1]
eor $C[2],$A[0][2],$A[1][2]
eor $C[3],$A[0][3],$A[1][3]
___
# While A[0][4]/A[1][4] sit on the stack their registers double as two
# extra scratch slots, C[4] and C[5].
$C[4]=$A[0][4];
$C[5]=$A[1][4];
$code.=<<___;
eor $C[4],$A[0][4],$A[1][4]
eor $C[0],$C[0],$A[2][0]
eor $C[1],$C[1],$A[2][1]
eor $C[2],$C[2],$A[2][2]
eor $C[3],$C[3],$A[2][3]
eor $C[4],$C[4],$A[2][4]
eor $C[0],$C[0],$A[3][0]
eor $C[1],$C[1],$A[3][1]
eor $C[2],$C[2],$A[3][2]
eor $C[3],$C[3],$A[3][3]
eor $C[4],$C[4],$A[3][4]
eor $C[0],$C[0],$A[4][0]
eor $C[2],$C[2],$A[4][2]
eor $C[1],$C[1],$A[4][1]
eor $C[3],$C[3],$A[4][3]
eor $C[4],$C[4],$A[4][4]
eor $C[5],$C[0],$C[2],ror#63
eor $A[0][1],$A[0][1],$C[5]
eor $A[1][1],$A[1][1],$C[5]
eor $A[2][1],$A[2][1],$C[5]
eor $A[3][1],$A[3][1],$C[5]
eor $A[4][1],$A[4][1],$C[5]
eor $C[5],$C[1],$C[3],ror#63
eor $C[2],$C[2],$C[4],ror#63
eor $C[3],$C[3],$C[0],ror#63
eor $C[4],$C[4],$C[1],ror#63
eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
eor $A[1][2],$A[1][2],$C[5]
eor $A[2][2],$A[2][2],$C[5]
eor $A[3][2],$A[3][2],$C[5]
eor $A[4][2],$A[4][2],$C[5]
eor $A[0][0],$A[0][0],$C[4]
eor $A[1][0],$A[1][0],$C[4]
eor $A[2][0],$A[2][0],$C[4]
eor $A[3][0],$A[3][0],$C[4]
eor $A[4][0],$A[4][0],$C[4]
___
# The borrowed registers return to holding state: A[0][4]/A[1][4] are
# re-loaded from the stack first thing in the next heredoc.
$C[4]=undef;
$C[5]=undef;
$code.=<<___;
ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
eor $A[1][3],$A[1][3],$C[2]
eor $A[2][3],$A[2][3],$C[2]
eor $A[3][3],$A[3][3],$C[2]
eor $A[4][3],$A[4][3],$C[2]
eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
eor $A[1][4],$A[1][4],$C[3]
eor $A[2][4],$A[2][4],$C[3]
eor $A[3][4],$A[3][4],$C[3]
eor $A[4][4],$A[4][4],$C[3]
////////////////////////////////////////// Rho+Pi
mov $C[3],$A[0][1]
ror $A[0][1],$A[1][1],#$subrhotates[1][1]
//mov $C[1],$A[0][2]
ror $A[0][2],$A[2][2],#$subrhotates[2][2]
//mov $C[0],$A[0][3]
ror $A[0][3],$A[3][3],#$subrhotates[3][3]
//mov $C[2],$A[0][4]
ror $A[0][4],$A[4][4],#$subrhotates[4][4]
ror $A[1][1],$A[1][4],#$subrhotates[1][4]
ror $A[2][2],$A[2][3],#$subrhotates[2][3]
ror $A[3][3],$A[3][2],#$subrhotates[3][2]
ror $A[4][4],$A[4][1],#$subrhotates[4][1]
ror $A[1][4],$A[4][2],#$subrhotates[4][2]
ror $A[2][3],$A[3][4],#$subrhotates[3][4]
ror $A[3][2],$A[2][1],#$subrhotates[2][1]
ror $A[4][1],$A[1][3],#$subrhotates[1][3]
ror $A[4][2],$A[2][4],#$subrhotates[2][4]
ror $A[3][4],$A[4][3],#$subrhotates[4][3]
ror $A[2][1],$A[1][2],#$subrhotates[1][2]
ror $A[1][3],$A[3][1],#$subrhotates[3][1]
ror $A[2][4],$A[4][0],#$subrhotates[4][0]
ror $A[4][3],$A[3][0],#$subrhotates[3][0]
ror $A[1][2],$A[2][0],#$subrhotates[2][0]
ror $A[3][1],$A[1][0],#$subrhotates[1][0]
ror $A[1][0],$C[0],#$subrhotates[0][3]
ror $A[2][0],$C[3],#$subrhotates[0][1]
ror $A[3][0],$C[2],#$subrhotates[0][4]
ror $A[4][0],$C[1],#$subrhotates[0][2]
////////////////////////////////////////// Chi+Iota
bic $C[0],$A[0][2],$A[0][1]
bic $C[1],$A[0][3],$A[0][2]
bic $C[2],$A[0][0],$A[0][4]
bic $C[3],$A[0][1],$A[0][0]
eor $A[0][0],$A[0][0],$C[0]
bic $C[0],$A[0][4],$A[0][3]
eor $A[0][1],$A[0][1],$C[1]
ldr $C[1],[sp,#16]
eor $A[0][3],$A[0][3],$C[2]
eor $A[0][4],$A[0][4],$C[3]
eor $A[0][2],$A[0][2],$C[0]
ldr $C[3],[$C[1]],#8 // Iota[i++]
bic $C[0],$A[1][2],$A[1][1]
tst $C[1],#255 // are we done?
str $C[1],[sp,#16]
bic $C[1],$A[1][3],$A[1][2]
bic $C[2],$A[1][0],$A[1][4]
eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
bic $C[3],$A[1][1],$A[1][0]
eor $A[1][0],$A[1][0],$C[0]
bic $C[0],$A[1][4],$A[1][3]
eor $A[1][1],$A[1][1],$C[1]
eor $A[1][3],$A[1][3],$C[2]
eor $A[1][4],$A[1][4],$C[3]
eor $A[1][2],$A[1][2],$C[0]
bic $C[0],$A[2][2],$A[2][1]
bic $C[1],$A[2][3],$A[2][2]
bic $C[2],$A[2][0],$A[2][4]
bic $C[3],$A[2][1],$A[2][0]
eor $A[2][0],$A[2][0],$C[0]
bic $C[0],$A[2][4],$A[2][3]
eor $A[2][1],$A[2][1],$C[1]
eor $A[2][3],$A[2][3],$C[2]
eor $A[2][4],$A[2][4],$C[3]
eor $A[2][2],$A[2][2],$C[0]
bic $C[0],$A[3][2],$A[3][1]
bic $C[1],$A[3][3],$A[3][2]
bic $C[2],$A[3][0],$A[3][4]
bic $C[3],$A[3][1],$A[3][0]
eor $A[3][0],$A[3][0],$C[0]
bic $C[0],$A[3][4],$A[3][3]
eor $A[3][1],$A[3][1],$C[1]
eor $A[3][3],$A[3][3],$C[2]
eor $A[3][4],$A[3][4],$C[3]
eor $A[3][2],$A[3][2],$C[0]
bic $C[0],$A[4][2],$A[4][1]
bic $C[1],$A[4][3],$A[4][2]
bic $C[2],$A[4][0],$A[4][4]
bic $C[3],$A[4][1],$A[4][0]
eor $A[4][0],$A[4][0],$C[0]
bic $C[0],$A[4][4],$A[4][3]
eor $A[4][1],$A[4][1],$C[1]
eor $A[4][3],$A[4][3],$C[2]
eor $A[4][4],$A[4][4],$C[3]
eor $A[4][2],$A[4][2],$C[0]
bne .Loop
ldr x30,[sp,#24]
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_int,.-KeccakF1600_int
.globl KeccakF1600_hw
.type KeccakF1600_hw,%function
.align 5
KeccakF1600_hw:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48
str x0,[sp,#32] // offload argument
mov $C[0],x0
ldp $A[0][0],$A[0][1],[x0,#16*0]
ldp $A[0][2],$A[0][3],[$C[0],#16*1]
ldp $A[0][4],$A[1][0],[$C[0],#16*2]
ldp $A[1][1],$A[1][2],[$C[0],#16*3]
ldp $A[1][3],$A[1][4],[$C[0],#16*4]
ldp $A[2][0],$A[2][1],[$C[0],#16*5]
ldp $A[2][2],$A[2][3],[$C[0],#16*6]
ldp $A[2][4],$A[3][0],[$C[0],#16*7]
ldp $A[3][1],$A[3][2],[$C[0],#16*8]
ldp $A[3][3],$A[3][4],[$C[0],#16*9]
ldp $A[4][0],$A[4][1],[$C[0],#16*10]
ldp $A[4][2],$A[4][3],[$C[0],#16*11]
ldr $A[4][4],[$C[0],#16*12]
bl KeccakF1600_int
ldr $C[0],[sp,#32]
stp $A[0][0],$A[0][1],[$C[0],#16*0]
stp $A[0][2],$A[0][3],[$C[0],#16*1]
stp $A[0][4],$A[1][0],[$C[0],#16*2]
stp $A[1][1],$A[1][2],[$C[0],#16*3]
stp $A[1][3],$A[1][4],[$C[0],#16*4]
stp $A[2][0],$A[2][1],[$C[0],#16*5]
stp $A[2][2],$A[2][3],[$C[0],#16*6]
stp $A[2][4],$A[3][0],[$C[0],#16*7]
stp $A[3][1],$A[3][2],[$C[0],#16*8]
stp $A[3][3],$A[3][4],[$C[0],#16*9]
stp $A[4][0],$A[4][1],[$C[0],#16*10]
stp $A[4][2],$A[4][3],[$C[0],#16*11]
str $A[4][4],[$C[0],#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size KeccakF1600_hw,.-KeccakF1600_hw
___
}}}
# Append the identification string, then expand any `...` arithmetic
# expressions and print the finished assembly on STDOUT (which may be
# the arm-xlate.pl pipe).
$code.=<<___;
.asciz "Keccak-1600 permutation for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
for my $line (split "\n", $code) {
    $line =~ s/\`([^\`]*)\`/eval($1)/ge;
    print $line, "\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,718 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl size in bytes comp cycles[*] measured performance
# ====================================================================
# thumb 304 3212 4420
# armv4-small 392/+29% 1958/+64% 2250/+96%
# armv4-compact 740/+89% 1552/+26% 1840/+22%
# armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
# full unroll ~5100/+260% ~1260/+4% ~1300/+5%
# ====================================================================
# thumb = same as 'small' but in Thumb instructions[**] and
# with recurring code in two private functions;
# small = detached Xload/update, loops are folded;
# compact = detached Xload/update, 5x unroll;
# large = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*] Manually counted instructions in "grand" loop body. Measured
# performance is affected by prologue and epilogue overhead,
# i-cache availability, branch penalties, etc.
# [**] While each Thumb instruction is twice smaller, they are not as
# diverse as ARM ones: e.g., there are only two arithmetic
# instructions with 3 arguments, no [fixed] rotate, addressing
# modes are limited. As result it takes more instructions to do
# the same job in Thumb, therefore the code is never twice as
# small and always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
# issue Cortex A8 core was measured to process input block in
# ~990 cycles.
# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code. Cortex-A15
# is even faster spending 5.6 cycles per byte outperforming integer-
# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
# Two mandatory command-line arguments: the assembler "flavour" and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Unless the flavour is "void", pipe everything we print through the
# arm-xlate.pl translator; "void" writes the raw source directly.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Register map for the integer-only code path.
$ctx="r0"; # SHA-1 state pointer (1st argument)
$inp="r1"; # input pointer (2nd argument)
$len="r2"; # number of 64-byte blocks (3rd argument; turned into an end pointer below)
$a="r3"; # working variables a..e
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8"; # current round constant
$t0="r9"; # scratch
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14"; # descending pointer into the on-stack X[] schedule
@V=($a,$b,$c,$d,$e);
# Emit the message-schedule update shared by rounds 16..79, interleaved
# with the common part of the round (E += K + ROR(A,27) + X[i]).
# NOTE(review): the #15/#13/#7/#2 word offsets relative to the
# descending $Xi pointer together with the ror#31 (i.e. rol#1) realize
# X[i] = ROL(X[i-3]^X[i-8]^X[i-14]^X[i-16],1) -- verify against
# FIPS 180-4 if modifying.
# $opt1/$opt2 are round-family-specific F-function instructions spliced
# into the stream; only BODY_40_59 supplies $opt2 (an absent one
# interpolates as an empty operand-less line).
sub Xupdate {
# Locals deliberately shadow the global register names: callers pass a
# rotated view of @V.
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}
# One round for i in 0..15: fetch the next big-endian message word
# (byte-by-byte on pre-ARMv7, where unaligned ldr is unsafe; a single
# ldr plus rev byte-swap on little-endian ARMv7+), push it onto the
# on-stack schedule via $Xi, and apply the Ch-style F_00_19 function.
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
# Rounds 16..19: schedule update plus the Ch-style selection function
# F_00_19(B,C,D) = (B & (C^D)) ^ D.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
# Rounds 20..39 and 60..79: schedule update plus the parity function
# F_20_39(B,C,D) = B ^ C ^ D (Xupdate already computed C^D into $t1).
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}
# Rounds 40..59: schedule update plus the majority function, computed
# as (B & (C^D)) + (C & D) and accumulated into E in two adds.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}
# Integer-only entry point. The 80 rounds are folded: each 5x-unrolled
# loop body runs until $Xi wraps back to sp, and the 20_39/60_79 parity
# rounds share one loop distinguished by the carry flag (cleared by
# "cmn sp,#0", set by "cmp sp,#0").
$code=<<___;
#include <openssl/arm_arch.h>
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 5
sha1_block_data_order_nohw:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
# Rounds 0..14 in three passes of this 5x-unrolled loop body.
for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
# Round 15 plus rounds 16..19, emitted straight-line.
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# Shared parity-round loop body (rounds 20..39 and 60..79).
for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp @ preserve carry
#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
# Majority-round loop body (rounds 40..59).
for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
# Scalar registers for the NEON path: K-table pointer, current X[i]+K
# word, two scratch regs, schedule transfer pointer, and the saved sp.
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4; # index of the schedule quad being produced (mod tracking)
# Sliding window over the NEON message schedule; @X[-4..−1] hold the
# four most recent quads.
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13"); # vector temporaries
my ($K,$zero)=("q14","q15"); # broadcast round constant and all-zero
my $j=0; # scalar round counter used by the body_* closures
# Any call to an undefined sub (e.g. &vadd_i32(...)) lands here: the
# missing sub's name becomes the mnemonic (underscores turned into
# dots) and the arguments become the operand list appended to $code.
# A purely numeric final argument is prefixed with '#' to form an
# immediate operand.
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg); # numeric => immediate
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Returns a list of perl snippets -- eval'd one at a time by the
# Xupdate_*/Xloop scaffolding so scalar rounds interleave with vector
# schedule work -- implementing one integer SHA-1 round for rounds
# 0..19 (Ch function via bic/and/eor); rotates @V and bumps $j after
# each round.
sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&bic ($t0,$d,$b)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t1,$c,$b)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$t1,$t0)', # F_00_19
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_00_19
'$j++; unshift(@V,pop(@V));'
)
}
# Same contract as body_00_19, but for the parity rounds 20..39 and
# 60..79 (F = b^c^d); skips the X[i]+K prefetch on the final round.
sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&eor ($t0,$b,$d)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
'&eor ($t1,$t0,$c)', # F_20_39
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_20_39
'$j++; unshift(@V,pop(@V));'
)
}
# Same contract as body_00_19, but for the majority rounds 40..59,
# computed as (c&d) + ((c^d)&b) in two separate adds into e.
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t0,$c,$d)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$c,$d)',
'&add ($e,$e,$t0)',
'&and ($t1,$t1,$b)',
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_40_59
'$j++; unshift(@V,pop(@V));'
)
}
# Emit the NEON schedule update producing one quad of X[16..31] while
# eval'ing sixteen interleaved scalar-round snippets drawn from $body
# (four invocations of it). The X[]+K result is streamed to the
# on-stack transfer area at $Xfer for the scalar rounds to consume.
sub Xupdate_16_31 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
&vshl_u32 (@Tx[1],@Tx[1],2);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
# Like Xupdate_16_31, but for rounds 32..79 where the whole-quad
# rotate-by-2 trick (shr 30 + sli 2) applies; interleaves sixteen
# scalar-round snippets from $body and streams X[]+K to $Xfer.
sub Xupdate_32_79 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@X[0],@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
# Emit the final X[]+K transfer of the block, then the preamble for the
# next block: rewind the K table, test for end of input (re-reading the
# last block rather than running past the buffer), and start loading
# and byte-swapping the next 64 bytes. Resets $Xi for the next block.
sub Xuplast_80 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
&sub ($Xfer,$Xfer,64);
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
&it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[-4&7],@X[-4&7]);
foreach (@insns) { eval; } # remaining instructions
$Xi=0;
}
# Rounds with no schedule update (start of the next block): byte-swap
# one freshly loaded quad, add K, and hand X[]+K to the scalar rounds
# through the transfer area, interleaved with four $body snippets.
sub Xloop()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
foreach (@insns) { eval; }
$Xi++;
}
# NEON entry point: allocate an aligned scratch area below sp for the
# X[]+K transfer, prime the first three quads, then run the 80 rounds
# via the Xupdate_*/Xuplast/Xloop generators above.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha1_block_data_order_neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
# Rounds 0..79: schedule updates for 16..31 then 32..79, with the
# round-family bodies rotated through at the appropriate offsets; the
# tail (Xuplast/Xloop) overlaps the start of the next block's load.
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_32_79(\&body_00_19);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_20_39);
&Xuplast_80(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
it eq
moveq sp,$saved_sp
add $e,$e,$Ki
it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
# AArch32 view of the ARMv8 crypto extensions: state and temporaries
# live in q registers; sha1* mnemonics are later converted to raw
# encodings by unsha1() via the INST macro for old assemblers.
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7)); # four message-schedule quads
my @Kxx=map("q$_",(8..11)); # broadcast round constants
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xf,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.global sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 5
sha1_block_data_order_hw:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Quad-rounds 1..16: the sha1 variant cycles c/p/m/p per five
# iterations, the E and W registers ping-pong, and the @MSG window
# rotates; $j advances to the next K constant every five quad-rounds.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
# Final three quad-rounds and the accumulation back into the context.
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
#endif
___
}}}
# Base (ARM-mode, little-endian) encodings of the crypto-extension
# instructions; register numbers are OR'd in below.
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
# Convert a "sha1* qD[,qN],qM" line into a raw 32-bit encoding emitted
# through the INST() byte macro, so the code assembles even where the
# assembler lacks the crypto-extension mnemonics. NOTE(review): the
# shift/mask pairs place the split 4-bit register numbers into the
# NEON D/N/M fields -- verify against the Arm ARM if changing.
sub unsha1 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
# this fix-up provides Thumb encoding in conjunction with INST
$word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
# Post-process and emit: widen {qN[...]} addressing to d-register
# pairs, convert sha1* mnemonics to raw encodings, and translate
# ret/bx so the result still assembles as ARMv4.
for my $line (split $/, $code) {
    unless ($line =~ s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/e) {
        $line =~ s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/e;
    }
    $line =~ s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/ge;
    unless ($line =~ s/\bret\b/bx lr/) {
        # make it possible to compile with -march=armv4
        $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/;
    }
    print $line, $/;
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush

View File

@@ -0,0 +1,349 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# hardware-assisted software(*)
# Apple A7 2.31 4.13 (+14%)
# Cortex-A53 2.24 8.03 (+97%)
# Cortex-A57 2.35 7.88 (+74%)
# Denver 2.13 3.97 (+0%)(**)
# X-Gene 8.80 (+200%)
# Mongoose 2.05 6.50 (+160%)
# Kryo 1.88 8.00 (+90%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
# optimizes compiler output at run-time.
# Two mandatory command-line arguments: the assembler "flavour" and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;

# Match the sibling perlasm scripts in this tree: a flavour of "void"
# writes the untranslated source straight to $output instead of
# unconditionally requiring arm-xlate.pl to be present. Any other
# flavour pipes our output through the translator as before.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}

# Argument registers and working set for the AArch64 code paths.
($ctx,$inp,$num)=("x0","x1","x2"); # state pointer, input pointer, block count
@Xw=map("w$_",(3..17,19)); # 32-bit views of the message schedule (x18 reserved, skipped)
@Xx=map("x$_",(3..17,19)); # 64-bit views of the same registers
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); # working variables
($t0,$t1,$t2,$K)=map("w$_",(25..28)); # scratch and current round constant
# One scalar round for i in 0..19 (Ch function via bic/and/orr).
# Message words arrive as 64-bit loads: even rounds expose the upper
# 32 bits of the previous load as the next word, odd rounds byte-swap
# it on little-endian (rotate on big-endian). From i==14 onwards the
# schedule update for X[(i+2)&15] is interleaved; at i==19 the next
# round constant K_20_39 (0x6ed9eba1) is materialized.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
# even i<15: split out the high half of the 64-bit load
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
# even i<14: start the next 64-bit load (offset relative to the
# post-incremented $inp)
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
# odd i<14: normalize the freshly split word to host order
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __AARCH64EB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
# i>=14: same round body with the schedule update interleaved
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
# One scalar round for i in 40..59: majority function computed as
# ((b|c)&d)|(b&c), with the schedule update interleaved. At i==59 the
# final round constant K_60_79 (0xca62c1d6) is preloaded.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
# One scalar round for i in 20..39 and 60..79: parity function b^c^d.
# At i==39 the round constant K_40_59 (0x8f1bbcdc) is preloaded; the
# last two rounds (78/79) drop the schedule update and instead begin
# loading the saved hash state for the final accumulation.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
# Integer-only AArch64 entry point: save callee-saved registers, then
# emit all 80 rounds fully unrolled via the BODY_* generators (the
# saved state loaded by rounds 78/79 lands in @Xw[1..5] for the final
# accumulation).
$code.=<<___;
#include <openssl/arm_arch.h>
.text
.globl sha1_block_data_order_nohw
.type sha1_block_data_order_nohw,%function
.align 6
sha1_block_data_order_nohw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __AARCH64EB__
ror $Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
# Emit the 80 fully-unrolled rounds, rotating the working-variable
# view after each one.
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
# Hardware-assisted path using the ARMv8 SHA-1 crypto instructions
# (sha1c/sha1p/sha1m/sha1h/sha1su0/sha1su1).  Processes four message
# words per iteration; the round constants are loaded as vectors from
# .Lconst.
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
# Prologue, state/constant load, block-loop head and round 0.  Note that
# $j is still undefined here, so "@Kxx[$j]" interpolates as @Kxx[0].
$code.=<<___;
.globl sha1_block_data_order_hw
.type sha1_block_data_order_hw,%function
.align 6
sha1_block_data_order_hw:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adrp x4,:pg_hi21:.Lconst
add x4,x4,:lo12:.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
# Rounds 1..16: alternate the round flavour (c/p/m/p per group of five)
# and rotate the E0/E1, W0/W1 and @MSG register roles each iteration so
# no extra register copies are needed.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
# Tail rounds 17..19, state accumulation, loop close and the round
# constant table.
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_data_order_hw,.-sha1_block_data_order_hw
.section .rodata
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
}}}
{
# Base encodings of the ARMv8 SHA-1 instructions; the register fields
# (Rd, Rn, Rm) are OR-ed in by unsha1() below.
my %opcode = (
    "sha1c"   => 0x5e000000, "sha1p"   => 0x5e001000,
    "sha1m"   => 0x5e002000, "sha1su0" => 0x5e003000,
    "sha1h"   => 0x5e280800, "sha1su1" => 0x5e281800,
);

# Hand-assemble a "sha1*" mnemonic with q/v-register operands into a raw
# ".inst" directive, so the output assembles even when the assembler does
# not know the SHA extension.  Returns "" when the operands do not parse.
sub unsha1 {
    my ($mnemonic, $arg) = @_;

    return "" unless
        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/;

    # $rm is undef (hence 0 when shifted) for two-operand forms like sha1h.
    my ($rd, $rn, $rm) = ($1, $2, $3);
    my $word = $opcode{$mnemonic} | $rd | ($rn << 5) | ($rm << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
# Post-process the generated code line by line: evaluate `...` Perl
# expressions, hand-assemble sha1* mnemonics via unsha1(), strip the
# ".32"/".16b" type suffixes down to what plain assemblers accept, and
# print the result.
for my $line (split /\n/, $code) {
    $line =~ s/`([^`]*)`/eval($1)/ge;
    $line =~ s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/ge;
    if ($line =~ s/\.\w?32\b//) {
        $line =~ s/\.16b/.4s/g;
    }
    if ($line =~ m/(ld|st)1[^\[]+\[0\]/) {
        $line =~ s/\.4s/.s/g;
    }
    print $line, "\n";
}
# Flush buffered output and surface any write error.
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,744 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Command-line handling: the two mandatory arguments are the flavour and
# the output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# For a real flavour, pipe our output through arm-xlate.pl; otherwise
# write directly to the output file.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# NOTE: this relies on shell quoting of the interpolated paths; args are
# build-controlled, but the open is now checked for failure.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't pipe to $xlate: $!";
*STDOUT=*OUT;
} else {
# Three-arg open avoids mode injection via the file name and is checked.
open OUT,'>',$output or die "can't open $output: $!";
*STDOUT=*OUT;
}
# Register allocation.  $t0/$t1/$t3/$t4 alias the argument registers once
# those have been spilled; $Ktbl walks the K256 constant table.
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
# SHA-256 rotation/shift amounts for the Sigma/sigma functions.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
# One SHA-256 round for rounds 0..15 (also the round tail for 16..63 via
# BODY_16_XX).  For $i<16 it additionally emits the message-word load:
# a single rev'd ldr on ARMv7+, or four ldrb's assembled by hand on older
# cores.  Maj(a,b,c) from the previous round is folded in here
# ("from the past") to improve scheduling.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
# Message-word load + start of Sigma1(e); at $i==15 the input pointer is
# spilled so $t4 (r1) can be reused as a temporary.
$code.=<<___ if ($i<16);
#if __ARM_ARCH>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
# Round proper: h += X[i] + K[i] + Sigma1(e) + Ch(e,f,g); d += h;
# h += Sigma0(a); Maj is deferred to the next round.  At $i==31 the low
# byte of K (0xf2 is the last K256 byte) is tested as loop terminator.
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
# Swap the roles of $t2/$t3 so the deferred Maj value is picked up by the
# following round's "h+=Maj(a,b,c) from the past".
($t2,$t3)=($t3,$t2);
}
# SHA-256 rounds 16..63: compute the message-schedule word
# X[i] = sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1]) + X[i], interleaved
# with the start of the round, then fall through to BODY_00_15 for the
# round computation itself.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
# Shared round tail (h += X[i]+K[i]+Sigma1(e)+Ch; d += h; etc.).
&BODY_00_15(@_);
}
# File preamble, the K256 constant table (with a zero terminator used by
# the round-32 "done?" test), and the integer-only entry point.
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
.align 5
.global sha256_block_data_order_nohw
.type sha256_block_data_order_nohw,%function
sha256_block_data_order_nohw:
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
adr $Ktbl,K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
# Rounds 0..15 emitted straight-line; rounds 16..31 form the body of
# .Lrounds_16_xx, which loops four times (once per 16 schedule words).
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# After the last iteration: reload ctx/inp, accumulate the eight state
# words, store them, and loop to the next block (or unwind the frame).
$code.=<<___;
#if __ARM_ARCH>=7
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw
___
######################################################################
# NEON stuff
#
{{{
# NEON register map: @X holds the current 16 message words, $T0..$T5 are
# schedule temporaries ($T4/$T5 are D-register halves).
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# $Xfer (aliasing $t4/r1) walks the on-stack X[]+K[] transfer area;
# $j counts rounds for body_00_15 below.
my $Xfer=$t4;
my $j=0;
# Map a NEON quad register name "qN" to the D register holding its low
# (Dlo) or high (Dhi) 64 bits; yields "" when the argument is not a q-reg.
sub Dlo() { my $q = shift; return $q =~ m|q([1]?[0-9])| ? "d" . ($1 * 2) : ""; }
sub Dhi() { my $q = shift; return $q =~ m|q([1]?[0-9])| ? "d" . ($1 * 2 + 1) : ""; }
# Any call to an undefined sub like &vext_8(...) lands here: the sub name
# becomes the mnemonic ("_" -> "."), a bare numeric last argument gets a
# "#" immediate prefix, and the instruction text is appended to $code.
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# NEON message-schedule update for four words (X[0..3] += sigma0/sigma1
# terms), interleaved two-instructions-at-a-time with four scalar round
# bodies produced by $body (body_00_15).  sigma rotates are synthesized
# from vshr+vsli pairs since NEON has no vector rotate.
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
# "Rotate" the X[] register assignment for the next group of four words.
push(@X,shift(@X));
}
# Final 16 rounds: no schedule update needed, just byte-swap the freshly
# loaded message words, add the next K constants and stage X+K on the
# stack, again interleaved with four scalar round bodies.
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);

eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
# "Rotate" the X[] register assignment for the next group.
push(@X,shift(@X));
}
# Return the scalar SHA-256 round as a list of stringified Perl snippets,
# one instruction each, so Xupdate/Xpreload can interleave them with NEON
# code via eval.  X[i]+K[i] comes pre-summed from the stack ($t1); Maj is
# deferred one round, as in the integer path.  $j selects the source of
# the next $t1 load (stack word, K terminator at 15, saved ctx at 31).
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
# NEON entry point: set up an aligned on-stack X[]+K[] transfer area,
# load and byte-swap the first block, prime the first four X+K groups,
# then run the 48 schedule-updating rounds and 16 preload rounds.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.LK256_shortcut_neon:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add_neon+4)
#else
.word K256-(.LK256_add_neon+8)
#endif

.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
@ K256 is just at the boundary of being easily referenced by an ADR from
@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
@ not fit. By moving code around, we could make it fit, but this is too
@ fragile. For simplicity, just load the offset from
@ .LK256_shortcut_neon.
@
@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
@ support it. We might be able to emulate it with a macro, but Android's
@ did not work when I tried it.
@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
ldr $Ktbl,.LK256_shortcut_neon
.LK256_add_neon:
add $Ktbl,pc,$Ktbl
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48

.align 4
.L_00_48:
___
# Each Xupdate emits 16 rounds' worth of schedule updates interleaved
# with scalar round code; looped until the K256 zero terminator is seen.
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48

ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
# Final 16 rounds while the next block's words are byte-swapped/staged.
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
# Accumulate state, store it, and either loop to the next block or
# restore the original stack pointer and return.
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}

ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48

ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
# Hardware path using the ARMv8 crypto-extension sha256h/sha256h2/
# sha256su0/sha256su1 instructions, hand-encoded via unsha256() and the
# INST() macro so pre-v8 assemblers can still build this file.
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

# Entry point: locate K256, load the state, then per block load/byte-swap
# the 16 message words and offload the state for later accumulation.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif

.LK256_shortcut_hw:
@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
#if defined(__thumb2__)
.word K256-(.LK256_add_hw+4)
#else
.word K256-(.LK256_add_hw+8)
#endif

.global sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 5
sha256_block_data_order_hw:
@ K256 is too far to reference from one ADR command in Thumb mode. In
@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
@ boundary. For simplicity, just load the offset from .LK256_shortcut_hw.
ldr $Ktbl,.LK256_shortcut_hw
.LK256_add_hw:
add $Ktbl,pc,$Ktbl

vld1.32 {$ABCD,$EFGH},[$ctx]
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8

.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
# Twelve schedule-updating quad-rounds; W0/W1 and the @MSG registers are
# rotated each iteration.
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
# Final four quad-rounds (no more schedule updates), K pointer rewind,
# state accumulation and loop close.
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0

vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1

vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8

vst1.32 {$ABCD,$EFGH},[$ctx]

ret @ bx lr
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}}}
# Trailing identification string appended to the generated assembly.
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Echo this script's own leading comment block into the output, turning
# "#" comments into "@" assembly comments; stop at the first line that is
# neither a comment nor blank.  Was a bareword 2-arg open with no error
# check; use a checked 3-arg open on a lexical handle instead.
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close $self;
{
# Base encodings of the NEON SHA-256 crypto instructions; the q-register
# fields are OR-ed in by unsha256() below.
my %opcode = (
    "sha256h"   => 0xf3000c40, "sha256h2"  => 0xf3100c40,
    "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40,
);

# Hand-assemble a "sha256*" mnemonic with q-register operands into an
# INST(...) byte list.  ARMv7 instructions are always encoded
# little-endian; the correct solution would be the .inst directive, but
# older assemblers do not implement it, hence raw bytes.  Returns "" when
# the operands do not parse.
sub unsha256 {
    my ($mnemonic, $arg) = @_;

    return "" unless $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/;

    # $n is undef (contributing 0) for the two-operand sha256su0 form.
    my ($d, $n, $m) = ($1, $2, $3);
    my $word = $opcode{$mnemonic}
             | (($d & 7) << 13) | (($d & 8) << 19)
             | (($n & 7) << 17) | (($n & 8) << 4)
             | (($m & 7) << 1)  | (($m & 8) << 2);
    my @bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);
    return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t\@ %s %s",
                   @bytes, $mnemonic, $arg;
}
}
# Post-process the generated code line by line: evaluate `...` Perl
# expressions, hand-assemble sha256* mnemonics via unsha256(), and map
# "ret"/"bx lr" so the result also compiles with -march=armv4.
for my $line (split $/, $code) {
    $line =~ s/`([^`]*)`/eval $1/ge;
    $line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;
    unless ($line =~ s/\bret\b/bx lr/g) {
        $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/g;
    }
    print $line, "\n";
}
# Enforce flush and surface any write error.
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,922 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# May 2013.
#
# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm SIMD(*) x86_64(**)
# Pentium 100 97 61 - -
# PIII 75 77 56 - -
# P4 116 95 82 34.6 30.8
# AMD K8 54 55 36 20.7 9.57
# Core2 66 57 40 15.9 9.97
# Westmere 70 - 38 12.2 9.58
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
# Skylake 40 - 26 13.3 7.25
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
# Goldmont 80 - 48 19.5 12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
# (***) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that new code optimizes amount of writes, but at the
# cost of increased data cache "footprint" by 1/2KB.
# The first two arguments should always be the flavour and output file path.
# Command-line handling and perlasm bootstrap for the x86 SHA-512 module.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output=$ARGV[1];
# Three-arg open avoids mode injection via the file name and is checked
# (was an unchecked 2-arg open).
open STDOUT,'>',$output or die "can't open $output: $!";
&asm_init($ARGV[0]);
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
# x86 (IALU) path keeps the 64-bit state as lo/hi 32-bit stack halves;
# T is the scratch slot at the top of the frame.
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
$K512="ebp";
# SSE2 path keeps the state as full quadwords on the stack.
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on on-demand basis...
$BxC="mm2"; # ... except for B^C
# One SSE2/MMX SHA-512 round, modulo-scheduled over three phases:
# phase 0 is the first round of a block (f and g still to be loaded),
# phase 1 the steady state, phase 2 the tail variant that rotates the
# $A/$BxC register roles instead of moving h=T1.  a and e live in $A/$E;
# $BxC caches b^c for Maj(); the rest of the state sits on the stack in
# $Asse2..$Hsse2, and X[i] arrives in mm7.
sub BODY_00_15_sse2 {
my $phase=shift;
#&movq ("mm5",$Fsse2); # load f
#&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
&pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&pand ("mm5",$E); # f&=e
&psllq ($E,23); # $E is sliding left
&movq ($A,"mm3") if ($phase<2);
# This statement previously lacked its trailing semicolon and only parsed
# because the next line's leading "&" was consumed as the bitwise-AND
# operator; terminate it explicitly.
&movq (&QWP(8*9,"esp"),"mm7"); # save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm5","mm6"); # Ch(e,f,g)
&pxor ("mm3",$E);
&psllq ($E,23);
&pxor ("mm3","mm1");
&movq ($Asse2,$A); # modulo-scheduled save a
&paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
&pxor ("mm3",$E);
&psrlq ("mm1",23);
&paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
&psllq ($E,4);
&paddq ("mm7",&QWP(0,$K512)); # X[i]+=K512[i]
&pxor ("mm3",$E); # T1=Sigma1_512(e)
&movq ($E,$Dsse2); # e = load d, e in next round
&paddq ("mm3","mm7"); # T1+=X[i]
&movq ("mm5",$A); # %mm5 is sliding right
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # d += T1
&movq ("mm6",$A); # %mm6 is sliding left
&movq ("mm7","mm5");
&psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
&psrlq ("mm5",6);
&pxor ("mm7","mm6");
&sub ("esp",8);
&psllq ("mm6",5);
&pxor ("mm7","mm5");
&pxor ($A,"mm1"); # a^b, b^c in next round
&psrlq ("mm5",5);
&pxor ("mm7","mm6");
&pand ($BxC,$A); # (b^c)&(a^b)
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&pxor ($BxC,"mm1"); # [h=]Maj(a,b,c)
&pxor ("mm6","mm7"); # Sigma0_512(a)
&movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch
&movq ("mm5",$Fsse2) if ($phase==0); # load f
if ($phase>1) {
&paddq ($BxC,"mm6"); # h+=Sigma0(a)
&add ($K512,8);
#&paddq ($BxC,"mm3"); # h+=T1
($A,$BxC) = ($BxC,$A); # rotate registers
} else {
&paddq ("mm3",$BxC); # T1+=Maj(a,b,c)
&movq ($BxC,$A);
&add ($K512,8);
&paddq ("mm3","mm6"); # T1+=Sigma0(a)
&movq ("mm6",$Gsse2) if ($phase==0); # load g
#&movq ($A,"mm3"); # h=T1
}
}
# One SHA-512 round in pure 32-bit x86: each 64-bit quantity is handled
# as a lo/hi register pair (eax:ebx as the running value, esi/edi and
# ecx/edx as shift sources).  Computes T1 = Sigma1(e)+h+X[0]+K[i]+Ch,
# d += T1, then T1 += Sigma0(a)+Maj(a,b,c), leaving T1 in $Tlo/$Thi.
sub BODY_00_15_x86 {
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
# HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
&mov ("ecx",$Elo);
&mov ("edx",$Ehi);
&mov ("esi","ecx");
&shr ("ecx",9); # lo>>9
&mov ("edi","edx");
&shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
&shl ("edi",14); # hi<<14
&xor ("ebx","esi");
&shr ("ecx",14-9); # lo>>14
&xor ("eax","edi");
&shr ("edx",14-9); # hi>>14
&xor ("eax","ecx");
&shl ("esi",18-14); # lo<<18
&xor ("ebx","edx");
&shl ("edi",18-14); # hi<<18
&xor ("ebx","esi");
&shr ("ecx",18-14); # lo>>18
&xor ("eax","edi");
&shr ("edx",18-14); # hi>>18
&xor ("eax","ecx");
&shl ("esi",23-18); # lo<<23
&xor ("ebx","edx");
&shl ("edi",23-18); # hi<<23
&xor ("eax","esi");
&xor ("ebx","edi"); # T1 = Sigma1(e)
# T1 += h + X[0] + K[i] + Ch(e,f,g); Ch is computed as ((f^g)&e)^g.
&mov ("ecx",$Flo);
&mov ("edx",$Fhi);
&mov ("esi",$Glo);
&mov ("edi",$Ghi);
&add ("eax",$Hlo);
&adc ("ebx",$Hhi); # T1 += h
&xor ("ecx","esi");
&xor ("edx","edi");
&and ("ecx",$Elo);
&and ("edx",$Ehi);
&add ("eax",&DWP(8*(9+15)+0,"esp"));
&adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
&xor ("ecx","esi");
&xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
&mov ("esi",&DWP(0,$K512));
&mov ("edi",&DWP(4,$K512)); # K[i]
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Ch(e,f,g)
&mov ("ecx",$Dlo);
&mov ("edx",$Dhi);
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += K[i]
&mov ($Tlo,"eax");
&mov ($Thi,"ebx"); # put T1 away
&add ("eax","ecx");
&adc ("ebx","edx"); # d += T1
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
# HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ($Dlo,"eax");
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
&shr ("ecx",2); # lo>>2
&mov ("edi","edx");
&shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
&shl ("edi",4); # hi<<4
&xor ("ebx","esi");
&shr ("ecx",7-2); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-2); # hi>>7
&xor ("ebx","ecx");
&shl ("esi",25-4); # lo<<25
&xor ("eax","edx");
&shl ("edi",25-4); # hi<<25
&xor ("eax","esi");
&shr ("ecx",28-7); # lo>>28
&xor ("ebx","edi");
&shr ("edx",28-7); # hi>>28
&xor ("eax","ecx");
&shl ("esi",30-25); # lo<<30
&xor ("ebx","edx");
&shl ("edi",30-25); # hi<<30
&xor ("eax","esi");
&xor ("ebx","edi"); # Sigma0(a)
# T1 += Sigma0(a) + Maj(a,b,c); Maj computed as ((a|b)&c)|(a&b).
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ("esi",$Blo);
&mov ("edi",$Bhi);
&add ("eax",$Tlo);
&adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
&or ("ecx","esi");
&or ("edx","edi");
&and ("ecx",$Clo);
&and ("edx",$Chi);
&and ("esi",$Alo);
&and ("edi",$Ahi);
&or ("ecx","esi");
&or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Maj(a,b,c)
&mov ($Tlo,"eax");
&mov ($Thi,"ebx");
# Pre-fetch the low byte of *K (terminator test elsewhere), grow the
# sliding X[] frame by one quadword and advance the K pointer.
&mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
&sub ("esp",8);
&lea ($K512,&DWP(8,$K512)); # K++
}
# sha512_block_data_order(SHA512_CTX *ctx, const void *inp, size_t num):
# prologue and run-time dispatch.  Probes OPENSSL_ia32cap_P: no SSE2 ->
# .loop_x86; SSE2 but not (XMM && SSSE3) -> .loop_sse2; otherwise -> .SSSE3.
# The frame stores ctx, inp, inp+num*128 (end pointer) and the original esp
# before esp is aligned to 64 bytes.
&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K512);
&lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",7);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&mov ("ecx",&DWP(0,"edx"));
&test ("ecx",1<<26);
&jz (&label("loop_x86"));
&mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
&and ("ecx",1<<24); # XMM registers availability
&movq ("mm1",&QWP(8,"esi"));
&and ("edx",1<<9); # SSSE3 bit
&movq ($BxC,&QWP(16,"esi"));
&or ("ecx","edx");
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
&cmp ("ecx",1<<24|1<<9);
&je (&label("SSSE3"));
&sub ("esp",8*10);
&jmp (&label("loop_sse2"));
# SSE2/MMX path: one 128-byte block per iteration.  B^C ("magic") is kept
# pre-xored in $BxC so Maj() needs fewer operations per round.
&set_label("loop_sse2",16);
#&movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
&movq ($Csse2,$BxC);
&movq ($Dsse2,"mm3");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&movq ("mm3",$A); # magic
&mov ("eax",&DWP(0,"edi"));
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&mov ("edx",15); # counter
&bswap ("eax");
&bswap ("ebx");
&jmp (&label("00_14_sse2"));
# rounds 0-14: input word is byte-swapped in integer regs, the next one is
# loaded a round ahead to hide latency
&set_label("00_14_sse2",16);
&movd ("mm1","eax");
&mov ("eax",&DWP(0,"edi"));
&movd ("mm7","ebx");
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2();
&dec ("edx");
&jnz (&label("00_14_sse2"));
# round 15 (phase 1: no more input pre-fetch)
&movd ("mm1","eax");
&movd ("mm7","ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2(1);
&pxor ($A,$A); # A is in %mm3
&mov ("edx",32); # counter
&jmp (&label("16_79_sse2"));
# SSE2 rounds 16-79: message schedule (sigma0/sigma1) computed in MMX regs,
# interleaved with the round body.  2x unrolled, loop counter counts 32
# iterations of the unrolled pair (64 rounds).  Epilogue adds the working
# state back into ctx->h[0-7] and loops while edi < end pointer.
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
&movq ("mm6","mm5");
&psrlq ("mm5",6);
&psllq ("mm1",56);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm3","mm7");
&psrlq ("mm7",7-1);
&pxor ("mm3","mm1");
&psllq ("mm1",63-56);
&pxor ("mm3","mm7");
&psrlq ("mm7",8-7);
&pxor ("mm3","mm1");
&movq ("mm1","mm5");
&psrlq ("mm5",19-6);
&pxor ("mm7","mm3"); # sigma0
&psllq ("mm6",3);
&pxor ("mm1","mm5");
&paddq ("mm7",&QWP(8*(9+16),"esp"));
&pxor ("mm1","mm6");
&psrlq ("mm5",61-19);
&paddq ("mm7",&QWP(8*(9+16-9),"esp"));
&pxor ("mm1","mm5");
&psllq ("mm6",45-3);
&movq ("mm5",$Fsse2); # load f
&pxor ("mm1","mm6"); # sigma1
&movq ("mm6",$Gsse2); # load g
&paddq ("mm7","mm1"); # X[i]
#&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15
&BODY_00_15_sse2(2);
}
&dec ("edx");
&jnz (&label("16_79_sse2"));
#&movq ($A,$Asse2);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm1",$Bsse2);
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
&movq ("mm5",$Fsse2);
&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&mov ("eax",8*80);
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&lea ("esp",&DWP(0,"esp","eax")); # destroy frame
&sub ($K512,"eax"); # rewind K
&cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
&jb (&label("loop_sse2"));
&mov ("esp",&DWP(8*10+12,"esp")); # restore sp
&emms ();
&function_end_A();
# SSSE3 path prologue: builds the fixed 256-byte frame, byte-swaps the first
# block with pshufb, and pre-computes X[i]+K[i] for rounds 0-15 into the
# XMM->MM transfer area before entering the main loop.
&set_label("SSSE3",32);
{ my ($cnt,$frame)=("ecx","edx");
my @X=map("xmm$_",(0..7));
my $j;
my $i=0;
&lea ($frame,&DWP(-64,"esp"));
&sub ("esp",256);
# fixed stack frame layout
#
# +0 A B C D E F G H # backing store
# +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area
# +192 # XMM off-load ring buffer
# +256 # saved parameters
&movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask
&movdqu (@X[0],&QWP(0,"edi"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) {
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&jmp (&label("loop_ssse3"));
&nop ();
# per-block loop head: off-load working state to the frame backing store,
# advance the input pointer (ebx keeps the pointer for the *next* block,
# clamped to edi so the last block is re-loaded rather than over-read)
&set_label("loop_ssse3",32);
&movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1]
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3]
&lea ($K512,&DWP(16*8,$K512));
#&movq ($Asse2,$A); # off-load A-H
&movq ($Bsse2,"mm1");
&mov ("ebx","edi");
&movq ($Csse2,$BxC);
&lea ("edi",&DWP(128,"edi")); # advance input
&movq ($Dsse2,"mm3");
&cmp ("edi","eax");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&cmovb ("ebx","edi");
&movq ($Gsse2,"mm6");
&mov ("ecx",4); # loop counter
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&pxor ("mm3","mm3"); # magic
&jmp (&label("00_47_ssse3"));
# Returns a list of instruction-emitting Perl snippets (as strings) for one
# MMX round of the SSSE3 path.  The caller eval()s them one at a time so they
# can be interleaved with the XMM message-schedule code (software pipelining).
# Each snippet closes over $i, which the last snippet decrements; it also
# swaps $A/$BxC there, mirroring the register rotation of BODY_00_15_sse2.
sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
(
'&movq ("mm1",$E)', # %mm1 is sliding right
'&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
'&pxor ("mm5","mm6")', # f^=g
'&psrlq ("mm1",14)',
'&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e
'&pand ("mm5",$E)', # f&=e
'&psllq ($E,23)', # $E is sliding left
'&paddq ($A,"mm3")', # [h+=Maj(a,b,c)]
'&movq ("mm3","mm1")', # %mm3 is T1
'&psrlq("mm1",4)',
'&pxor ("mm5","mm6")', # Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psllq($E,23)',
'&pxor ("mm3","mm1")',
'&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a
'&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psrlq("mm1",23)',
'&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h
'&pxor ("mm3","mm1")',
'&psllq($E,4)',
'&pxor ("mm3",$E)', # T1=Sigma1_512(e)
'&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round
'&paddq ("mm3","mm7")', # T1+=X[i]
'&movq ("mm5",$A)', # %mm5 is sliding right
'&psrlq("mm5",28)',
'&paddq ($E,"mm3")', # d += T1
'&movq ("mm6",$A)', # %mm6 is sliding left
'&movq ("mm7","mm5")',
'&psllq("mm6",25)',
'&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b
'&psrlq("mm5",6)',
'&pxor ("mm7","mm6")',
'&psllq("mm6",5)',
'&pxor ("mm7","mm5")',
'&pxor ($A,"mm1")', # a^b, b^c in next round
'&psrlq("mm5",5)',
'&pxor ("mm7","mm6")',
'&pand ($BxC,$A)', # (b^c)&(a^b)
'&psllq("mm6",6)',
'&pxor ("mm7","mm5")',
'&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c)
'&pxor ("mm6","mm7")', # Sigma0_512(a)
'&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f
'&paddq ($BxC,"mm6")', # h+=Sigma0(a)
'&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g
'($A,$BxC) = ($BxC,$A); $i--;'
);
}
# SSSE3 main loop body.  Rounds 0-47: XMM units compute the message schedule
# (sigma0/sigma1 on two lanes at a time) while MMX snippets from
# BODY_00_15_ssse3 are interleaved via eval(); rounds 48-63 overlap with
# byte-swapping and pre-adding K to the *next* block (pointer in ebx, which
# equals edi for the final block).  Epilogue folds the working state into
# ctx->h[0-7] and loops while edi < end pointer (eax).
#
# Fix: the original `&cmp ("edi","eax")` lacked a terminating semicolon, so
# Perl parsed it together with the next line as `&cmp(...) & jb(...)` — a
# bitwise AND of two emitter calls that only worked by accident of
# left-to-right evaluation and AUTOLOAD.  The semicolon makes the two
# statements explicit; the emitted assembly is unchanged.
&set_label("00_47_ssse3",32);
for(;$j<16;$j++) {
my ($t0,$t2,$t1)=@X[2..4];
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa ($t2,@X[5]);
&movdqa (@X[1],$t0); # restore @X[1]
&palignr ($t0,@X[0],8); # X[1..2]
&movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4]
&palignr ($t2,@X[4],8); # X[9..10]
&movdqa ($t1,$t0);
&psrlq ($t0,7);
&paddq (@X[0],$t2); # X[0..1] += X[9..10]
&movdqa ($t2,$t1);
&psrlq ($t1,1);
&psllq ($t2,64-8);
&pxor ($t0,$t1);
&psrlq ($t1,8-1);
&pxor ($t0,$t2);
&psllq ($t2,8-1);
&pxor ($t0,$t1);
&movdqa ($t1,@X[7]);
&pxor ($t0,$t2); # sigma0(X[1..2])
&movdqa ($t2,@X[7]);
&psrlq ($t1,6);
&paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2])
&movdqa ($t0,@X[7]);
&psrlq ($t2,19);
&psllq ($t0,64-61);
&pxor ($t1,$t2);
&psrlq ($t2,61-19);
&pxor ($t1,$t0);
&psllq ($t0,61-19);
&pxor ($t1,$t2);
&movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
&pxor ($t1,$t0); # sigma0(X[1..2])
&movdqa ($t0,&QWP(16*($j%8),$K512));
eval(shift(@insns));
&paddq (@X[0],$t1); # X[0..1] += sigma0(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddq ($t0,@X[0]);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
&lea ($K512,&DWP(16*8,$K512));
&dec ("ecx");
&jnz (&label("00_47_ssse3"));
# rounds 48-63 overlap with byte-swap + K pre-add of the next block (ebx)
&movdqa (@X[1],&QWP(0,$K512)); # byte swap mask
&lea ($K512,&DWP(-80*8,$K512)); # rewind
&movdqu (@X[0],&QWP(0,"ebx"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) { # load next or same block
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&movq ($A,$Asse2); # load A-H
&movq ("mm1",$Bsse2);
&paddq ($A,"mm3"); # from BODY_00_15
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
#&movq ("mm5",$Fsse2);
#&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&cmp ("edi","eax"); # are we done yet?
&jb (&label("loop_ssse3"));
&mov ("esp",&DWP(64+12,$frame)); # restore sp
&emms ();
}
&function_end_A();
}
# Pure-x86 fallback loop (no SSE2).  Pushes the byte/qword-reversed input
# block onto the stack, copies ctx->h[0-7] down with `rep movsd`, runs 80
# rounds via BODY_00_15_x86 (round-16/80 boundaries detected by the low byte
# of K[i]: 0x94 ends round 15, 0x17 ends round 79), then adds the stack state
# back into ctx->h[0-7].
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",128);
&sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
&mov (&DWP(8*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&lea ("edi",&DWP(8,"esp"));
&mov ("ecx",16);
&data_word(0xA5F3F689); # rep movsd
&set_label("00_15_x86",16);
&BODY_00_15_x86();
&cmp (&LB("edx"),0x94);
&jne (&label("00_15_x86"));
&set_label("16_79_x86",16);
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
# HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
&mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",1); # lo>>1
&mov ("edi","edx");
&shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
&shl ("edi",24); # hi<<24
&xor ("ebx","esi");
&shr ("ecx",7-1); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-1); # hi>>7
&xor ("eax","ecx");
&shl ("esi",31-24); # lo<<31
&xor ("ebx","edx");
&shl ("edi",25-24); # hi<<25
&xor ("ebx","esi");
&shr ("ecx",8-7); # lo>>8
&xor ("eax","edi");
&shr ("edx",8-7); # hi>>8
&xor ("eax","ecx");
&shl ("edi",31-25); # hi<<31
&xor ("ebx","edx");
&xor ("eax","edi"); # T1 = sigma0(X[-15])
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ebx"); # put T1 away
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
# HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
&mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",6); # lo>>6
&mov ("edi","edx");
&shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
&shl ("edi",3); # hi<<3
&xor ("eax","esi");
&shr ("ecx",19-6); # lo>>19
&xor ("ebx","edi");
&shr ("edx",19-6); # hi>>19
&xor ("eax","ecx");
&shl ("esi",13-3); # lo<<13
&xor ("ebx","edx");
&shl ("edi",13-3); # hi<<13
&xor ("ebx","esi");
&shr ("ecx",29-19); # lo>>29
&xor ("eax","edi");
&shr ("edx",29-19); # hi>>29
&xor ("ebx","ecx");
&shl ("edi",26-13); # hi<<26
&xor ("eax","edx");
&xor ("eax","edi"); # sigma1(X[-2])
&mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
&add ("eax",&DWP(0,"esp"));
&adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
&mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
&mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += X[-16]
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += X[-7]
&mov (&DWP(8*(9+15)+0,"esp"),"eax");
&mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
&BODY_00_15_x86();
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_x86"));
&mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
&mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
# fold the on-stack working variables back into ctx->h[0-7]
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"esi"));
&mov ("ebx",&DWP($i*16+4,"esi"));
&mov ("ecx",&DWP($i*16+8,"esi"));
&mov ("edx",&DWP($i*16+12,"esi"));
&add ("eax",&DWP(8+($i*16)+0,"esp"));
&adc ("ebx",&DWP(8+($i*16)+4,"esp"));
&mov (&DWP($i*16+0,"esi"),"eax");
&mov (&DWP($i*16+4,"esi"),"ebx");
&add ("ecx",&DWP(8+($i*16)+8,"esp"));
&adc ("edx",&DWP(8+($i*16)+12,"esp"));
&mov (&DWP($i*16+8,"esi"),"ecx");
&mov (&DWP($i*16+12,"esi"),"edx");
}
&add ("esp",8*(9+16+80)); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop_x86"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
# SHA-512 round constants K[0..79] (FIPS 180-4), stored little-endian as
# (lo32,hi32) pairs, followed by the 16-byte pshufb byte-swap mask used by
# the SSSE3 path.  Kept in .text so the PIC-relative lea above can reach it.
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64
&data_word(0xec4d3b2f,0xb5c0fbcf); # u64
&data_word(0x8189dbbc,0xe9b5dba5); # u64
&data_word(0xf348b538,0x3956c25b); # u64
&data_word(0xb605d019,0x59f111f1); # u64
&data_word(0xaf194f9b,0x923f82a4); # u64
&data_word(0xda6d8118,0xab1c5ed5); # u64
&data_word(0xa3030242,0xd807aa98); # u64
&data_word(0x45706fbe,0x12835b01); # u64
&data_word(0x4ee4b28c,0x243185be); # u64
&data_word(0xd5ffb4e2,0x550c7dc3); # u64
&data_word(0xf27b896f,0x72be5d74); # u64
&data_word(0x3b1696b1,0x80deb1fe); # u64
&data_word(0x25c71235,0x9bdc06a7); # u64
&data_word(0xcf692694,0xc19bf174); # u64
&data_word(0x9ef14ad2,0xe49b69c1); # u64
&data_word(0x384f25e3,0xefbe4786); # u64
&data_word(0x8b8cd5b5,0x0fc19dc6); # u64
&data_word(0x77ac9c65,0x240ca1cc); # u64
&data_word(0x592b0275,0x2de92c6f); # u64
&data_word(0x6ea6e483,0x4a7484aa); # u64
&data_word(0xbd41fbd4,0x5cb0a9dc); # u64
&data_word(0x831153b5,0x76f988da); # u64
&data_word(0xee66dfab,0x983e5152); # u64
&data_word(0x2db43210,0xa831c66d); # u64
&data_word(0x98fb213f,0xb00327c8); # u64
&data_word(0xbeef0ee4,0xbf597fc7); # u64
&data_word(0x3da88fc2,0xc6e00bf3); # u64
&data_word(0x930aa725,0xd5a79147); # u64
&data_word(0xe003826f,0x06ca6351); # u64
&data_word(0x0a0e6e70,0x14292967); # u64
&data_word(0x46d22ffc,0x27b70a85); # u64
&data_word(0x5c26c926,0x2e1b2138); # u64
&data_word(0x5ac42aed,0x4d2c6dfc); # u64
&data_word(0x9d95b3df,0x53380d13); # u64
&data_word(0x8baf63de,0x650a7354); # u64
&data_word(0x3c77b2a8,0x766a0abb); # u64
&data_word(0x47edaee6,0x81c2c92e); # u64
&data_word(0x1482353b,0x92722c85); # u64
&data_word(0x4cf10364,0xa2bfe8a1); # u64
&data_word(0xbc423001,0xa81a664b); # u64
&data_word(0xd0f89791,0xc24b8b70); # u64
&data_word(0x0654be30,0xc76c51a3); # u64
&data_word(0xd6ef5218,0xd192e819); # u64
&data_word(0x5565a910,0xd6990624); # u64
&data_word(0x5771202a,0xf40e3585); # u64
&data_word(0x32bbd1b8,0x106aa070); # u64
&data_word(0xb8d2d0c8,0x19a4c116); # u64
&data_word(0x5141ab53,0x1e376c08); # u64
&data_word(0xdf8eeb99,0x2748774c); # u64
&data_word(0xe19b48a8,0x34b0bcb5); # u64
&data_word(0xc5c95a63,0x391c0cb3); # u64
&data_word(0xe3418acb,0x4ed8aa4a); # u64
&data_word(0x7763e373,0x5b9cca4f); # u64
&data_word(0xd6b2b8a3,0x682e6ff3); # u64
&data_word(0x5defb2fc,0x748f82ee); # u64
&data_word(0x43172f60,0x78a5636f); # u64
&data_word(0xa1f0ab72,0x84c87814); # u64
&data_word(0x1a6439ec,0x8cc70208); # u64
&data_word(0x23631e28,0x90befffa); # u64
&data_word(0xde82bde9,0xa4506ceb); # u64
&data_word(0xb2c67915,0xbef9a3f7); # u64
&data_word(0xe372532b,0xc67178f2); # u64
&data_word(0xea26619c,0xca273ece); # u64
&data_word(0x21c0c207,0xd186b8c7); # u64
&data_word(0xcde0eb1e,0xeada7dd6); # u64
&data_word(0xee6ed178,0xf57d4f7f); # u64
&data_word(0x72176fba,0x06f067aa); # u64
&data_word(0xa2c898a6,0x0a637dc5); # u64
&data_word(0xbef90dae,0x113f9804); # u64
&data_word(0x131c471b,0x1b710b35); # u64
&data_word(0x23047d84,0x28db77f5); # u64
&data_word(0x40c72493,0x32caab7b); # u64
&data_word(0x15c9bebc,0x3c9ebe0a); # u64
&data_word(0x9c100d4c,0x431d67c4); # u64
&data_word(0xcb3e42b6,0x4cc5d4be); # u64
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
&data_word(0x04050607,0x00010203); # byte swap
&data_word(0x0c0d0e0f,0x08090a0b); # mask
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,641 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On side note Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
# LO/HI resolve to assembler macros defined below per endianness.
$hi="HI";
$lo="LO";
# ====================================================================
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Route output through arm-xlate.pl unless the flavour is "void".
# Fix: both opens are now error-checked, and the plain-file case uses the
# three-argument form of open (the original `open OUT,">$output"` was
# unchecked two-argument open).
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
} else {
open OUT, '>', $output or die "can't open $output: $!";
*STDOUT=*OUT;
}
# Register allocation for the integer-only path: T/A/E are the live lo/hi
# pairs, the rest of the state lives on the stack at the offsets below.
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
# Byte offsets of the eight state words (and the X[] area) in the frame.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
# Emit one SHA-512 round for the integer-only ARMv4 path.  State words are
# 32-bit lo/hi register pairs; T ($Tlo/$Thi) carries the incoming X[i] and
# accumulates T1.  $magic is compared against the low byte of K[i].lo
# (0x94 = last K of round 15, 0x17 = last K of round 79); on match bit 0 of
# $Ktbl is set, which the caller tests to leave its round loop.
# Note: the heredoc below is emitted assembly and must stay byte-identical.
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
#if __ARM_ARCH>=7
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
# Emitted assembly for the integer-only implementation: file preamble,
# the K512 constant table (stored as endianness-aware lo/hi word pairs via
# the WORD64 macro), and sha512_block_data_order_nohw itself, which runs
# rounds 0-15 (.L00_15) and 16-79 with message schedule (.L16_79) around
# BODY_00_15, then folds the stack state back into the context.
# The heredoc bodies are emitted assembly and must stay byte-identical.
$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code 32
#endif
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.global sha512_block_data_order_nohw
.type sha512_block_data_order_nohw,%function
sha512_block_data_order_nohw:
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
adr $Ktbl,K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
# rounds 0-15; 0x94 is the low byte of K[15].lo (0xcf692694)
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
# rounds 16-79; 0x17 is the low byte of K[79].lo (0x4a475817)
&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH>=7
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
# Emit one SHA-512 round for the NEON path; each state word occupies one
# 64-bit d-register lane.  For rounds >=16 the leading Sigma1 shifts of even
# rounds are emitted by NEON_16_79 instead (interleaved with the message
# schedule), hence the ($i<16 || $i&1) guard on the first fragment.  The
# h+=Maj(a,b,c) of the *previous* round is deferred into this one ("h+=Maj
# from the past") to shorten the critical path.
# Note: the heredoc bodies are emitted assembly and must stay byte-identical.
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
# NEON_16_79: emit a message-schedule update plus one round for rounds
# 16..79. Odd rounds are delegated straight to NEON_00_15; even rounds
# compute two schedule words at once (the 2x-vectorized sigma0/sigma1
# over q registers) interleaved with the start of the round, then fall
# through to NEON_00_15 for the remainder of the round.
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
# X[i] += sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]); the Sigma1(e) shifts
# of the upcoming round (d0..d2) are interleaved to hide latency, and the
# deferred h+=Maj (in d30) from the previous round is folded in.
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
# Finish the round proper (schedule word for this round is now ready).
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Post-process the generated code: expand `...` arithmetic, then rewrite
# instructions so the output also assembles for pre-ARMv5 targets
# (bx lr -> raw opcode) while staying Thumb-interoperable (ret -> bx lr).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

# Re-emit this script's leading comment block (converting '#' to the
# ARM assembler's '@' comment marker) so the generated file carries the
# same copyright header; the loop stops at the first code line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT or die "error closing STDOUT: $!";	# enforce flush

View File

@@ -0,0 +1,583 @@
#! /usr/bin/env perl
# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
# indication of some compiler "pathology", most notably code
# generated with -mgeneral-regs-only is significantly faster
# and the gap is only 40-90%.
# Command-line handling: the first argument selects the perlasm
# "flavour" (target/ABI variant), the second is the output file path.
# For a real flavour the generated source is piped through arm-xlate.pl;
# the "void"/empty flavour writes raw output directly to the file.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output = shift;

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl relative to this script's own path.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly if the translator pipe cannot be started; previously a
    # failed open was ignored and output silently went nowhere.
    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
    *STDOUT=*OUT;
} else {
    # Three-arg open prevents mode injection via the output path and the
    # error check catches unwritable destinations up front.
    open OUT, '>', $output or die "can't open $output: $!";
    *STDOUT=*OUT;
}
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="sha${BITS}_block_data_order_nohw";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
# BODY_00_xx: emit one round of the SHA-256/512 compression function for
# the scalar AArch64 path. Rounds 0..15 byte-swap and consume input words
# directly; rounds >=15 also run the message-schedule update for word
# X[j]. $t2/$t3 are swapped at the end so the Ch/Maj scratch registers
# alternate between consecutive rounds. Arguments: the round index $i
# followed by the eight working variables (rotated by the caller).
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
   $T0=@X[$i+3] if ($i<11);

# Input words arrive big-endian per FIPS 180-4; swap on little-endian.
$code.=<<___	if ($i<16);
#ifndef	__AARCH64EB__
	rev	@X[$i],@X[$i]			// $i
#endif
___
# Input loads are paired (ldp) and spread across the early rounds; four
# words are spilled to the stack and reloaded later to free registers.
$code.=<<___	if ($i<13 && ($i&1));
	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___	if ($i==13);
	ldp	@X[14],@X[15],[$inp]
___
$code.=<<___	if ($i>=14);
	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___	if ($i>0 && $i<16);
	add	$a,$a,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=11);
	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___	if ($i<15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
	and	$t1,$f,$e
	bic	$t2,$g,$e
	add	$h,$h,@X[$i&15]			// h+=X[i]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
	add	$h,$h,$t0			// h+=Sigma1(e)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	add	$d,$d,$h			// d+=h
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	//add	$h,$h,$t1			// h+=Sigma0(a)
___
# Steady-state round: same dataflow as above, interleaved with the
# schedule update X[j] += sigma0(X[j+1]) + X[j+9] + sigma1(X[j+14]).
$code.=<<___	if ($i>=15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	ror	$T1,@X[($j+1)&15],#$sigma0[0]
	and	$t1,$f,$e
	ror	$T2,@X[($j+14)&15],#$sigma1[0]
	bic	$t2,$g,$e
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,@X[$i&15]			// h+=X[i]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
	eor	$T0,$T0,$a,ror#$Sigma0[1]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
	add	$h,$h,$t0			// h+=Sigma1(e)
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
	add	@X[$j],@X[$j],@X[($j+9)&15]
	add	$d,$d,$h			// d+=h
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	add	@X[$j],@X[$j],$T1
	add	$h,$h,$t1			// h+=Sigma0(a)
	add	@X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);	# alternate Ch/Maj scratch registers each round
}
$code.=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#endif
.text
.globl $func
.type $func,%function
.align 6
$func:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adrp $Ktbl,:pg_hi21:.LK$BITS
add $Ktbl,$Ktbl,:lo12:.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
AARCH64_VALIDATE_LINK_REGISTER
ret
.size $func,.-$func
.section .rodata
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha256_block_data_order_hw
.type sha256_block_data_order_hw,%function
.align 6
sha256_block_data_order_hw:
#ifdef BORINGSSL_DISPATCH_TEST
.extern BORINGSSL_function_hit
adrp x9,:pg_hi21:BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#6] // kFlag_sha256_hw
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adrp $Ktbl,:pg_hi21:.LK256
add $Ktbl,$Ktbl,:lo12:.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_data_order_hw,.-sha256_block_data_order_hw
#endif
___
}
if ($SZ==8) {
my $Ktbl="x3";
my @H = map("v$_.16b",(0..4));
my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
my @MSG=map("v$_.16b",(16..23));
my ($W0,$W1)=("v24.2d","v25.2d");
my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
$code.=<<___;
.text
#ifndef __KERNEL__
.globl sha512_block_data_order_hw
.type sha512_block_data_order_hw,%function
.align 6
sha512_block_data_order_hw:
#ifdef BORINGSSL_DISPATCH_TEST
.extern BORINGSSL_function_hit
adrp x9,:pg_hi21:BORINGSSL_function_hit
add x9, x9, :lo12:BORINGSSL_function_hit
mov w10, #1
strb w10, [x9,#8] // kFlag_sha512_hw
#endif
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
ld1 {@MSG[4]-@MSG[7]},[$inp],#64
ld1.64 {@H[0]-@H[3]},[$ctx] // load context
adrp $Ktbl,:pg_hi21:.LK512
add $Ktbl,$Ktbl,:lo12:.LK512
rev64 @MSG[0],@MSG[0]
rev64 @MSG[1],@MSG[1]
rev64 @MSG[2],@MSG[2]
rev64 @MSG[3],@MSG[3]
rev64 @MSG[4],@MSG[4]
rev64 @MSG[5],@MSG[5]
rev64 @MSG[6],@MSG[6]
rev64 @MSG[7],@MSG[7]
b .Loop_hw
.align 4
.Loop_hw:
ld1.64 {$W0},[$Ktbl],#16
subs $num,$num,#1
sub x4,$inp,#128
orr $AB,@H[0],@H[0] // offload
orr $CD,@H[1],@H[1]
orr $EF,@H[2],@H[2]
orr $GH,@H[3],@H[3]
csel $inp,$inp,x4,ne // conditional rewind
___
for($i=0;$i<32;$i++) {
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1.64 {$W1},[$Ktbl],#16
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512su0 @MSG[0],@MSG[1]
ext $m9_10,@MSG[4],@MSG[5],#8
sha512h @H[3],$fg,$de
sha512su1 @MSG[0],@MSG[7],$m9_10
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1.64 {$W1},[$Ktbl],#16
___
$code.=<<___ if ($i==39);
sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind
___
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1 {@MSG[0]},[$inp],#16 // load next input
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512h @H[3],$fg,$de
rev64 @MSG[0],@MSG[0]
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
$code.=<<___;
add.i64 @H[0],@H[0],$AB // accumulate
add.i64 @H[1],@H[1],$CD
add.i64 @H[2],@H[2],$EF
add.i64 @H[3],@H[3],$GH
cbnz $num,.Loop_hw
st1.64 {@H[0]-@H[3]},[$ctx] // store context
ldr x29,[sp],#16
ret
.size sha512_block_data_order_hw,.-sha512_block_data_order_hw
#endif
___
}
{
# Base AArch64 encodings for the SHA-256 crypto-extension mnemonics;
# register numbers are OR-ed into the Rd/Rn/Rm fields below.
my %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

# unsha256 encodes a SHA-256 crypto instruction as a raw .inst word so
# the output assembles even with toolchains lacking the sha2 extension.
# Returns false (empty string) when the operand string does not parse.
sub unsha256 {
    my ($mnemonic, $arg) = @_;

    return '' unless
	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o;

    # Third operand is optional (e.g. sha256su0 takes only two registers).
    my $word = $opcode{$mnemonic} | $1 | ($2 << 5) | ($3 << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
{
# Base AArch64 encodings for the SHA-512 crypto-extension mnemonics;
# register numbers are OR-ed into the Rd/Rn/Rm fields below.
my %opcode = (
	"sha512h"	=> 0xce608000,	"sha512h2"	=> 0xce608400,
	"sha512su0"	=> 0xcec08000,	"sha512su1"	=> 0xce608800	);

# unsha512 encodes a SHA-512 crypto instruction as a raw .inst word so
# the output assembles even with toolchains lacking the sha3 extension.
# Returns false (empty string) when the operand string does not parse.
sub unsha512 {
    my ($mnemonic, $arg) = @_;

    return '' unless
	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o;

    # Third operand is optional (e.g. sha512su0 takes only two registers).
    my $word = $opcode{$mnemonic} | $1 | ($2 << 5) | ($3 << 16);
    return sprintf ".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg;
}
}
# Re-emit this script's leading comment block (converting '#' to C-style
# '//') so the generated assembly carries the same copyright header; the
# loop stops at the first code line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

# Post-process the generated code line by line: expand `...` arithmetic,
# replace SHA-2 crypto mnemonics with raw .inst words for old assemblers,
# and normalize NEON register syntax.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	# Translate crypto-extension mnemonics into .inst encodings.
	s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge	or
	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

	s/\bq([0-9]+)\b/v$1.16b/g;		# old->new registers
	s/\.[ui]?8(\s)/$1/;
	# Move the element-size suffix from the mnemonic onto the register
	# names (.64 -> .2d, .32 -> .4s), with ext/lane-load exceptions.
	s/\.\w?64\b//		and s/\.16b/\.2d/g	or
	s/\.\w?32\b//		and s/\.16b/\.4s/g;
	m/\bext\b/		and s/\.2d/\.16b/g	or
	m/(ld|st)1[^\[]+\[0\]/	and s/\.4s/\.s/g;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,634 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#ifndef OPENSSL_HEADER_SHA_INTERNAL_H
#define OPENSSL_HEADER_SHA_INTERNAL_H
#include <openssl/base.h>
#include <openssl/hmac.h>
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
// Internal SHA2 constants
// SHA*_CHAINING_LENGTH is the chaining length in bytes of SHA-*
// It corresponds to the length in bytes of the h part of the state
#define SHA1_CHAINING_LENGTH 20
#define SHA224_CHAINING_LENGTH 32
#define SHA256_CHAINING_LENGTH 32
#define SHA384_CHAINING_LENGTH 64
#define SHA512_CHAINING_LENGTH 64
#define SHA512_224_CHAINING_LENGTH 64
#define SHA512_256_CHAINING_LENGTH 64
// SHA3 constants, from NIST FIPS202.
// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
#define KECCAK1600_ROWS 5
#define KECCAK1600_WIDTH 1600

#define SHA3_224_CAPACITY_BYTES 56
#define SHA3_224_CBLOCK SHA3_BLOCKSIZE(SHA3_224_DIGEST_BITLENGTH)
#define SHA3_224_DIGEST_BITLENGTH 224
#define SHA3_224_DIGEST_LENGTH 28

#define SHA3_256_CAPACITY_BYTES 64
#define SHA3_256_CBLOCK SHA3_BLOCKSIZE(SHA3_256_DIGEST_BITLENGTH)
#define SHA3_256_DIGEST_BITLENGTH 256
#define SHA3_256_DIGEST_LENGTH 32

#define SHA3_384_CAPACITY_BYTES 96
#define SHA3_384_CBLOCK SHA3_BLOCKSIZE(SHA3_384_DIGEST_BITLENGTH)
#define SHA3_384_DIGEST_BITLENGTH 384
#define SHA3_384_DIGEST_LENGTH 48

#define SHA3_512_CAPACITY_BYTES 128
#define SHA3_512_CBLOCK SHA3_BLOCKSIZE(SHA3_512_DIGEST_BITLENGTH)
#define SHA3_512_DIGEST_BITLENGTH 512
#define SHA3_512_DIGEST_LENGTH 64

// SHA3 rate in bytes: (1600 - 2*bitlen)/8. Fully parenthesized (both the
// whole expansion and the |bitlen| argument) so the macro groups correctly
// when used next to other operators, matching SHAKE*_BLOCKSIZE below; the
// previous form "(KECCAK1600_WIDTH - bitlen * 2) / 8" left the division
// exposed to precedence at the call site.
#define SHA3_BLOCKSIZE(bitlen) (((KECCAK1600_WIDTH) - ((bitlen) * 2)) / 8)
#define SHA3_PAD_CHAR 0x06

// SHAKE constants, from NIST FIPS202.
// https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
#define SHAKE_PAD_CHAR 0x1F
#define SHAKE128_BLOCKSIZE ((KECCAK1600_WIDTH - 128 * 2) / 8)
#define SHAKE256_BLOCKSIZE ((KECCAK1600_WIDTH - 256 * 2) / 8)
#define XOF_BLOCKBYTES SHAKE128_BLOCKSIZE

// SHAKE128 has the maximum block size among the SHA3/SHAKE algorithms.
#define SHA3_MAX_BLOCKSIZE SHAKE128_BLOCKSIZE
// Define state flag values for Keccak-based functions
#define KECCAK1600_STATE_ABSORB 0
// KECCAK1600_STATE_SQUEEZE is set when |SHAKE_Squeeze| is called.
// It remains set while |SHAKE_Squeeze| is called repeatedly to output
// chunks of the XOF output.
#define KECCAK1600_STATE_SQUEEZE 1
// KECCAK1600_STATE_FINAL is set once |SHAKE_Final| is called
// so that |SHAKE_Squeeze| cannot be called anymore.
#define KECCAK1600_STATE_FINAL 2
typedef struct keccak_ctx_st KECCAK1600_CTX;

// Context for incremental SHA3/SHAKE computation (absorb/squeeze phases).
// The data buffer should have at least the maximum number of
// block size bytes to fit any SHA3/SHAKE block length.
struct keccak_ctx_st {
  uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS];  // 5x5 lanes of Keccak-1600 state
  size_t block_size;                       // cached ctx->digest->block_size
  size_t md_size;          // output length, variable in XOF (SHAKE)
  size_t buf_load;         // used bytes in below buffer
  uint8_t buf[SHA3_MAX_BLOCKSIZE];  // should have at least the max data block size bytes
  uint8_t pad;             // padding character (SHA3_PAD_CHAR or SHAKE_PAD_CHAR)
  uint8_t state;           // denotes the keccak phase (absorb, squeeze, final)
};
// To avoid externalizing KECCAK1600_CTX, we hard-code the context size in
// hmac.h's |md_ctx_union| and use a compile time check here to make sure
// |KECCAK1600_CTX|'s size never exceeds that of |md_ctx_union|. This means
// that whenever a new field is added to |keccak_ctx_st| we must also update
// the hard-coded size of |sha3| in hmac.h's |md_ctx_union| with the new
// value given by |sizeof(keccak_ctx_st)|.
OPENSSL_STATIC_ASSERT(sizeof(KECCAK1600_CTX) <= sizeof(union md_ctx_union),
hmac_md_ctx_union_sha3_size_needs_update)
// KECCAK1600 x4 batched context structure
typedef struct keccak_ctx_st_x4 KECCAK1600_CTX_x4;

struct keccak_ctx_st_x4 {
  // Four independent 5x5-lane Keccak-1600 states, permuted as a batch.
  uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS];
};
// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is
// defined in assembly.
#if defined(OPENSSL_PPC64LE)
#define SHA1_ALTIVEC
void sha1_block_data_order(uint32_t *state, const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA1_ASM_NEON
void sha1_block_data_order_neon(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA256_ASM_NEON
void sha256_block_data_order_neon(uint32_t state[8], const uint8_t *data,
size_t num);
// Armv8.2 SHA-512 instructions are not available in 32-bit.
#define SHA512_ASM_NEON
void sha512_block_data_order_neon(uint64_t state[8], const uint8_t *data,
size_t num);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA1_capable();
}
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA256_capable();
}
#define SHA512_ASM_HW
OPENSSL_INLINE int sha512_hw_capable(void) {
return CRYPTO_is_ARMv8_SHA512_capable();
}
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA1_ASM_SSSE3
OPENSSL_INLINE int sha1_ssse3_capable(void) {
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
}
void sha1_block_data_order_ssse3(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_AVX
OPENSSL_INLINE int sha1_avx_capable(void) {
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
//
// TODO(davidben): Should we enable SHAEXT on 32-bit x86?
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
CRYPTO_is_FXSR_capable();
}
void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_SSSE3
OPENSSL_INLINE int sha256_ssse3_capable(void) {
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_SSSE3_capable() && CRYPTO_is_FXSR_capable();
}
void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA256_ASM_AVX
OPENSSL_INLINE int sha256_avx_capable(void) {
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
//
// TODO(davidben): Should we enable SHAEXT on 32-bit x86?
// TODO(davidben): Do we need to check the FXSR bit? The Intel manual does not
// say to.
return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu() &&
CRYPTO_is_FXSR_capable();
}
void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
size_t num);
// TODO(crbug.com/boringssl/673): Move the remaining CPU dispatch to C.
#define SHA512_ASM
void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
size_t num_blocks);
#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define SHA1_ASM_NOHW
#define SHA256_ASM_NOHW
#define SHA512_ASM_NOHW
#define SHA1_ASM_HW
OPENSSL_INLINE int sha1_hw_capable(void) {
return CRYPTO_is_SHAEXT_capable() && CRYPTO_is_SSSE3_capable();
}
#define SHA1_ASM_AVX2
OPENSSL_INLINE int sha1_avx2_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX2 imply SSSE3?
// * sha1_block_data_order_avx2 does not seem to use SSSE3 instructions.
return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() &&
CRYPTO_is_BMI1_capable() && CRYPTO_is_SSSE3_capable();
}
void sha1_block_data_order_avx2(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_AVX
OPENSSL_INLINE int sha1_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha1_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha1_block_data_order_avx(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA1_ASM_SSSE3
OPENSSL_INLINE int sha1_ssse3_capable(void) {
return CRYPTO_is_SSSE3_capable();
}
void sha1_block_data_order_ssse3(uint32_t state[5], const uint8_t *data,
size_t num);
#define SHA256_ASM_HW
OPENSSL_INLINE int sha256_hw_capable(void) {
return CRYPTO_is_SHAEXT_capable();
}
#define SHA256_ASM_AVX
OPENSSL_INLINE int sha256_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha256_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha256_block_data_order_avx(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA256_ASM_SSSE3
OPENSSL_INLINE int sha256_ssse3_capable(void) {
return CRYPTO_is_SSSE3_capable();
}
void sha256_block_data_order_ssse3(uint32_t state[8], const uint8_t *data,
size_t num);
#define SHA512_ASM_AVX
OPENSSL_INLINE int sha512_avx_capable(void) {
// TODO: Simplify this logic, which was extracted from the assembly:
// * Does AVX imply SSSE3?
// * sha512_block_data_order_avx does not seem to use SSSE3 instructions.
// Pre-Zen AMD CPUs had slow SHLD/SHRD; Zen added the SHA extension; see the
// discussion in sha1-586.pl.
return CRYPTO_is_AVX_capable() && CRYPTO_is_SSSE3_capable() &&
CRYPTO_is_intel_cpu();
}
void sha512_block_data_order_avx(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA1_ASM_HW)
void sha1_block_data_order_hw(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
#if defined(SHA1_ASM_NOHW)
void sha1_block_data_order_nohw(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
#if defined(SHA256_ASM_HW)
void sha256_block_data_order_hw(uint32_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA256_ASM_NOHW)
void sha256_block_data_order_nohw(uint32_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_HW)
void sha512_block_data_order_hw(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if defined(SHA512_ASM_NOHW)
void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *data,
size_t num);
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(OPENSSL_AARCH64)
#define KECCAK1600_ASM
#if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
#define KECCAK1600_S2N_BIGNUM_ASM
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#endif
#endif
#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#if defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)
#define KECCAK1600_ASM
#define KECCAK1600_S2N_BIGNUM_ASM
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#endif
#endif
#endif
// SHAx_Init_from_state is a low-level function that initializes |sha| with a
// custom state. |h| is the hash state in big endian. |n| is the number of bits
// processed at this point. It must be a multiple of |SHAy_CBLOCK*8|,
// where SHAy=SHA1 if SHAx=SHA1, SHAy=SHA256 if SHAx=SHA224 or SHA256, and
// SHAy=SHA512 otherwise.
// This function returns one on success and zero on error.
// This function is for internal use only and should never be directly called.
OPENSSL_EXPORT int SHA1_Init_from_state(
SHA_CTX *sha, const uint8_t h[SHA1_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA224_Init_from_state(
SHA256_CTX *sha, const uint8_t h[SHA224_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA256_Init_from_state(
SHA256_CTX *sha, const uint8_t h[SHA256_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA384_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA384_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_224_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_224_CHAINING_LENGTH], uint64_t n);
OPENSSL_EXPORT int SHA512_256_Init_from_state(
SHA512_CTX *sha, const uint8_t h[SHA512_256_CHAINING_LENGTH], uint64_t n);
// SHAx_get_state is a low-level function that exports the hash state in big
// endian into |out_h| and the number of bits processed at this point in
// |out_n|. |SHAx_Final| must not have been called before (otherwise results
// are not guaranteed). Furthermore, the number of bytes processed by
// |SHAx_Update| must be a multiple of the block length |SHAy_CBLOCK| and
// must be less than 2^61 (otherwise it fails). See comment above about
// |SHAx_Init_from_state| for the definition of SHAy.
// This function returns one on success and zero on error.
// This function is for internal use only and should never be directly called.
OPENSSL_EXPORT int SHA1_get_state(
SHA_CTX *ctx, uint8_t out_h[SHA1_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA224_get_state(
SHA256_CTX *ctx, uint8_t out_h[SHA224_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA256_get_state(
SHA256_CTX *ctx, uint8_t out_h[SHA256_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA384_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA384_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA512_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_CHAINING_LENGTH], uint64_t *out_n);
OPENSSL_EXPORT int SHA512_224_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_224_CHAINING_LENGTH],
uint64_t *out_n);
OPENSSL_EXPORT int SHA512_256_get_state(
SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENGTH],
uint64_t *out_n);
/*
* SHA3/SHAKE single-shot APIs implement SHA3 functionalities on top
* of SHA3/SHAKE API layer
*
* SHA3/SHAKE single-shot functions never fail when the later call-discipline is
* adhered to: (a) the pointers passed to the functions are valid.
*/
// SHA3_224 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_224_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_224| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_224(const uint8_t *data, size_t len,
uint8_t out[SHA3_224_DIGEST_LENGTH]);
// SHA3_256 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_256_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_256| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_256(const uint8_t *data, size_t len,
uint8_t out[SHA3_256_DIGEST_LENGTH]);
// SHA3_384 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_384_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_384| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_384(const uint8_t *data, size_t len,
uint8_t out[SHA3_384_DIGEST_LENGTH]);
// SHA3_512 writes the digest of |len| bytes from |data| to |out| and returns |out|.
// There must be at least |SHA3_512_DIGEST_LENGTH| bytes of space in |out|.
// On failure |SHA3_512| returns NULL.
OPENSSL_EXPORT uint8_t *SHA3_512(const uint8_t *data, size_t len,
uint8_t out[SHA3_512_DIGEST_LENGTH]);
// SHAKE128 writes the |out_len| bytes output from |in_len| bytes |data|
// to |out| and returns |out| on success and NULL on failure.
OPENSSL_EXPORT uint8_t *SHAKE128(const uint8_t *data, const size_t in_len,
uint8_t *out, size_t out_len);
// SHAKE256 writes |out_len| bytes output from |in_len| bytes |data|
// to |out| and returns |out| on success and NULL on failure.
OPENSSL_EXPORT uint8_t *SHAKE256(const uint8_t *data, const size_t in_len,
uint8_t *out, size_t out_len);
/*
* SHA3 APIs implement SHA3 functionalities on top of FIPS202 API layer
*
* SHA3 context must go through the flow: (a) Init, (b) Update [multiple times],
* (c) Final [one time].
*
 * SHA3 functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHA3_ function signature, is satisfied.
*/
// SHA3_Init initialises |ctx| field through |FIPS202_Init| and
// returns 1 on success and 0 on failure. When call-discipline is
// maintained and |bitlen| value corresponds to a SHA3 digest length
// in bits, this function never fails.
OPENSSL_EXPORT int SHA3_Init(KECCAK1600_CTX *ctx, size_t bitlen);
// SHA3_Update checks |ctx| pointer and |len| value, calls |FIPS202_Update|
// and returns 1 on success and 0 on failure. When call-discipline is
// maintained and |len| value corresponds to the input message length
// (including zero), this function never fails.
int SHA3_Update(KECCAK1600_CTX *ctx, const void *data, size_t len);
// SHA3_Final pads the last data block and absorbs it through |FIPS202_Finalize|.
// It then calls |Keccak1600_Squeeze| and returns 1 on success and 0 on failure.
// When call-discipline is maintained, this function never fails.
int SHA3_Final(uint8_t *md, KECCAK1600_CTX *ctx);
// SHA3_224_Init initialises |sha| and returns 1.
int SHA3_224_Init(KECCAK1600_CTX *sha);
// SHA3_224_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_224_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_224_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_224_Final(uint8_t out[SHA3_224_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_256_Init initialises |sha| and returns 1.
int SHA3_256_Init(KECCAK1600_CTX *sha);
// SHA3_256_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_256_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_256_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_256_Final(uint8_t out[SHA3_256_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_384_Init initialises |sha| and returns 1.
int SHA3_384_Init(KECCAK1600_CTX *sha);
// SHA3_384_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_384_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_384_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_384_Final(uint8_t out[SHA3_384_DIGEST_LENGTH], KECCAK1600_CTX *sha);
// SHA3_512_Init initialises |sha| and returns 1.
int SHA3_512_Init(KECCAK1600_CTX *sha);
// SHA3_512_Update adds |len| bytes from |data| to |sha| and returns 1.
int SHA3_512_Update(KECCAK1600_CTX *sha, const void *data, size_t len);
// SHA3_512_Final adds the final padding to |sha| and writes the resulting
// digest to |out|. It returns one on success and zero on programmer error.
int SHA3_512_Final(uint8_t out[SHA3_512_DIGEST_LENGTH], KECCAK1600_CTX *sha);
/*
* SHAKE APIs implement SHAKE functionalities on top of FIPS202 API layer
*
* SHAKE context must go through the flow: (a) Init, (b) Absorb [multiple times],
* (c) Final [one time] or Squeeze [multiple times]
*
 * SHAKE functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHAKE_ function signature, is satisfied.
*/
// SHAKE_Init initialises |ctx| fields through |FIPS202_Init| and
// returns 1 on success and 0 on failure. When call-discipline is
// maintained and |block_size| value corresponds to a SHAKE block size length
// in bytes, this function never fails.
int SHAKE_Init(KECCAK1600_CTX *ctx, size_t block_size);
// SHAKE_Absorb checks |ctx| pointer and |len| values. It updates and absorbs
// input blocks via |FIPS202_Update|. When call-discipline is
// maintained and |len| value corresponds to the input message length
// (including zero), this function never fails.
int SHAKE_Absorb(KECCAK1600_CTX *ctx, const void *data,
size_t len);
// SHAKE_Squeeze pads the last data block and absorbs it through
// |FIPS202_Finalize| on first call. It writes |len| bytes of incremental
// XOF output to |md| and returns 1 on success and 0 on failure. It can be
// called multiple times. When call-discipline is maintained, this function
// never fails.
int SHAKE_Squeeze(uint8_t *md, KECCAK1600_CTX *ctx, size_t len);
// SHAKE_Final writes |len| bytes of finalized extendible output to |md|, returns 1 on
// success and 0 on failure. It should be called once to finalize absorb and
// squeeze phases. Incremental XOF output should be generated via |SHAKE_Squeeze|.
// When call-discipline is maintained, this function never fails.
int SHAKE_Final(uint8_t *md, KECCAK1600_CTX *ctx, size_t len);
/*
* SHAKE128_x4_ batched APIs implement x4 SHAKE functionalities on top of FIPS202 API layer
*
* SHAKE128_x4_ context must go through the flow: (a) Init_x4, (b) Absorb_once_x4 [one time;
* maximum input length of |SHAKE128_BLOCKSIZE - 1|] (c) Squeezeblocks_x4 [multiple times]
*
 * SHAKE128_x4_ functions never fail when the following call-discipline is adhered to:
* (a) the context execution flow is followed (b) the pointers passed to the
* functions are valid (c) any additional per-function parameter value conditions,
* detailed above each SHAKE128_x4_ function signature, is satisfied.
*/
// SHAKE128_Init_x4 is a batched API that operates on four independent
// Keccak bitstates. It initialises all four |ctx| fields and returns
// 1 on success and 0 on failure. When call-discipline is maintained,
// this function never fails.
OPENSSL_EXPORT int SHAKE128_Init_x4(KECCAK1600_CTX_x4 *ctx);
// SHAKE128_Absorb_once_x4 is a batched API that operates on four independent
// Keccak bitstates. It absorbs all four inputs |data0|, |data1|, |data2|, |data3|
// of equal length of |len| bytes and returns 1 on success and 0 on failure.
// When call-discipline is maintained and |len| value corresponds to the input
// messages' length (including zero), this function never fails.
OPENSSL_EXPORT int SHAKE128_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
const void *data2, const void *data3, size_t len);
// SHAKE128_Squeezeblocks_x4 is a batched API that operates on four independent Keccak
// bitstates. It squeezes |blks| number of blocks for all four XOF digests and returns
// 1 on success and 0 on failure. When call-discipline is maintained, this function
// never fails.
OPENSSL_EXPORT int SHAKE128_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
KECCAK1600_CTX_x4 *ctx, size_t blks);
/*
 * SHAKE256_x4_ single-shot batched API implements x4 SHAKE256 functionalities on top
 * of FIPS202 API layer
 *
 * SHAKE256_x4_ function never fails when the following call-discipline is adhered to:
 * (a) the pointers passed to the functions are valid.
*/
// SHAKE256_x4 is a batched API that operates on four independent
// Keccak bitstates. It writes all four |out_len|-byte outputs from
// |in_len|-byte inputs to |out0|, |out1|, |out2|, |out3| and returns
// 1 on success and 0 on failure.
// When call-discipline is maintained, this function never fails.
OPENSSL_EXPORT int SHAKE256_x4(const uint8_t *data0, const uint8_t *data1,
const uint8_t *data2, const uint8_t *data3,
const size_t in_len, uint8_t *out0, uint8_t *out1,
uint8_t *out2, uint8_t *out3, size_t out_len);
/*
* Keccak1600_ APIs implement Keccak absorb and squeeze phases
*/
// Keccak1600_Absorb processes the largest multiple of |r| (block size) out of
// |len| bytes and returns the remaining number of bytes.
size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS],
const uint8_t *data, size_t len, size_t r);
// Keccak1600_Absorb_once_x4 absorbs exactly |len| bytes from four inputs into four
// Keccak states, applying padding character |p|. Unlike Keccak1600_Absorb, this
// processes a single block and takes the padding character as an additional argument.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
const uint8_t *inp0, const uint8_t *inp1,
const uint8_t *inp2, const uint8_t *inp3,
size_t len, size_t r, uint8_t p);
// Keccak1600_Squeezeblocks_x4 squeezes |num_blocks| blocks from four Keccak states
// into four output buffers, with each block being |r| bytes.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3,
size_t num_blocks, size_t r);
// Keccak1600_Squeeze generates |out| value of |len| bytes (per call). It can be called
// multiple times when used as eXtendable Output Function. |padded| indicates
// whether it is the first call to Keccak1600_Squeeze; i.e., if the current block has
// been already processed and padded right after the last call to Keccak1600_Absorb.
// Squeezes full blocks of |r| bytes each. When performing multiple squeezes, any
// left over bytes from previous squeezes are not consumed, and |len| must be a
// multiple of the block size (except on the final squeeze).
OPENSSL_EXPORT void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS],
uint8_t *out, size_t len, size_t r, int padded);
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // OPENSSL_HEADER_SHA_INTERNAL_H

View File

@@ -0,0 +1,515 @@
// Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
#include <assert.h>
#include "internal.h"
#include "../../internal.h"
#include "../cpucap/internal.h"
// iotas[] holds the 24 round constants of Keccak-f[1600], one per round,
// XORed into lane A[0][0] by the iota step (FIPS 202, Section 3.2.5).
static const uint64_t iotas[] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL
};
#if !defined(KECCAK1600_ASM)
// rhotates[y][x] gives the rho-step left-rotation amount, in bits, for the
// lane at row y, column x of the 5x5 Keccak state (FIPS 202, Section 3.2.2).
static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
(defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__loongarch__) || \
defined(__EMSCRIPTEN__)
// These platforms don't support "logical and with complement" instruction.
# define KECCAK_COMPLEMENTING_TRANSFORM
#endif
// ROL64 returns |val| rotated left by |offset| bits. The zero case is
// handled separately because shifting a 64-bit value by 64 is undefined
// behavior in C.
static uint64_t ROL64(uint64_t val, int offset) {
  if (offset != 0) {
    val = (val << offset) | (val >> (64 - offset));
  }
  return val;
}
// KECCAK_2X:
// This is the default implementation used in OpenSSL and the most efficient;
// the other implementations were removed from this file.
// This implementation is a variant of KECCAK_1X (see OpenSSL)
// This implementation allows to take temporary storage
// out of round procedure and simplify references to it by alternating
// it with actual data (see round loop below).
// It ensures best compiler interpretation to assembly and provides best
// instruction per processed byte ratio at minimal round unroll factor.
// Round computes one round of the Keccak-f[1600] permutation — the theta,
// rho, pi, chi and iota steps — reading the 5x5 lane state from |A| and
// writing the permuted state to |R|. |i| is the round index (0..23)
// selecting the iota round constant. Source and destination are distinct so
// the caller can ping-pong between two state buffers (KECCAK_2X scheme)
// instead of copying through temporaries.
static void Round(uint64_t R[KECCAK1600_ROWS][KECCAK1600_ROWS], uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], size_t i) {
  uint64_t C[KECCAK1600_ROWS], D[KECCAK1600_ROWS];
  assert(i < (sizeof(iotas) / sizeof(iotas[0])));
  // Theta, first half: C[x] is the parity (XOR) of column x.
  C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
  C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
  C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
  C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
  C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
  // Theta, second half: D[x] = ROL64(C[x+1], 1) ^ C[x-1] (indices mod 5).
  D[0] = ROL64(C[1], 1) ^ C[4];
  D[1] = ROL64(C[2], 1) ^ C[0];
  D[2] = ROL64(C[3], 1) ^ C[1];
  D[3] = ROL64(C[4], 1) ^ C[2];
  D[4] = ROL64(C[0], 1) ^ C[3];
  // Row 0: theta is applied lane-wise (A ^ D), then rho rotates, with the
  // pi permutation folded into the choice of source lanes. A[0][0] rotates
  // by 0, so no ROL64 is needed there.
  C[0] = A[0][0] ^ D[0];
  C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
  C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
  C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
  C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Chi in the complemented-lane domain: selected lanes are kept
  // bit-inverted (see KeccakF1600_c) so that the ~x & y of chi can be
  // computed with plain AND/OR on targets lacking an and-with-complement
  // instruction. Iota is folded into lane [0][0].
  R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
  R[0][1] = C[1] ^ (~C[2] | C[3]);
  R[0][2] = C[2] ^ ( C[3] & C[4]);
  R[0][3] = C[3] ^ ( C[4] | C[0]);
  R[0][4] = C[4] ^ ( C[0] & C[1]);
#else
  // Plain chi: R[x] = C[x] ^ (~C[x+1] & C[x+2]); iota on lane [0][0].
  R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
  R[0][1] = C[1] ^ (~C[2] & C[3]);
  R[0][2] = C[2] ^ (~C[3] & C[4]);
  R[0][3] = C[3] ^ (~C[4] & C[0]);
  R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 1: same theta/rho/pi pattern with the next diagonal of source lanes.
  C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
  C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
  C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
  C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
  C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[1][0] = C[0] ^ (C[1] | C[2]);
  R[1][1] = C[1] ^ (C[2] & C[3]);
  R[1][2] = C[2] ^ (C[3] | ~C[4]);
  R[1][3] = C[3] ^ (C[4] | C[0]);
  R[1][4] = C[4] ^ (C[0] & C[1]);
#else
  R[1][0] = C[0] ^ (~C[1] & C[2]);
  R[1][1] = C[1] ^ (~C[2] & C[3]);
  R[1][2] = C[2] ^ (~C[3] & C[4]);
  R[1][3] = C[3] ^ (~C[4] & C[0]);
  R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 2.
  C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
  C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
  C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
  C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
  C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[2][0] = C[0] ^ ( C[1] | C[2]);
  R[2][1] = C[1] ^ ( C[2] & C[3]);
  R[2][2] = C[2] ^ (~C[3] & C[4]);
  R[2][3] = ~C[3] ^ ( C[4] | C[0]);
  R[2][4] = C[4] ^ ( C[0] & C[1]);
#else
  R[2][0] = C[0] ^ (~C[1] & C[2]);
  R[2][1] = C[1] ^ (~C[2] & C[3]);
  R[2][2] = C[2] ^ (~C[3] & C[4]);
  R[2][3] = C[3] ^ (~C[4] & C[0]);
  R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 3.
  C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
  C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
  C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
  C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
  C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[3][0] = C[0] ^ ( C[1] & C[2]);
  R[3][1] = C[1] ^ ( C[2] | C[3]);
  R[3][2] = C[2] ^ (~C[3] | C[4]);
  R[3][3] = ~C[3] ^ ( C[4] & C[0]);
  R[3][4] = C[4] ^ ( C[0] | C[1]);
#else
  R[3][0] = C[0] ^ (~C[1] & C[2]);
  R[3][1] = C[1] ^ (~C[2] & C[3]);
  R[3][2] = C[2] ^ (~C[3] & C[4]);
  R[3][3] = C[3] ^ (~C[4] & C[0]);
  R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif
  // Row 4.
  C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
  C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
  C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
  C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
  C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  R[4][0] = C[0] ^ (~C[1] & C[2]);
  R[4][1] = ~C[1] ^ ( C[2] | C[3]);
  R[4][2] = C[2] ^ ( C[3] & C[4]);
  R[4][3] = C[3] ^ ( C[4] | C[0]);
  R[4][4] = C[4] ^ ( C[0] & C[1]);
#else
  R[4][0] = C[0] ^ (~C[1] & C[2]);
  R[4][1] = C[1] ^ (~C[2] & C[3]);
  R[4][2] = C[2] ^ (~C[3] & C[4]);
  R[4][3] = C[3] ^ (~C[4] & C[0]);
  R[4][4] = C[4] ^ (~C[0] & C[1]);
#endif
}
// KeccakF1600_c applies the full 24-round Keccak-f[1600] permutation to |A|
// using the portable C round function.
static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  uint64_t scratch[KECCAK1600_ROWS][KECCAK1600_ROWS];

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Enter the complemented-lane representation expected by Round(): these
  // six lanes are kept bit-inverted across all rounds.
  A[0][1] = ~A[0][1];
  A[0][2] = ~A[0][2];
  A[1][3] = ~A[1][3];
  A[2][2] = ~A[2][2];
  A[3][2] = ~A[3][2];
  A[4][0] = ~A[4][0];
#endif

  // Process rounds in pairs, ping-ponging between |A| and |scratch| so no
  // extra copies are needed.
  for (size_t round = 0; round < 24; round += 2) {
    Round(scratch, A, round);
    Round(A, scratch, round + 1);
  }

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
  // Leave the complemented-lane representation.
  A[0][1] = ~A[0][1];
  A[0][2] = ~A[0][2];
  A[1][3] = ~A[1][3];
  A[2][2] = ~A[2][2];
  A[3][2] = ~A[3][2];
  A[4][0] = ~A[4][0];
#endif
}
#endif // !KECCAK1600_ASM
// Forward declaration for KeccakF1600 function
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]);
// Keccak1600_Absorb can be called multiple times; at each invocation the
// largest multiple of |r| out of |len| bytes are processed. The
// remaining amount of bytes is returned. This is done to spare caller
// trouble of calculating the largest multiple of |r|. |r| can be viewed
// as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
// 72, but can also be (1600 - 448)/8 = 144. All this means that message
// padding and intermediate sub-block buffering, byte- or bitwise, is
// caller's responsibility.
// KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state |A|.
// |len| must be a multiple of 8.
// KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state
// |A|, assembling each group of eight input bytes into a little-endian
// 64-bit lane. |len| must be a multiple of 8 and at most
// SHA3_MAX_BLOCKSIZE.
static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len) {
  assert(len <= SHA3_MAX_BLOCKSIZE);
  assert((len % 8) == 0);

  uint64_t *lanes = (uint64_t *)A;
  const size_t num_lanes = len / 8;
  for (size_t i = 0; i < num_lanes; i++, inp += 8) {
    uint64_t lane = 0;
    for (size_t b = 0; b < 8; b++) {
      lane |= (uint64_t)inp[b] << (8 * b);
    }
    lanes[i] ^= lane;
  }
}
// Keccak1600_Absorb XORs and permutes as many whole |r|-byte blocks of
// |inp| into |A| as fit in |len|, returning the number of leftover bytes.
// Buffering and padding of the tail are the caller's responsibility.
size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len,
                         size_t r) {
  assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
  for (; len >= r; len -= r, inp += r) {
    KeccakF1600_XORBytes(A, inp, r);
    KeccakF1600(A);
  }
  return len;
}
// KeccakF1600_ExtractBytes extracts |len| bytes from the Keccak state |A| into |out|.
// This function operates on up to block_size bytes (a single block). For extracting
// more data, the state must be processed again through KeccakF1600 (see Keccak1600_Squeeze).
// KeccakF1600_ExtractBytes serializes |len| bytes of the Keccak state |A|
// into |out|, emitting each 64-bit lane in little-endian byte order. It
// handles at most one block (|len| <= SHA3_MAX_BLOCKSIZE); for more output
// the state must be permuted again (see Keccak1600_Squeeze).
static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len) {
  const uint64_t *lanes = (const uint64_t *)A;
  assert(len <= SHA3_MAX_BLOCKSIZE);

  size_t i = 0;
  // Full 8-byte lanes first.
  while (len >= 8) {
    const uint64_t lane = lanes[i++];
    for (size_t b = 0; b < 8; b++) {
      out[b] = (uint8_t)(lane >> (8 * b));
    }
    out += 8;
    len -= 8;
  }
  // Then the partial tail lane, low bytes first.
  if (len != 0) {
    const uint64_t lane = lanes[i];
    for (size_t b = 0; b < len; b++) {
      out[b] = (uint8_t)(lane >> (8 * b));
    }
  }
}
// Keccak1600_Squeeze writes |len| output bytes from state |A| in blocks of
// at most |r| bytes. |padded| tells whether the state already holds an
// unconsumed permutation result from absorb time; every subsequent block
// requires a fresh permutation before extraction.
void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len, size_t r, int padded) {
  assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
  int permute = padded;
  while (len != 0) {
    if (permute) {
      KeccakF1600(A);
    }
    permute = 1;
    const size_t todo = (len < r) ? len : r;
    KeccakF1600_ExtractBytes(A, out, todo);
    out += todo;
    len -= todo;
  }
}
#if defined(KECCAK1600_ASM)
// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl
extern void KeccakF1600_hw(uint64_t state[25]);
#if defined(OPENSSL_AARCH64)
// keccak_log_dispatch records, in dispatch-test builds, that the assembly
// path identified by |id| was taken; it is a no-op otherwise.
static void keccak_log_dispatch(size_t id) {
#if BORINGSSL_DISPATCH_TEST
  BORINGSSL_function_hit[id] = 1;
#endif
}
#endif
// KeccakF1600 applies the Keccak-f[1600] permutation to |A|, dispatching to
// the fastest available assembly implementation for the running CPU.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  // Dispatch logic for Keccak-x1 on AArch64:
  //
  // 1. If ASM is disabled, we use the C implementation.
  // 2. If ASM is enabled:
  //    - For Neoverse N1, V1, V2, we use scalar Keccak assembly from s2n-bignum
  //      (`sha3_keccak_f1600()`)
  //      leveraging lazy rotations from https://eprint.iacr.org/2022/1243.
  //    - Otherwise, if the Neon SHA3 extension is supported, we use the Neon
  //      Keccak assembly from s2n-bignum (`sha3_keccak_f1600_alt()`),
  //      leveraging that extension.
  //    - Otherwise, fall back to scalar Keccak implementation from OpenSSL,
  //      (`KeccakF1600_hw()`), not using lazy rotations.
  //
  // Lazy rotations improve performance by up to 10% on CPUs with free
  // Barrel shifting, which includes Neoverse N1, V1, and V2. Not all
  // CPUs have free Barrel shifting (e.g. Apple M1 or Cortex-A72), so we
  // don't use it by default.
  //
  // Neoverse V1 and V2 do support SHA3 instructions, but they are only
  // implemented on 1/4 of Neon units, and are thus slower than a scalar
  // implementation.
#if defined(OPENSSL_AARCH64)
#if defined(KECCAK1600_S2N_BIGNUM_ASM)
  if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
    keccak_log_dispatch(10); // kFlag_sha3_keccak_f1600
    sha3_keccak_f1600((uint64_t *)A, iotas);
    return;
  }
#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
  if (CRYPTO_is_ARMv8_SHA3_capable()) {
    keccak_log_dispatch(11); // kFlag_sha3_keccak_f1600_alt
    sha3_keccak_f1600_alt((uint64_t *)A, iotas);
    return;
  }
#endif
#endif
  // Default AArch64 path: OpenSSL's scalar assembly.
  keccak_log_dispatch(9); // kFlag_KeccakF1600_hw
  KeccakF1600_hw((uint64_t *) A);
#elif defined(OPENSSL_X86_64)
  sha3_keccak_f1600((uint64_t *)A, iotas);
#endif
}
#else // KECCAK1600_ASM
// KeccakF1600 applies the Keccak-f[1600] permutation to |A|. In builds
// without assembly support this simply forwards to the portable C code.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  KeccakF1600_c(A);
}
#endif // !KECCAK1600_ASM
// KeccakF1600_XORBytes_x4 XORs |len| bytes from |inp0|, |inp1|, |inp2|, |inp3|
// into the four Keccak states in |A|. |len| must be a multiple of 8.
// KeccakF1600_XORBytes_x4 XORs |len| bytes from each of |inp0|..|inp3| into
// the corresponding one of the four Keccak states in |A|. |len| must be a
// multiple of 8.
static void KeccakF1600_XORBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                    const uint8_t *inp0, const uint8_t *inp1,
                                    const uint8_t *inp2, const uint8_t *inp3,
                                    size_t len) {
  const uint8_t *inputs[4] = {inp0, inp1, inp2, inp3};
  for (size_t j = 0; j < 4; j++) {
    KeccakF1600_XORBytes(A[j], inputs[j], len);
  }
}
// KeccakF1600_ExtractBytes_x4 extracts |len| bytes from the four Keccak states in |A|
// into |out0|, |out1|, |out2|, |out3|.
// KeccakF1600_ExtractBytes_x4 serializes |len| bytes from each of the four
// Keccak states in |A| into the corresponding one of |out0|..|out3|.
static void KeccakF1600_ExtractBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                        uint8_t *out0, uint8_t *out1,
                                        uint8_t *out2, uint8_t *out3,
                                        size_t len) {
  uint8_t *outputs[4] = {out0, out1, out2, out3};
  for (size_t j = 0; j < 4; j++) {
    KeccakF1600_ExtractBytes(A[j], outputs[j], len);
  }
}
// Keccak1600_x4 applies Keccak-f[1600] to all four states in |A|,
// dispatching to the fastest available batched assembly for the running CPU.
static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  // Dispatch logic for Keccak-x4 on AArch64:
  //
  // 1. If ASM is disabled, we use 4x the C implementation.
  // 2. If ASM is enabled:
  //    - For Neoverse N1, we use scalar batched hybrid Keccak assembly from s2n-bignum
  //      (`sha3_keccak4_f1600_alt()`) leveraging Neon and scalar assembly with
  //      lazy rotations.
  //    - For Neoverse V1, V2, we use SIMD batched hybrid Keccak assembly from s2n-bignum
  //      (`sha3_keccak4_f1600_alt2()`) leveraging Neon, Neon SHA3 extension,
  //      and scalar assembly with lazy rotations.
  //    - Otherwise, if the Neon SHA3 extension is supported, we use the 2-fold
  //      Keccak assembly from s2n-bignum (`sha3_keccak2_f1600()`) twice,
  //      which is a straightforward implementation using the SHA3 extension.
  //    - Otherwise, fall back to four times the 1-fold Keccak implementation
  //      (which has its own dispatch logic).
#if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64)
  if (CRYPTO_is_Neoverse_N1()) {
    keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt
    sha3_keccak4_f1600_alt((uint64_t *)A, iotas);
    return;
  }
#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
  if (CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
    keccak_log_dispatch(14); // kFlag_sha3_keccak4_f1600_alt2
    sha3_keccak4_f1600_alt2((uint64_t *)A, iotas);
    return;
  }
  if (CRYPTO_is_ARMv8_SHA3_capable()) {
    keccak_log_dispatch(12); // kFlag_sha3_keccak2_f1600
    // Use 2-fold function twice: A[0:1] and A[2:3]
    sha3_keccak2_f1600((uint64_t *)&A[0], iotas);
    sha3_keccak2_f1600((uint64_t *)&A[2], iotas);
    return;
  }
#endif
#endif
  // Fallback: 4x individual KeccakF1600 calls (each with their own dispatch)
  KeccakF1600(A[0]);
  KeccakF1600(A[1]);
  KeccakF1600(A[2]);
  KeccakF1600(A[3]);
}
// One-shot absorb + finalize. Note that in contrast to non-batched Keccak,
// this does _not_ run a Keccak permutation at the end, allowing for a uniform
// implementation of Keccak1600_Squeezeblocks_x4() without the `padded`
// parameter used in the non-batched implementation.
// Keccak1600_Absorb_once_x4 absorbs exactly |len| bytes from each of the
// four inputs into the four Keccak states in |A|, then XORs in the padded
// final block using domain-separation byte |p|. No trailing permutation is
// run; the first squeeze performs it.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                               const uint8_t *inp0, const uint8_t *inp1,
                               const uint8_t *inp2, const uint8_t *inp3,
                               size_t len, size_t r, uint8_t p) {
  assert(r <= SHA3_MAX_BLOCKSIZE);
  const uint8_t *inp[4] = {inp0, inp1, inp2, inp3};

  // Absorb all complete rate-sized blocks.
  while (len >= r) {
    KeccakF1600_XORBytes_x4(A, inp[0], inp[1], inp[2], inp[3], r);
    Keccak1600_x4(A);
    for (size_t j = 0; j < 4; j++) {
      inp[j] += r;
    }
    len -= r;
  }

  // Assemble each lane's padded final block: remainder bytes, the
  // domain-separation byte |p| at offset |len|, and 0x80 in the last byte
  // of the block. When len == r - 1 both pad bytes share one position, so
  // they are merged into |p|.
  alignas(16) uint8_t block[4][SHA3_MAX_BLOCKSIZE] = {{0}};
  for (size_t j = 0; j < 4; j++) {
    OPENSSL_memcpy(block[j], inp[j], len);
  }
  if (len == r - 1) {
    p |= 128;
  } else {
    for (size_t j = 0; j < 4; j++) {
      block[j][r - 1] |= 128;
    }
  }
  for (size_t j = 0; j < 4; j++) {
    block[j][len] |= p;
  }
  KeccakF1600_XORBytes_x4(A, block[0], block[1], block[2], block[3], r);

  // Wipe the stack copies of caller data.
  OPENSSL_cleanse(block, sizeof(block));
}
// Keccak1600_Squeezeblocks_x4 squeezes |num_blocks| blocks of |r| bytes
// from each of the four Keccak states in |A| into the corresponding output
// buffer, permuting all four states before each block is extracted.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out0, uint8_t *out1,
                                 uint8_t *out2, uint8_t *out3,
                                 size_t num_blocks, size_t r) {
  uint8_t *outs[4] = {out0, out1, out2, out3};
  for (size_t b = 0; b < num_blocks; b++) {
    Keccak1600_x4(A);
    KeccakF1600_ExtractBytes_x4(A, outs[0], outs[1], outs[2], outs[3], r);
    for (size_t j = 0; j < 4; j++) {
      outs[j] += r;
    }
  }
}

View File

@@ -0,0 +1,306 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
// Altivec-optimized SHA1 in C. This is tested on ppc64le only.
//
// References:
// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
// http://arctic.org/~dean/crypto/sha1.html
//
// This code used the generic SHA-1 from OpenSSL as a basis and AltiVec
// optimisations were added on top.
#include <openssl/sha.h>
#if defined(OPENSSL_PPC64LE)
#include <altivec.h>
// rotate returns |a| rotated left by |n| bits; |n| must be in (0, 32).
static uint32_t rotate(uint32_t a, int n) {
  const uint32_t hi = a << n;
  const uint32_t lo = a >> (32 - n);
  return hi | lo;
}
typedef vector unsigned int vec_uint32_t;
typedef vector unsigned char vec_uint8_t;
// Vector constants
// Permute pattern reversing the byte order within each 32-bit word.
static const vec_uint8_t k_swap_endianness = {3, 2, 1, 0, 7, 6, 5, 4,
                                              11, 10, 9, 8, 15, 14, 13, 12};
// Shift amounts for byte and bit shifts and rotations
// (vec_slo/vec_sro take the shift amount in bits: 32 bits = 4 bytes,
// 96 bits = 12 bytes).
static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32,
                                      32, 32, 32, 32, 32, 32, 32, 32};
static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96,
                                       96, 96, 96, 96, 96, 96, 96, 96};
// SHA-1 round constants (FIPS 180-4) for rounds 0-19, 20-39, 40-59, 60-79.
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
// Vector versions of the above.
static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19};
static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39};
static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59};
static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79};
// vector message scheduling: compute message schedule for round i..i+3 where i
// is divisible by 4. We return the schedule w[i..i+3] as a vector. In
// addition, we also precompute sum w[i..+3] and an additive constant K. This
// is done to offload some computation of f() in the integer execution units.
//
// Byte shifting code below may not be correct for big-endian systems.
// sched_00_15 loads 16 unaligned message bytes at |data|, byte-swaps each
// 32-bit word into SHA-1's big-endian word order, stores the schedule words
// pre-added with round constant |k| to |pre_added|, and returns the raw
// schedule vector w[i..i+3].
static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data,
                                vec_uint32_t k) {
  const vector unsigned char unaligned_data =
      vec_vsx_ld(0, (const unsigned char*) data);
  const vec_uint32_t v = (vec_uint32_t) unaligned_data;
  const vec_uint32_t w = vec_perm(v, v, k_swap_endianness);
  vec_st(w + k, 0, pre_added);
  return w;
}
// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28]
//
// w'[i ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1
// w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1
// w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1
// w'[i+3] = ( 0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1
//
// w[ i] = w'[ i]
// w[i+1] = w'[i+1]
// w[i+2] = w'[i+2]
// w[i+3] = w'[i+3] ^ (w'[i] <<< 1)
// sched_16_31 computes schedule words w[i..i+3] for i in [16, 31] from the
// previous four schedule vectors, following the two-stage formula in the
// comment above. Stores w + |k| to |pre_added| and returns w.
// NOTE(review): the byte shifts assume little-endian lane order — see the
// big-endian caveat above.
static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_12,
                                vec_uint32_t minus_16, vec_uint32_t k) {
  // Shift by 4 bytes to realign w[i-4..i-1] as w[i-3..i-1] (last lane 0).
  const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes);
  // Combine halves of the two oldest vectors to obtain w[i-14..i-11].
  const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8);
  const vec_uint32_t k_1_bit = vec_splat_u32(1);
  const vec_uint32_t w_prime =
      vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit);
  // Fix up the last element: w[i+3] ^= w'[i] <<< 1, with w'[i] moved into
  // the final lane by a 12-byte shift.
  const vec_uint32_t w =
      w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit);
  vec_st(w + k, 0, pre_added);
  return w;
}
// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76]
// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) <<< 2
// sched_32_79 computes schedule words w[i..i+3] for i in [32, 76] using the
// distance-doubled recurrence above, which avoids the cross-lane fix-up
// needed by sched_16_31. Stores w + |k| to |pre_added| and returns w.
static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4,
                                vec_uint32_t minus_8, vec_uint32_t minus_16,
                                vec_uint32_t minus_28, vec_uint32_t minus_32,
                                vec_uint32_t k) {
  // Combine halves of two previous vectors to obtain w[i-6..i-3].
  const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8);
  const vec_uint32_t k_2_bits = vec_splat_u32(2);
  const vec_uint32_t w =
      vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits);
  vec_st(w + k, 0, pre_added);
  return w;
}
// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define
// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another
// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)
// We pre-added the K constants during message scheduling.
#define BODY_00_19(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_20_39(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_40_59(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
#define BODY_60_79(i, a, b, c, d, e, f) \
do { \
(f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \
(b) = rotate((b), 30); \
} while (0)
// sha1_block_data_order hashes |num| 64-byte blocks from |data| into the
// five-word SHA-1 chaining value |state|. The message schedule (with the
// round constant pre-added) is computed four words at a time with AltiVec
// while the scalar round bodies consume earlier schedule words.
void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
  uint32_t A, B, C, D, E, T;
  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];
  for (;;) {
    // 20 vectors = 80 schedule words; the rounds read them back as scalars
    // through |w|.
    vec_uint32_t vw[20];
    const uint32_t *w = (const uint32_t *)&vw;

    // Rounds 0-19 (F = Ch), schedule words 0-19 computed with K_00_19.
    vec_uint32_t k = K_00_19_x_4;
    const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k);
    BODY_00_19(0, A, B, C, D, E, T);
    BODY_00_19(1, T, A, B, C, D, E);
    BODY_00_19(2, E, T, A, B, C, D);
    BODY_00_19(3, D, E, T, A, B, C);
    const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k);
    BODY_00_19(4, C, D, E, T, A, B);
    BODY_00_19(5, B, C, D, E, T, A);
    BODY_00_19(6, A, B, C, D, E, T);
    BODY_00_19(7, T, A, B, C, D, E);
    const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k);
    BODY_00_19(8, E, T, A, B, C, D);
    BODY_00_19(9, D, E, T, A, B, C);
    BODY_00_19(10, C, D, E, T, A, B);
    BODY_00_19(11, B, C, D, E, T, A);
    const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k);
    BODY_00_19(12, A, B, C, D, E, T);
    BODY_00_19(13, T, A, B, C, D, E);
    BODY_00_19(14, E, T, A, B, C, D);
    BODY_00_19(15, D, E, T, A, B, C);
    const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k);
    BODY_00_19(16, C, D, E, T, A, B);
    BODY_00_19(17, B, C, D, E, T, A);
    BODY_00_19(18, A, B, C, D, E, T);
    BODY_00_19(19, T, A, B, C, D, E);
    // Rounds 20-39 (F = Parity) switch to K_20_39.
    k = K_20_39_x_4;
    const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k);
    BODY_20_39(20, E, T, A, B, C, D);
    BODY_20_39(21, D, E, T, A, B, C);
    BODY_20_39(22, C, D, E, T, A, B);
    BODY_20_39(23, B, C, D, E, T, A);
    const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k);
    BODY_20_39(24, A, B, C, D, E, T);
    BODY_20_39(25, T, A, B, C, D, E);
    BODY_20_39(26, E, T, A, B, C, D);
    BODY_20_39(27, D, E, T, A, B, C);
    const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k);
    BODY_20_39(28, C, D, E, T, A, B);
    BODY_20_39(29, B, C, D, E, T, A);
    BODY_20_39(30, A, B, C, D, E, T);
    BODY_20_39(31, T, A, B, C, D, E);
    // From word 32 on, the four-lane recurrence of sched_32_79 applies.
    const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k);
    BODY_20_39(32, E, T, A, B, C, D);
    BODY_20_39(33, D, E, T, A, B, C);
    BODY_20_39(34, C, D, E, T, A, B);
    BODY_20_39(35, B, C, D, E, T, A);
    const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k);
    BODY_20_39(36, A, B, C, D, E, T);
    BODY_20_39(37, T, A, B, C, D, E);
    BODY_20_39(38, E, T, A, B, C, D);
    BODY_20_39(39, D, E, T, A, B, C);
    // Rounds 40-59 (F = Maj) use K_40_59.
    k = K_40_59_x_4;
    const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k);
    BODY_40_59(40, C, D, E, T, A, B);
    BODY_40_59(41, B, C, D, E, T, A);
    BODY_40_59(42, A, B, C, D, E, T);
    BODY_40_59(43, T, A, B, C, D, E);
    const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k);
    BODY_40_59(44, E, T, A, B, C, D);
    BODY_40_59(45, D, E, T, A, B, C);
    BODY_40_59(46, C, D, E, T, A, B);
    BODY_40_59(47, B, C, D, E, T, A);
    const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k);
    BODY_40_59(48, A, B, C, D, E, T);
    BODY_40_59(49, T, A, B, C, D, E);
    BODY_40_59(50, E, T, A, B, C, D);
    BODY_40_59(51, D, E, T, A, B, C);
    const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k);
    BODY_40_59(52, C, D, E, T, A, B);
    BODY_40_59(53, B, C, D, E, T, A);
    BODY_40_59(54, A, B, C, D, E, T);
    BODY_40_59(55, T, A, B, C, D, E);
    const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k);
    BODY_40_59(56, E, T, A, B, C, D);
    BODY_40_59(57, D, E, T, A, B, C);
    BODY_40_59(58, C, D, E, T, A, B);
    BODY_40_59(59, B, C, D, E, T, A);
    // Rounds 60-79 (F = Parity) use K_60_79.
    k = K_60_79_x_4;
    const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k);
    BODY_60_79(60, A, B, C, D, E, T);
    BODY_60_79(61, T, A, B, C, D, E);
    BODY_60_79(62, E, T, A, B, C, D);
    BODY_60_79(63, D, E, T, A, B, C);
    const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k);
    BODY_60_79(64, C, D, E, T, A, B);
    BODY_60_79(65, B, C, D, E, T, A);
    BODY_60_79(66, A, B, C, D, E, T);
    BODY_60_79(67, T, A, B, C, D, E);
    const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k);
    BODY_60_79(68, E, T, A, B, C, D);
    BODY_60_79(69, D, E, T, A, B, C);
    BODY_60_79(70, C, D, E, T, A, B);
    BODY_60_79(71, B, C, D, E, T, A);
    const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k);
    BODY_60_79(72, A, B, C, D, E, T);
    BODY_60_79(73, T, A, B, C, D, E);
    BODY_60_79(74, E, T, A, B, C, D);
    BODY_60_79(75, D, E, T, A, B, C);
    // We don't use the last value
    (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k);
    BODY_60_79(76, C, D, E, T, A, B);
    BODY_60_79(77, B, C, D, E, T, A);
    BODY_60_79(78, A, B, C, D, E, T);
    BODY_60_79(79, T, A, B, C, D, E);
    // Fold the working variables back into the chaining value. The
    // 80-round variable rotation leaves the logical (a,b,c,d,e) in
    // (E,T,A,B,C). The mask is a no-op for 32-bit state words.
    const uint32_t mask = 0xffffffffUL;
    state[0] = (state[0] + E) & mask;
    state[1] = (state[1] + T) & mask;
    state[2] = (state[2] + A) & mask;
    state[3] = (state[3] + B) & mask;
    state[4] = (state[4] + C) & mask;
    data += 64;
    if (--num == 0) {
      break;
    }
    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}
#endif // OPENSSL_PPC64LE
#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_19
#undef BODY_20_39
#undef BODY_40_59
#undef BODY_60_79

View File

@@ -0,0 +1,390 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "../../internal.h"
#include "../digest/md32_common.h"
#include "internal.h"
int SHA1_Init(SHA_CTX *sha) {
  // FIPS 180-4 section 5.3.1 initial hash value for SHA-1.
  static const uint32_t kIV[5] = {0x67452301UL, 0xefcdab89UL, 0x98badcfeUL,
                                  0x10325476UL, 0xc3d2e1f0UL};
  // Zero the entire context (length counters, buffer) before seeding h.
  OPENSSL_memset(sha, 0, sizeof(SHA_CTX));
  for (size_t i = 0; i < 5; i++) {
    sha->h[i] = kIV[i];
  }
  return 1;
}
int SHA1_Init_from_state(SHA_CTX *sha, const uint8_t h[SHA1_CHAINING_LENGTH],
                         uint64_t n) {
  // |n| is the number of bits hashed so far; restoring mid-block is not
  // supported, so it must sit exactly on a block boundary.
  if (n % ((uint64_t)SHA_CBLOCK * 8) != 0) {
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA_CTX));
  // Deserialize the big-endian chaining value into h[0..4].
  for (size_t word = 0; word < SHA1_CHAINING_LENGTH / 4; word++) {
    sha->h[word] = CRYPTO_load_u32_be(h + 4 * word);
  }
  // Split the 64-bit bit count across the two 32-bit counters.
  sha->Nh = (uint32_t)(n >> 32);
  sha->Nl = (uint32_t)(n & 0xffffffff);
  return 1;
}
uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA_CTX ctx;
  int ok = SHA1_Init(&ctx);
  ok = ok && SHA1_Update(&ctx, data, len);
  ok = ok && SHA1_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC)
static void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
size_t num);
#endif
// SHA1_Transform runs the compression function over exactly one 64-byte
// block, updating only the chaining value in |c| (not the length counters).
void SHA1_Transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) {
  sha1_block_data_order(c->h, data, 1);
}
// SHA1_Update absorbs |len| bytes: whole blocks go straight through the
// compression function, partial blocks are buffered in |c->data|, and the
// running bit count is tracked in Nh/Nl by crypto_md32_update.
int SHA1_Update(SHA_CTX *c, const void *data, size_t len) {
  crypto_md32_update(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num,
                     &c->Nh, &c->Nl, data, len);
  return 1;
}
int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) {
  // Append the Merkle-Damgard padding and length, flushing final blocks.
  crypto_md32_final(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num,
                    c->Nh, c->Nl, /*is_big_endian=*/1);
  // Serialize the five chaining words big-endian into the 20-byte digest.
  for (size_t word = 0; word < 5; word++) {
    CRYPTO_store_u32_be(out + 4 * word, c->h[word]);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
int SHA1_get_state(SHA_CTX *ctx, uint8_t out_h[SHA1_CHAINING_LENGTH],
                   uint64_t *out_n) {
  // The chaining value is only meaningful on a block boundary. Nl holds the
  // low 32 bits of the bit count and the block bit size divides 2^32, so
  // checking Nl alone is sufficient.
  if (ctx->Nl % ((uint64_t)SHA_CBLOCK * 8) != 0) {
    return 0;
  }
  // Serialize h[0..4] big-endian.
  for (size_t word = 0; word < SHA1_CHAINING_LENGTH / 4; word++) {
    CRYPTO_store_u32_be(out_h + 4 * word, ctx->h[word]);
  }
  // Recombine the split 32-bit counters into the 64-bit bit count.
  *out_n = (((uint64_t)ctx->Nh) << 32) + ctx->Nl;
  return 1;
}
// Xupdate computes the next message-schedule word: the XOR of four earlier
// words rotated left by one (FIPS 180-4, section 6.1.2 step 1), storing it
// into both |a| and the schedule slot |ix|.
#define Xupdate(a, ix, ia, ib, ic, id)    \
  do {                                    \
    (a) = ((ia) ^ (ib) ^ (ic) ^ (id));    \
    (ix) = (a) = CRYPTO_rotl_u32((a), 1); \
  } while (0)
// SHA-1 round constants for the four 20-round phases (FIPS 180-4, 4.2.1).
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
// As pointed out by Wei Dai <weidai@eskimo.com>, F() below can be simplified
// to the code in F_00_19. Wei attributes these optimisations to Peter
// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define
// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another
// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a
#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b, c, d) ((b) ^ (c) ^ (d))
#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d)))
#define F_60_79(b, c, d) F_20_39(b, c, d)
// BODY_00_15 is one round using a freshly loaded message word |xi| (no
// schedule update yet). |f| receives the new working value; |b| is rotated.
#define BODY_00_15(i, a, b, c, d, e, f, xi)               \
  do {                                                    \
    (f) = (xi) + (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + \
          F_00_19((b), (c), (d));                         \
    (b) = CRYPTO_rotl_u32((b), 30);                       \
  } while (0)
// BODY_16_19 combines the schedule update (Xupdate) with a phase-1 round.
#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
  do {                                                                       \
    Xupdate(f, xi, xa, xb, xc, xd);                                          \
    (f) += (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + F_00_19((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
// BODY_20_31 stores the updated schedule word into a distinct slot |xi|.
#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd)                  \
  do {                                                                       \
    Xupdate(f, xi, xa, xb, xc, xd);                                          \
    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
// From round 32 on, the updated word overwrites |xa| itself, so the macros
// below take one fewer schedule argument.
#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
  do {                                                                       \
    Xupdate(f, xa, xa, xb, xc, xd);                                          \
    (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd)                      \
  do {                                                                       \
    Xupdate(f, xa, xa, xb, xc, xd);                                          \
    (f) += (e) + K_40_59 + CRYPTO_rotl_u32((a), 5) + F_40_59((b), (c), (d)); \
    (b) = CRYPTO_rotl_u32((b), 30);                                          \
  } while (0)
#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd)    \
  do {                                                     \
    Xupdate(f, xa, xa, xb, xc, xd);                        \
    (f) = (xa) + (e) + K_60_79 + CRYPTO_rotl_u32((a), 5) + \
          F_60_79((b), (c), (d));                          \
    (b) = CRYPTO_rotl_u32((b), 30);                        \
  } while (0)
#ifdef X
#undef X
#endif
/* Originally X was an array. As it's automatic it's natural
 * to expect RISC compiler to accomodate at least part of it in
 * the register bank, isn't it? Unfortunately not all compilers
 * "find" this expectation reasonable:-( On order to make such
 * compilers generate better code I replace X[] with a bunch of
 * X0, X1, etc. See the function body below...
 * <appro@fy.chalmers.se> */
#define X(i) XX##i
#if !defined(SHA1_ASM) && !defined(SHA1_ALTIVEC)
#if !defined(SHA1_ASM_NOHW)
// sha1_block_data_order_nohw is the portable C SHA-1 compression function:
// it hashes |num| 64-byte blocks from |data| into the five-word chaining
// value |state|. All 80 rounds are fully unrolled; instead of rotating the
// five working variables, each round's argument list is rotated so that |T|
// always receives the new value.
static void sha1_block_data_order_nohw(uint32_t state[5], const uint8_t *data,
                                       size_t num) {
  register uint32_t A, B, C, D, E, T;
  // 16-word sliding message-schedule window, held in scalars XX0..XX15 and
  // accessed via the X(i) macro (see the comment above its definition).
  uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
      XX11, XX12, XX13, XX14, XX15;

  A = state[0];
  B = state[1];
  C = state[2];
  D = state[3];
  E = state[4];

  for (;;) {
    // Rounds 0-15: big-endian loads interleaved with the rounds.
    X(0) = CRYPTO_load_u32_be(data);
    data += 4;
    X(1) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(0, A, B, C, D, E, T, X(0));
    X(2) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(1, T, A, B, C, D, E, X(1));
    X(3) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(2, E, T, A, B, C, D, X(2));
    X(4) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(3, D, E, T, A, B, C, X(3));
    X(5) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(4, C, D, E, T, A, B, X(4));
    X(6) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(5, B, C, D, E, T, A, X(5));
    X(7) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(6, A, B, C, D, E, T, X(6));
    X(8) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(7, T, A, B, C, D, E, X(7));
    X(9) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(8, E, T, A, B, C, D, X(8));
    X(10) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(9, D, E, T, A, B, C, X(9));
    X(11) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(10, C, D, E, T, A, B, X(10));
    X(12) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(11, B, C, D, E, T, A, X(11));
    X(13) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(12, A, B, C, D, E, T, X(12));
    X(14) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(13, T, A, B, C, D, E, X(13));
    X(15) = CRYPTO_load_u32_be(data);
    data += 4;
    BODY_00_15(14, E, T, A, B, C, D, X(14));
    BODY_00_15(15, D, E, T, A, B, C, X(15));
    // Rounds 16-19: schedule expansion begins, still with F_00_19/K_00_19.
    BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13));
    BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14));
    BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15));
    BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0));
    // Rounds 20-39: parity function with K_20_39.
    BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1));
    BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2));
    BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3));
    BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4));
    BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5));
    BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6));
    BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7));
    BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8));
    BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9));
    BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10));
    BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11));
    BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12));
    BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13));
    BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14));
    BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15));
    BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0));
    BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1));
    BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2));
    BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3));
    BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4));
    // Rounds 40-59: majority function with K_40_59.
    BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5));
    BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6));
    BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7));
    BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8));
    BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9));
    BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X(5), X(10));
    BODY_40_59(46, C, D, E, T, A, B, X(14), X(0), X(6), X(11));
    BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12));
    BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13));
    BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14));
    BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15));
    BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0));
    BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1));
    BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2));
    BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3));
    BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4));
    BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5));
    BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6));
    BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7));
    BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8));
    // Rounds 60-79: parity function with K_60_79.
    BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9));
    BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10));
    BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11));
    BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12));
    BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13));
    BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14));
    BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15));
    BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0));
    BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1));
    BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2));
    BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3));
    BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4));
    BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5));
    BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6));
    BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7));
    BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8));
    BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9));
    BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10));
    BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11));
    BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12));
    // Fold the working variables back into the chaining value; the round
    // rotation leaves the logical values in (E, T, A, B, C). Masking is a
    // no-op for 32-bit state words.
    state[0] = (state[0] + E) & 0xffffffffL;
    state[1] = (state[1] + T) & 0xffffffffL;
    state[2] = (state[2] + A) & 0xffffffffL;
    state[3] = (state[3] + B) & 0xffffffffL;
    state[4] = (state[4] + C) & 0xffffffffL;
    if (--num == 0) {
      break;
    }
    A = state[0];
    B = state[1];
    C = state[2];
    D = state[3];
    E = state[4];
  }
}
#endif // !SHA1_ASM_NOHW
// sha1_block_data_order dispatches to the fastest compiled-in SHA-1
// implementation the running CPU supports, in priority order: dedicated SHA
// instructions, AVX2, AVX, SSSE3, NEON, then the portable C fallback.
static void sha1_block_data_order(uint32_t state[5], const uint8_t *data,
                                  size_t num) {
#if defined(SHA1_ASM_HW)
  if (sha1_hw_capable()) {
    sha1_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha1_avx2_capable()) {
    sha1_block_data_order_avx2(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha1_avx_capable()) {
    sha1_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_SSSE3)
  if (sha1_ssse3_capable()) {
    sha1_block_data_order_ssse3(state, data, num);
    return;
  }
#endif
#if defined(SHA1_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha1_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha1_block_data_order_nohw(state, data, num);
}
#endif // !SHA1_ASM && !SHA1_ALTIVEC
#undef Xupdate
#undef K_00_19
#undef K_20_39
#undef K_40_59
#undef K_60_79
#undef F_00_19
#undef F_20_39
#undef F_40_59
#undef F_60_79
#undef BODY_00_15
#undef BODY_16_19
#undef BODY_20_31
#undef BODY_32_39
#undef BODY_40_59
#undef BODY_60_79
#undef X

View File

@@ -0,0 +1,378 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "../../internal.h"
#include "../digest/md32_common.h"
#include "internal.h"
int SHA224_Init(SHA256_CTX *sha) {
  // FIPS 180-4 section 5.3.2 initial hash value for SHA-224.
  static const uint32_t kIV224[8] = {
      0xc1059ed8UL, 0x367cd507UL, 0x3070dd17UL, 0xf70e5939UL,
      0xffc00b31UL, 0x68581511UL, 0x64f98fa7UL, 0xbefa4fa4UL};
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV224[i];
  }
  // md_len distinguishes a SHA-224 context from SHA-256 at finalization.
  sha->md_len = SHA224_DIGEST_LENGTH;
  return 1;
}
int SHA256_Init(SHA256_CTX *sha) {
  // FIPS 180-4 section 5.3.3 initial hash value for SHA-256.
  static const uint32_t kIV256[8] = {
      0x6a09e667UL, 0xbb67ae85UL, 0x3c6ef372UL, 0xa54ff53aUL,
      0x510e527fUL, 0x9b05688cUL, 0x1f83d9abUL, 0x5be0cd19UL};
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV256[i];
  }
  // md_len distinguishes a SHA-256 context from SHA-224 at finalization.
  sha->md_len = SHA256_DIGEST_LENGTH;
  return 1;
}
// SHA-224 and SHA-256 share SHA256_CTX, so their serialized chaining values
// must be the same size for the shared *_Init_from_state/_get_state helpers
// below to be correct.
OPENSSL_STATIC_ASSERT(SHA256_CHAINING_LENGTH==SHA224_CHAINING_LENGTH,
                      sha256_and_sha224_have_same_chaining_length)
// sha256_init_from_state_impl is the implementation of
// SHA256_Init_from_state and SHA224_Init_from_state
// Note that the state h is always SHA256_CHAINING_LENGTH-byte long
static int sha256_init_from_state_impl(SHA256_CTX *sha, int md_len,
                                       const uint8_t h[SHA256_CHAINING_LENGTH],
                                       uint64_t n) {
  // |n| counts bits hashed so far; restoring mid-block is unsupported.
  if (n % ((uint64_t)SHA256_CBLOCK * 8) != 0) {
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA256_CTX));
  sha->md_len = md_len;
  // Deserialize the big-endian chaining value into h[0..7].
  for (size_t word = 0; word < SHA256_CHAINING_LENGTH / 4; word++) {
    sha->h[word] = CRYPTO_load_u32_be(h + 4 * word);
  }
  // Split the 64-bit bit count across the two 32-bit counters.
  sha->Nh = (uint32_t)(n >> 32);
  sha->Nl = (uint32_t)(n & 0xffffffff);
  return 1;
}
// SHA224_Init_from_state restores a SHA-224 context from a serialized
// chaining value |h| and a previously hashed bit count |n| (which must be a
// multiple of the block bit size; see sha256_init_from_state_impl).
int SHA224_Init_from_state(SHA256_CTX *sha,
                           const uint8_t h[SHA224_CHAINING_LENGTH],
                           uint64_t n) {
  return sha256_init_from_state_impl(sha, SHA224_DIGEST_LENGTH, h, n);
}
// SHA256_Init_from_state is the SHA-256 counterpart of the function above.
int SHA256_Init_from_state(SHA256_CTX *sha,
                           const uint8_t h[SHA256_CHAINING_LENGTH],
                           uint64_t n) {
  return sha256_init_from_state_impl(sha, SHA256_DIGEST_LENGTH, h, n);
}
uint8_t *SHA224(const uint8_t *data, size_t len,
                uint8_t out[SHA224_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA256_CTX ctx;
  int ok = SHA224_Init(&ctx);
  ok = ok && SHA224_Update(&ctx, data, len);
  ok = ok && SHA224_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
uint8_t *SHA256(const uint8_t *data, size_t len,
                uint8_t out[SHA256_DIGEST_LENGTH]) {
  // We have to verify that all the SHA services actually succeed before
  // updating the indicator state, so we lock the state here.
  FIPS_service_indicator_lock_state();
  SHA256_CTX ctx;
  int ok = SHA256_Init(&ctx);
  ok = ok && SHA256_Update(&ctx, data, len);
  ok = ok && SHA256_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub the intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA256_ASM)
static void sha256_block_data_order(uint32_t state[8], const uint8_t *in,
size_t num);
#endif
// SHA256_Transform runs the compression function over exactly one 64-byte
// block, updating only the chaining value in |c| (not the length counters).
void SHA256_Transform(SHA256_CTX *c, const uint8_t data[SHA256_CBLOCK]) {
  sha256_block_data_order(c->h, data, 1);
}
// SHA256_Update absorbs |len| bytes, buffering partial blocks in |c->data|
// and tracking the running bit count in Nh/Nl via crypto_md32_update.
int SHA256_Update(SHA256_CTX *c, const void *data, size_t len) {
  crypto_md32_update(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK,
                     &c->num, &c->Nh, &c->Nl, data, len);
  return 1;
}
// SHA-224 shares the SHA-256 context and compression function, so updating
// is identical.
int SHA224_Update(SHA256_CTX *ctx, const void *data, size_t len) {
  return SHA256_Update(ctx, data, len);
}
// sha256_final_impl finalizes |c| and writes the first |md_len| digest
// bytes to |out|, failing if the context was initialized for a different
// digest length.
static int sha256_final_impl(uint8_t *out, size_t md_len, SHA256_CTX *c) {
  // Flush the padding/length block first; the context is finalized even if
  // the length check below rejects the call.
  crypto_md32_final(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK,
                    &c->num, c->Nh, c->Nl, /*is_big_endian=*/1);
  // Reject a caller that mixes SHA-224 and SHA-256 entry points.
  if (c->md_len != md_len) {
    return 0;
  }
  assert(md_len % 4 == 0);
  // Serialize the leading chaining words big-endian into the digest.
  for (size_t word = 0; word < md_len / 4; word++) {
    CRYPTO_store_u32_be(out + 4 * word, c->h[word]);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
// SHA256_Final writes the 32-byte digest; fails (returns 0) if |c| was not
// initialized as a SHA-256 context.
int SHA256_Final(uint8_t out[SHA256_DIGEST_LENGTH], SHA256_CTX *c) {
  return sha256_final_impl(out, SHA256_DIGEST_LENGTH, c);
}
// SHA224_Final writes the 28-byte digest; fails (returns 0) if |ctx| was not
// initialized as a SHA-224 context.
int SHA224_Final(uint8_t out[SHA224_DIGEST_LENGTH], SHA256_CTX *ctx) {
  return sha256_final_impl(out, SHA224_DIGEST_LENGTH, ctx);
}
// sha256_get_state_impl is the implementation of
// SHA256_get_state and SHA224_get_state
// Note that the state out_h is always SHA256_CHAINING_LENGTH-byte long
static int sha256_get_state_impl(SHA256_CTX *ctx,
                                 uint8_t out_h[SHA256_CHAINING_LENGTH],
                                 uint64_t *out_n) {
  // The chaining value is only meaningful on a block boundary; Nl holds the
  // low 32 bits of the bit count, which is enough for this check.
  if (ctx->Nl % ((uint64_t)SHA256_CBLOCK * 8) != 0) {
    return 0;
  }
  // Serialize h[0..7] big-endian.
  for (size_t word = 0; word < SHA256_CHAINING_LENGTH / 4; word++) {
    CRYPTO_store_u32_be(out_h + 4 * word, ctx->h[word]);
  }
  // Recombine the split 32-bit counters into the 64-bit bit count.
  *out_n = (((uint64_t)ctx->Nh) << 32) + ctx->Nl;
  return 1;
}
// SHA224_get_state serializes the current chaining value and bit count of a
// block-aligned SHA-224 context (see sha256_get_state_impl).
int SHA224_get_state(SHA256_CTX *ctx, uint8_t out_h[SHA224_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha256_get_state_impl(ctx, out_h, out_n);
}
// SHA256_get_state is the SHA-256 counterpart of the function above.
int SHA256_get_state(SHA256_CTX *ctx, uint8_t out_h[SHA256_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha256_get_state_impl(ctx, out_h, out_n);
}
#if !defined(SHA256_ASM)
#if !defined(SHA256_ASM_NOHW)
// K256 holds the 64 SHA-256 round constants of FIPS 180-4, section 4.2.2
// (the first 32 bits of the fractional parts of the cube roots of the
// first 64 primes).
static const uint32_t K256[64] = {
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
    0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
    0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
    0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL,
    0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL,
    0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL,
    0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL,
    0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL,
    0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL,
    0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL};
// See FIPS 180-4, section 4.1.2.
// Sigma0/Sigma1 act on the working variables in each round; sigma0/sigma1
// act on message-schedule words.
#define Sigma0(x)                                       \
  (CRYPTO_rotr_u32((x), 2) ^ CRYPTO_rotr_u32((x), 13) ^ \
   CRYPTO_rotr_u32((x), 22))
#define Sigma1(x)                                       \
  (CRYPTO_rotr_u32((x), 6) ^ CRYPTO_rotr_u32((x), 11) ^ \
   CRYPTO_rotr_u32((x), 25))
#define sigma0(x) \
  (CRYPTO_rotr_u32((x), 7) ^ CRYPTO_rotr_u32((x), 18) ^ ((x) >> 3))
#define sigma1(x) \
  (CRYPTO_rotr_u32((x), 17) ^ CRYPTO_rotr_u32((x), 19) ^ ((x) >> 10))
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
// ROUND_00_15 performs one round; it relies on the caller having preloaded
// T1 with the current schedule word X[i] before invoking it.
#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
  do {                                           \
    T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; \
    h = Sigma0(a) + Maj(a, b, c);                \
    d += T1;                                     \
    h += T1;                                     \
  } while (0)
// ROUND_16_63 additionally expands the 16-word schedule window in place
// (X is used as a ring buffer indexed mod 16) before running the round.
#define ROUND_16_63(i, a, b, c, d, e, f, g, h, X)      \
  do {                                                 \
    s0 = X[(i + 1) & 0x0f];                            \
    s0 = sigma0(s0);                                   \
    s1 = X[(i + 14) & 0x0f];                           \
    s1 = sigma1(s1);                                   \
    T1 = X[(i) & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f]; \
    ROUND_00_15(i, a, b, c, d, e, f, g, h);            \
  } while (0)
// sha256_block_data_order_nohw is the portable C SHA-256 compression
// function: it hashes |num| 64-byte blocks from |data| into the eight-word
// chaining value |state|. The first 16 rounds are unrolled and interleaved
// with the big-endian loads; rounds 16-63 run eight at a time so the
// working-variable rotation lines up each iteration.
static void sha256_block_data_order_nohw(uint32_t state[8], const uint8_t *data,
                                         size_t num) {
  uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
  // 16-word message-schedule ring buffer (indexed mod 16 by ROUND_16_63).
  uint32_t X[16];
  int i;

  while (num--) {
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];

    // Rounds 0-15: T1 is preloaded with each schedule word, as
    // ROUND_00_15 expects.
    T1 = X[0] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(0, a, b, c, d, e, f, g, h);
    T1 = X[1] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(1, h, a, b, c, d, e, f, g);
    T1 = X[2] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(2, g, h, a, b, c, d, e, f);
    T1 = X[3] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(3, f, g, h, a, b, c, d, e);
    T1 = X[4] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(4, e, f, g, h, a, b, c, d);
    T1 = X[5] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(5, d, e, f, g, h, a, b, c);
    T1 = X[6] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(6, c, d, e, f, g, h, a, b);
    T1 = X[7] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(7, b, c, d, e, f, g, h, a);
    T1 = X[8] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(8, a, b, c, d, e, f, g, h);
    T1 = X[9] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(9, h, a, b, c, d, e, f, g);
    T1 = X[10] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(10, g, h, a, b, c, d, e, f);
    T1 = X[11] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(11, f, g, h, a, b, c, d, e);
    T1 = X[12] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(12, e, f, g, h, a, b, c, d);
    T1 = X[13] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(13, d, e, f, g, h, a, b, c);
    T1 = X[14] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(14, c, d, e, f, g, h, a, b);
    T1 = X[15] = CRYPTO_load_u32_be(data);
    data += 4;
    ROUND_00_15(15, b, c, d, e, f, g, h, a);

    // Rounds 16-63, eight per iteration so the argument rotation repeats.
    for (i = 16; i < 64; i += 8) {
      ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X);
      ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X);
      ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X);
      ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X);
      ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X);
      ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X);
      ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X);
      ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X);
    }

    // Fold the working variables back into the chaining value.
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
  }
}
#endif // !defined(SHA256_ASM_NOHW)
// sha256_block_data_order dispatches to the fastest compiled-in SHA-256
// implementation the running CPU supports, in priority order: dedicated SHA
// instructions, AVX, SSSE3, NEON, then the portable C fallback.
static void sha256_block_data_order(uint32_t state[8], const uint8_t *data,
                                    size_t num) {
#if defined(SHA256_ASM_HW)
  if (sha256_hw_capable()) {
    sha256_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha256_avx_capable()) {
    sha256_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_SSSE3)
  if (sha256_ssse3_capable()) {
    sha256_block_data_order_ssse3(state, data, num);
    return;
  }
#endif
#if defined(SHA256_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha256_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha256_block_data_order_nohw(state, data, num);
}
#endif // !defined(SHA256_ASM)
// SHA256_TransformBlocks runs the compression function over |num_blocks|
// consecutive 64-byte blocks, updating |state| in place. Unlike the
// SHA256_CTX-based API, it performs no buffering or length tracking.
void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data,
                            size_t num_blocks) {
  sha256_block_data_order(state, data, num_blocks);
}
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj
#undef ROUND_00_15
#undef ROUND_16_63

View File

@@ -0,0 +1,520 @@
// Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
#include "internal.h"
#include <string.h>
uint8_t *SHA3_224(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_224_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_224_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_256(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_256_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_256_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_384(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_384_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_384_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHA3_512(const uint8_t *data, size_t len,
                  uint8_t out[SHA3_512_DIGEST_LENGTH]) {
  // Run Init/Update/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHA3_Init(&ctx, SHA3_512_DIGEST_BITLENGTH) &&
                 SHA3_Update(&ctx, data, len) &&
                 SHA3_Final(out, &ctx);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHAKE128(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len) {
  // Run Init/Absorb/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHAKE_Init(&ctx, SHAKE128_BLOCKSIZE) &&
                 SHAKE_Absorb(&ctx, data, in_len) &&
                 SHAKE_Final(out, &ctx, out_len);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
uint8_t *SHAKE256(const uint8_t *data, const size_t in_len, uint8_t *out, size_t out_len) {
  // Run Init/Absorb/Final under the FIPS indicator lock so a failure in any
  // step leaves the approved-service indicator untouched.
  FIPS_service_indicator_lock_state();
  KECCAK1600_CTX ctx;
  const int ok = SHAKE_Init(&ctx, SHAKE256_BLOCKSIZE) &&
                 SHAKE_Absorb(&ctx, data, in_len) &&
                 SHAKE_Final(out, &ctx, out_len);
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  FIPS_service_indicator_unlock_state();
  if (!ok) {
    return NULL;
  }
  FIPS_service_indicator_update_state();
  return out;
}
/*
* FIPS202 APIs manage internal input/output buffer on top of Keccak1600 API layer
*/
// FIPS202_Reset clears the Keccak state and buffer bookkeeping in |ctx| and
// returns the context to the absorb phase.
static void FIPS202_Reset(KECCAK1600_CTX *ctx) {
  ctx->buf_load = 0;
  ctx->state = KECCAK1600_STATE_ABSORB;
  OPENSSL_memset(ctx->A, 0, sizeof(ctx->A));
}
// FIPS202_Init checks the correctness of the padding character and size of
// the internal buffer. It initialises the |ctx| fields and returns 1 on
// success and 0 on failure.
static int FIPS202_Init(KECCAK1600_CTX *ctx, uint8_t pad, size_t block_size, size_t bit_len) {
  // Only the two FIPS 202 domain-separation pad bytes are accepted.
  if (pad != SHA3_PAD_CHAR && pad != SHAKE_PAD_CHAR) {
    return 0;
  }
  // The rate (block size) must fit the internal staging buffer.
  if (block_size > sizeof(ctx->buf)) {
    return 0;
  }
  FIPS202_Reset(ctx);
  ctx->block_size = block_size;
  ctx->md_size = bit_len / 8;
  ctx->pad = pad;
  return 1;
}
// FIPS202_Update checks the state of the |ctx| and processes intermediate buffer from
// previous calls. It processes |data| in blocks through |Keccak1600_Absorb| and places
// the rest in the intermediate buffer. FIPS202_Update fails if called from inappropriate
// |ctx->state| or on |Keccak1600_Absorb| error. Otherwise, it returns 1.
static int FIPS202_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  // Fix: read the input through a const-qualified pointer instead of casting
  // away the caller's const; |data| is only ever read.
  const uint8_t *in = (const uint8_t *)data;
  size_t block_size = ctx->block_size;
  size_t num, rem;

  // Absorbing is only legal before squeezing/finalization has begun.
  if (ctx->state == KECCAK1600_STATE_SQUEEZE ||
      ctx->state == KECCAK1600_STATE_FINAL ) {
    return 0;
  }

  // Case |len| equals 0 is checked in SHA3/SHAKE higher level APIs
  // Process intermediate buffer.
  num = ctx->buf_load;
  if (num != 0) {
    rem = block_size - num;
    if (len < rem) {
      // Still short of a full block: just buffer the bytes.
      OPENSSL_memcpy(ctx->buf + num, in, len);
      ctx->buf_load += len;
      return 1;
    }
    // There is enough data to fill or overflow the intermediate
    // buffer. So we append |rem| bytes and process the block,
    // leaving the rest for later processing.
    OPENSSL_memcpy(ctx->buf + num, in, rem);
    in += rem, len -= rem;
    if (Keccak1600_Absorb(ctx->A, ctx->buf, block_size, block_size) != 0 ) {
      return 0;
    }
    ctx->buf_load = 0;
    // ctx->buf is processed, ctx->buf_load is guaranteed to be zero
  }

  // Absorb whole blocks directly from the input; |Keccak1600_Absorb|
  // returns the number of unprocessed tail bytes.
  if (len >= block_size) {
    rem = Keccak1600_Absorb(ctx->A, in, len, block_size);
  } else {
    rem = len;
  }

  // Stash any tail bytes for the next call or for finalization.
  if (rem != 0) {
    OPENSSL_memcpy(ctx->buf, in + len - rem, rem);
    ctx->buf_load = rem;
  }

  return 1;
}
// FIPS202_Finalize pads and absorbs the final partial input block, ending
// the absorb phase so squeezing can begin. It must be called at most once;
// it fails (returns 0) if squeezing has already started or on a
// |Keccak1600_Absorb| error. |md| is currently unused here; the caller
// performs the actual squeeze. Returns 1 on success.
static int FIPS202_Finalize(uint8_t *md, KECCAK1600_CTX *ctx) {
  const size_t block_size = ctx->block_size;
  const size_t used = ctx->buf_load;

  if (ctx->state == KECCAK1600_STATE_SQUEEZE ||
      ctx->state == KECCAK1600_STATE_FINAL) {
    return 0;
  }

  // Apply pad10*1 domain padding. When |used| == block_size - 1, the pad
  // byte and the final 0x80 are OR'd into the same byte.
  OPENSSL_memset(ctx->buf + used, 0, block_size - used);
  ctx->buf[used] = ctx->pad;
  ctx->buf[block_size - 1] |= 0x80;

  if (Keccak1600_Absorb(ctx->A, ctx->buf, block_size, block_size) != 0) {
    return 0;
  }

  // The padded block has been absorbed; the staging buffer is empty.
  ctx->buf_load = 0;
  return 1;
}
/*
* SHA3 APIs implement SHA3 functionalities on top of FIPS202 API layer
*/
// SHA3_Init initialises |ctx| for one of the four fixed-output SHA3
// variants selected by |bit_len|. Returns 1 on success, 0 otherwise.
int SHA3_Init(KECCAK1600_CTX *ctx, size_t bit_len) {
  if (ctx == NULL) {
    return 0;
  }
  switch (bit_len) {
    case SHA3_224_DIGEST_BITLENGTH:
    case SHA3_256_DIGEST_BITLENGTH:
    case SHA3_384_DIGEST_BITLENGTH:
    case SHA3_512_DIGEST_BITLENGTH:
      // The rate (block size) is derived from the digest length.
      return FIPS202_Init(ctx, SHA3_PAD_CHAR, SHA3_BLOCKSIZE(bit_len), bit_len);
    default:
      return 0;
  }
}
// SHA3_Update absorbs |len| bytes of |data|. A NULL |data| is only
// accepted for an empty update. Returns 1 on success, 0 on failure.
int SHA3_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  if (ctx == NULL || (data == NULL && len != 0)) {
    return 0;
  }
  if (len == 0) {
    return 1;  // Nothing to absorb.
  }
  return FIPS202_Update(ctx, data, len);
}
// SHA3_Final pads and absorbs the remaining input, then squeezes the
// |ctx->md_size|-byte digest into |md|. It must be called at most once;
// the context ends in the FINAL state. Returns 1 on success, 0 on failure.
int SHA3_Final(uint8_t *md, KECCAK1600_CTX *ctx) {
  if (md == NULL || ctx == NULL) {
    return 0;
  }
  if (ctx->md_size == 0) {
    // Zero-length digest: nothing to write.
    return 1;
  }
  if (!FIPS202_Finalize(md, ctx)) {
    return 0;
  }
  Keccak1600_Squeeze(ctx->A, md, ctx->md_size, ctx->block_size, ctx->state);
  ctx->state = KECCAK1600_STATE_FINAL;
  FIPS_service_indicator_update_state();
  return 1;
}
// The SHA3-224/256/384/512 entry points are thin wrappers around the
// generic SHA3_Init/Update/Final functions with the digest length fixed.

int SHA3_224_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_224_DIGEST_BITLENGTH);
}

int SHA3_224_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_224_Final(uint8_t out[SHA3_224_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_256_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_256_DIGEST_BITLENGTH);
}

int SHA3_256_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_256_Final(uint8_t out[SHA3_256_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_384_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_384_DIGEST_BITLENGTH);
}

int SHA3_384_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_384_Final(uint8_t out[SHA3_384_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}

int SHA3_512_Init(KECCAK1600_CTX *ctx) {
  return SHA3_Init(ctx, SHA3_512_DIGEST_BITLENGTH);
}

int SHA3_512_Update(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  return SHA3_Update(ctx, data, len);
}

int SHA3_512_Final(uint8_t out[SHA3_512_DIGEST_LENGTH], KECCAK1600_CTX *ctx) {
  return SHA3_Final(out, ctx);
}
/*
* SHAKE APIs implement SHAKE functionalities on top of FIPS202 API layer
*/
// SHAKE_Init initialises |ctx| for SHAKE128 or SHAKE256, selected by the
// rate |block_size|. The XOF output length is chosen at finalisation, so
// the digest bit length passed to FIPS202_Init is 0. Returns 1 on success.
int SHAKE_Init(KECCAK1600_CTX *ctx, size_t block_size) {
  if (ctx == NULL) {
    return 0;
  }
  // Only the two standard SHAKE rates are supported.
  if (block_size == SHAKE128_BLOCKSIZE || block_size == SHAKE256_BLOCKSIZE) {
    return FIPS202_Init(ctx, SHAKE_PAD_CHAR, block_size, 0);
  }
  return 0;
}
// SHAKE_Absorb absorbs |len| bytes of |data|. A NULL |data| is only
// accepted for an empty update. Returns 1 on success, 0 on failure.
int SHAKE_Absorb(KECCAK1600_CTX *ctx, const void *data, size_t len) {
  if (ctx == NULL || (data == NULL && len != 0)) {
    return 0;
  }
  if (len == 0) {
    return 1;  // Nothing to absorb.
  }
  return FIPS202_Update(ctx, data, len);
}
// SHAKE_Final finalises the absorb phase and squeezes |len| output bytes
// into |md| in one shot, leaving |ctx| in the FINAL state so further
// squeezes are rejected. For incremental XOF output use |SHAKE_Squeeze|
// instead. Returns 1 on success, 0 on failure.
int SHAKE_Final(uint8_t *md, KECCAK1600_CTX *ctx, size_t len) {
  if (ctx == NULL || md == NULL) {
    return 0;
  }
  ctx->md_size = len;
  if (len == 0) {
    return 1;  // Zero-length output: nothing to write.
  }
  if (!FIPS202_Finalize(md, ctx)) {
    return 0;
  }
  Keccak1600_Squeeze(ctx->A, md, ctx->md_size, ctx->block_size, ctx->state);
  ctx->state = KECCAK1600_STATE_FINAL;
  FIPS_service_indicator_update_state();
  return 1;
}
// SHAKE_Squeeze can be called multiple times for incremental XOF output.
// On the first call it pads and absorbs the final input block; afterwards
// it serves output bytes, buffering any unconsumed tail of the last
// squeezed block in |ctx->buf| for subsequent calls. It fails after
// |SHAKE_Final| has been used (FINAL state). Returns 1 on success.
int SHAKE_Squeeze(uint8_t *md, KECCAK1600_CTX *ctx, size_t len) {
  size_t block_bytes;
  if (ctx == NULL || md == NULL) {
    return 0;
  }
  ctx->md_size = len;
  if (ctx->md_size == 0) {
    // Zero-length request: nothing to write.
    return 1;
  }
  if (ctx->state == KECCAK1600_STATE_FINAL) {
    return 0;
  }
  // Skip FIPS202_Finalize if the input has already been padded and
  // the last block has been processed (i.e. not the first squeeze).
  if (ctx->state == KECCAK1600_STATE_ABSORB) {
    if (FIPS202_Finalize(md, ctx) == 0) {
      return 0;
    }
  }
  // Serve bytes left over in the output buffer from a previous partial
  // squeeze, if any. The unconsumed tail sits at the end of |ctx->buf|.
  if (ctx->buf_load != 0) {
    if (len <= ctx->buf_load) {
      // The buffered bytes fully satisfy this request.
      OPENSSL_memcpy(md, ctx->buf + ctx->block_size - ctx->buf_load, len);
      ctx->buf_load -= len;
      return 1;
    } else {
      // Drain the buffer and fall through to squeeze fresh output.
      OPENSSL_memcpy(md, ctx->buf + ctx->block_size - ctx->buf_load, ctx->buf_load);
      md += ctx->buf_load;
      len -= ctx->buf_load;
      ctx->buf_load = 0;
    }
  }
  // Squeeze all requested whole blocks directly into |md|.
  if (len > ctx->block_size) {
    block_bytes = ctx->block_size * (len / ctx->block_size);
    Keccak1600_Squeeze(ctx->A, md, block_bytes, ctx->block_size, ctx->state);
    md += block_bytes;
    len -= block_bytes;
    ctx->state = KECCAK1600_STATE_SQUEEZE;
  }
  if (len > 0) {
    // Squeeze an additional block when the remaining request is not a
    // multiple of the block size. The block is generated into |ctx->buf|;
    // only the requested bytes are copied out and the 'unused' tail is
    // kept for a sequential (incremental byte-wise) SHAKE_Squeeze call.
    Keccak1600_Squeeze(ctx->A, ctx->buf, ctx->block_size, ctx->block_size, ctx->state);
    OPENSSL_memcpy(md, ctx->buf, len);
    ctx->buf_load = ctx->block_size - len;  // Bytes still buffered for later consumption.
    ctx->state = KECCAK1600_STATE_SQUEEZE;
  }
  //FIPS_service_indicator_update_state();
  return 1;
}
/*
* SHAKE batched (x4) APIs implement SHAKE functionalities in batches of four on top of SHAKE API layer
*/
// SHAKE128_Init_x4 zeroes all four lanes of the batched context.
// Always returns 1.
int SHAKE128_Init_x4(KECCAK1600_CTX_x4 *ctx) {
  OPENSSL_memset(ctx, 0, sizeof(*ctx));
  return 1;
}

// SHAKE128_Absorb_once_x4 absorbs |len| bytes from each of the four
// inputs in a single shot (no incremental absorbing). Always returns 1.
int SHAKE128_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
                            const void *data2, const void *data3, size_t len) {
  Keccak1600_Absorb_once_x4(ctx->A, data0, data1, data2, data3, len,
                            SHAKE128_BLOCKSIZE, SHAKE_PAD_CHAR);
  return 1;
}

// SHAKE128_Squeezeblocks_x4 squeezes |blks| whole SHAKE128 blocks into
// each of the four outputs. Always returns 1.
int SHAKE128_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
                              KECCAK1600_CTX_x4 *ctx, size_t blks) {
  Keccak1600_Squeezeblocks_x4(ctx->A, md0, md1, md2, md3, blks, SHAKE128_BLOCKSIZE);
  return 1;
}
// SHAKE256_Absorb_once_x4 absorbs |len| bytes from each of the four
// inputs in a single shot at the SHAKE256 rate. Always returns 1.
static int SHAKE256_Absorb_once_x4(KECCAK1600_CTX_x4 *ctx, const void *data0, const void *data1,
                                   const void *data2, const void *data3, size_t len) {
  Keccak1600_Absorb_once_x4(ctx->A, data0, data1, data2, data3,
                            len, SHAKE256_BLOCKSIZE, SHAKE_PAD_CHAR);
  return 1;
}

// SHAKE256_Squeezeblocks_x4 squeezes |blks| whole SHAKE256 blocks into
// each of the four outputs. Always returns 1.
static int SHAKE256_Squeezeblocks_x4(uint8_t *md0, uint8_t *md1, uint8_t *md2, uint8_t *md3,
                                     KECCAK1600_CTX_x4 *ctx, size_t blks) {
  Keccak1600_Squeezeblocks_x4(ctx->A, md0, md1, md2, md3, blks, SHAKE256_BLOCKSIZE);
  return 1;
}
// SHAKE256_x4 computes four independent SHAKE256 outputs of |out_len| bytes
// over four |in_len|-byte inputs in one batched pass. Whole blocks are
// squeezed directly into the outputs; a final partial block (if any) is
// squeezed into stack scratch and copied out. Always returns 1.
int SHAKE256_x4(const uint8_t *data0, const uint8_t *data1, const uint8_t *data2,
                const uint8_t *data3, const size_t in_len,
                uint8_t *out0, uint8_t *out1, uint8_t *out2,
                uint8_t *out3, size_t out_len) {
  KECCAK1600_CTX_x4 ctx;
  OPENSSL_memset(&ctx, 0, sizeof(ctx));

  uint8_t scratch0[SHAKE256_BLOCKSIZE];
  uint8_t scratch1[SHAKE256_BLOCKSIZE];
  uint8_t scratch2[SHAKE256_BLOCKSIZE];
  uint8_t scratch3[SHAKE256_BLOCKSIZE];

  const size_t nblocks = out_len / SHAKE256_BLOCKSIZE;
  const size_t whole_bytes = nblocks * SHAKE256_BLOCKSIZE;
  const size_t tail = out_len - whole_bytes;

  SHAKE256_Absorb_once_x4(&ctx, data0, data1, data2, data3, in_len);
  SHAKE256_Squeezeblocks_x4(out0, out1, out2, out3, &ctx, nblocks);

  if (tail > 0) {
    // Squeeze one more block and copy only the bytes still needed.
    SHAKE256_Squeezeblocks_x4(scratch0, scratch1, scratch2, scratch3, &ctx, 1);
    OPENSSL_memcpy(out0 + whole_bytes, scratch0, tail);
    OPENSSL_memcpy(out1 + whole_bytes, scratch1, tail);
    OPENSSL_memcpy(out2 + whole_bytes, scratch2, tail);
    OPENSSL_memcpy(out3 + whole_bytes, scratch3, tail);
  }

  // Scrub all key-stream material from the stack.
  OPENSSL_cleanse(scratch0, sizeof(scratch0));
  OPENSSL_cleanse(scratch1, sizeof(scratch1));
  OPENSSL_cleanse(scratch2, sizeof(scratch2));
  OPENSSL_cleanse(scratch3, sizeof(scratch3));
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return 1;
}

View File

@@ -0,0 +1,585 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#include <openssl/evp.h>
#include <openssl/rand.h>
#include <openssl/sha.h>
#include <gtest/gtest.h>
#include <openssl/digest.h>
#include "../../test/file_test.h"
#include "../../test/test_util.h"
#include "internal.h"
// Set values for input/output lengths used in
// |NISTTestVectors_SHAKESqueeze| test function
#define RAND_BYTES 256
#define RAND_OUT_BYTES 256
#define RAND_BYTES_x4 34
#define RAND_OUT_BLCKS 6
#define BATCHED_x4 4
#define NUM_TESTS 10
// Table containing the length of the output to squeeze for the
// initial call, followed by an output length for each subsequent call.
// The values straddle the SHAKE128 (168-byte) and SHAKE256 (136-byte)
// rates so that partial-block buffering and block-boundary paths of the
// incremental squeeze are all exercised.
static const struct {
  size_t startsz, incsz;
} stride_tests[] = {
  // Test Edge Cases for SHAKE128 with blocksize of 168B
  { 1, 1 },
  { 8, 8 },
  { 9, 9 },
  { 10, 10 },
  { 1, 168 },
  { 1, 168/2 },
  { 1, 168/2-1 },
  { 1, 168/2+1 },
  { 1, 168*3 },
  { 168/2 - 1, 168 },
  { 168/2 - 1, 168-1 },
  { 168/2 - 1, 168+1 },
  { 168/2, 168 },
  { 168/2, 168-1 },
  { 168/2, 168+1 },
  { 168/2 + 1, 168 },
  { 168/2 + 1, 168-1 },
  { 168/2 + 1, 168+1 },
  { 168, 2 },
  { 168, 168 },
  { 168-1, 168 },
  { 168-1, 168-1 },
  { 168-1, 168+1 },
  { 168+1, 168 },
  { 168+1, 168-1 },
  { 168+1, 168+1 },
  { 168*3, 168 },
  { 168*3, 168 + 1 },
  { 168*3, 168 - 1 },
  { 168*3, 168/2 },
  { 168*3, 168/2 + 1 },
  { 168*3, 168/2 - 1 },
  // Test Edge Cases for SHAKE256 with blocksize of 136B
  { 1, 136 },
  { 1, 136/2 },
  { 1, 136/2-1 },
  { 1, 136/2+1 },
  { 1, 136*3 },
  { 8, 8 },
  { 9, 9 },
  { 10, 10 },
  { 136/2 - 1, 136 },
  { 136/2 - 1, 136-1 },
  { 136/2 - 1, 136+1 },
  { 136/2, 136 },
  { 136/2, 136-1 },
  { 136/2, 136+1 },
  { 136/2 + 1, 136 },
  { 136/2 + 1, 136-1 },
  { 136/2 + 1, 136+1 },
  { 136, 2 },
  { 136, 136 },
  { 136-1, 136 },
  { 136-1, 136-1 },
  { 136-1, 136+1 },
  { 136+1, 136 },
  { 136+1, 136-1 },
  { 136+1, 136+1 },
  { 136*3, 136 },
  { 136*3, 136 + 1 },
  { 136*3, 136 - 1 },
  { 136*3, 136/2 },
  { 136*3, 136/2 + 1 },
  { 136*3, 136/2 - 1 }
};
// SHA3TestVector corresponds to one test case of the NIST published file
// SHA3_256ShortMsg.txt.
// https://csrc.nist.gov/projects/cryptographic-algorithm-validation-program/secure-hashing
class SHA3TestVector {
 public:
  explicit SHA3TestVector() = default;
  ~SHA3TestVector() = default;

  // Parses one test case (Outputlen/Len/Msg/MD attributes) from |t|.
  bool ReadFromFileTest(FileTest *t);

  // Checks a fixed-output SHA3 digest via the Init/Update/Final EVP APIs,
  // and asserts that the XOF-only APIs reject non-XOF digests.
  void NISTTestVectors(const EVP_MD *algorithm) const {
    uint32_t digest_length;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[EVP_MD_size(algorithm)]);
    bssl::ScopedEVP_MD_CTX ctx;
    // Test the correctness via the Init, Update and Final Digest APIs.
    // Note |len_| is the message length in bits, as in the NIST files.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), len_ / 8));
    ASSERT_TRUE(EVP_DigestFinal(ctx.get(), digest.get(), &digest_length));
    ASSERT_EQ(Bytes(digest.get(), EVP_MD_size(algorithm)),
              Bytes(digest_.data(), EVP_MD_size(algorithm)));

    // Test XOF-specific Digest functions with non XOF algorithms.
    // Assert failure when |EVP_DigestSqueeze| or |EVP_DigestFinalXOF|
    // are called with digests different from XOF digests.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_FALSE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
  }

  // Checks a fixed-output SHA3 digest via the one-shot |EVP_Digest| API.
  void NISTTestVectors_SingleShot(const EVP_MD *algorithm) const {
    uint32_t digest_length;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[EVP_MD_size(algorithm)]);
    // Test the correctness via the Single-Shot EVP_Digest APIs.
    ASSERT_TRUE(EVP_Digest(msg_.data(), len_ / 8, digest.get(), &digest_length,
                           algorithm, nullptr));
    ASSERT_EQ(Bytes(digest.get(), EVP_MD_size(algorithm)),
              Bytes(digest_.data(), EVP_MD_size(algorithm)));
  }

  // Checks a SHAKE XOF output via the incremental and one-shot EVP APIs.
  void NISTTestVectors_SHAKE(const EVP_MD *algorithm) const {
    uint32_t digest_length = out_len_ / 8;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[digest_length]);
    bssl::ScopedEVP_MD_CTX ctx;
    // Test the incremental EVP API.
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    // Test the one-shot.
    ASSERT_TRUE(EVP_Digest(msg_.data(), msg_.size(), digest.get(),
                           &digest_length, algorithm, nullptr));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
  }

  // Test SHAKE Squeeze functionality through |EVP_Digest| APIs.
  void NISTTestVectors_SHAKESqueeze(const EVP_MD *algorithm) const {
    uint8_t random_bytes[RAND_BYTES];
    size_t sqd_bytes = 0, cur_test = 0, to_sq_bytes = 0;
    uint32_t digest_length = out_len_ / 8;
    std::unique_ptr<uint8_t[]> digest(new uint8_t[digest_length]);
    std::unique_ptr<uint8_t[]> digest_stream(new uint8_t[RAND_OUT_BYTES]);
    std::unique_ptr<uint8_t[]> digest_single_shot(new uint8_t[RAND_OUT_BYTES]);
    bssl::ScopedEVP_MD_CTX ctx;

    // Test Final XOF
    // Assert fail when |EVP_DigestFinalXOF| is called as a streaming API
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));

    // Test the one-shot
    // Assert success when |EVP_Digest| is called
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_Digest(msg_.data(), msg_.size(), digest.get(),
                           &digest_length, algorithm, nullptr));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Final
    // Assert fail when |EVP_DigestFinal| is called for XOF algorithms
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_FALSE(EVP_DigestFinal(ctx.get(), digest.get(), &digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));

    // Test Final XOF after Squeeze
    // Assert fail when |EVP_DigestFinalXOF| is called after |EVP_DigestSqueeze|
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length/2));
    ASSERT_FALSE(EVP_DigestFinalXOF(ctx.get(), digest.get() + digest_length/2,
                                    digest_length/2));

    // Test Update after Squeeze
    // Assert fail when |EVP_DigestUpdate| is called after |EVP_DigestSqueeze|
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get(), digest_length));
    ASSERT_FALSE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));

    // Test Absorb
    // Assert success when |EVP_DigestUpdate| is called byte-by-byte
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), nullptr, 0));
    for (const char p : msg_) {
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), &p, 1));
    }
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest.get(), digest_length));
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Squeeze
    // Assert success when |EVP_DigestSqueeze| is called byte-by-byte
    OPENSSL_memset(digest.get(), 0, digest_length);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
    for (size_t i = 0; i < digest_length; i++) {
      ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + i, 1));
    }
    EXPECT_EQ(Bytes(digest.get(), digest_length),
              Bytes(digest_.data(), digest_length));

    // Test Squeeze
    // Assert success when |EVP_DigestSqueeze| is called in set byte increments
    for (cur_test = 0, sqd_bytes = 0;
         cur_test < sizeof(stride_tests) / sizeof(stride_tests[0]);
         cur_test++, sqd_bytes = 0) {
      to_sq_bytes = stride_tests[cur_test].startsz;
      OPENSSL_memset(digest.get(), 0, digest_length);
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
      while (sqd_bytes < digest_length) {
        // Clamp the final squeeze to the remaining output.
        if ((sqd_bytes + to_sq_bytes) > digest_length) {
          to_sq_bytes = digest_length - sqd_bytes;
        }
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes, to_sq_bytes));
        sqd_bytes += to_sq_bytes;
        to_sq_bytes = stride_tests[cur_test].incsz;
      }
      EXPECT_EQ(Bytes(digest.get(), digest_length),
                Bytes(digest_.data(), digest_length));
    }

    // Test Squeeze Exhaustive
    // Assert success when |EVP_DigestSqueeze| is called in all possible byte increments
    for (to_sq_bytes = 1; to_sq_bytes < digest_length; to_sq_bytes++) {
      OPENSSL_memset(digest.get(), 0, digest_length);
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), msg_.data(), msg_.size()));
      for (sqd_bytes = 0; sqd_bytes <= digest_length - to_sq_bytes;
           sqd_bytes += to_sq_bytes) {
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes, to_sq_bytes));
      }
      if ((digest_length - sqd_bytes) > 0) {
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest.get() + sqd_bytes,
                                      digest_length - sqd_bytes));
      }
      EXPECT_EQ(Bytes(digest.get(), digest_length),
                Bytes(digest_.data(), digest_length));
    }

    // Test Squeeze with random Input
    // Assert success when |EVP_DigestSqueeze| is called on a random message
    ASSERT_TRUE(RAND_bytes(random_bytes, RAND_BYTES));
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
    for (size_t i = 0; i < RAND_OUT_BYTES; i++) {
      ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + i, 1));
    }
    ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
    ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
    EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
              EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));

    // Test Squeeze with random Input
    // Assert success when |EVP_DigestSqueeze| is called on a random message
    // in set byte increments
    for (cur_test = 0, sqd_bytes = 0;
         cur_test < sizeof(stride_tests) / sizeof(stride_tests[0]);
         cur_test++, sqd_bytes = 0) {
      to_sq_bytes = stride_tests[cur_test].startsz;
      OPENSSL_memset(digest_stream.get(), 0, RAND_OUT_BYTES);
      OPENSSL_memset(digest_single_shot.get(), 0, RAND_OUT_BYTES);
      ASSERT_TRUE(RAND_bytes(random_bytes, RAND_BYTES));
      // Incremental Squeezes
      ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
      while (sqd_bytes < RAND_OUT_BYTES) {
        if ((sqd_bytes + to_sq_bytes) > RAND_OUT_BYTES) {
          to_sq_bytes = RAND_OUT_BYTES - sqd_bytes;
        }
        ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + sqd_bytes, to_sq_bytes));
        sqd_bytes += to_sq_bytes;
        to_sq_bytes = stride_tests[cur_test].incsz;
      }
      // Single-Shot Squeeze
      ASSERT_TRUE(EVP_DigestInit_ex(ctx.get(), algorithm, NULL));
      ASSERT_TRUE(EVP_DigestUpdate(ctx.get(), random_bytes, RAND_BYTES));
      ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
      EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
                EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));
    }

    // Test Final XOF / Squeeze without Update
    // Assert success: finalising and squeezing an empty message must
    // produce identical output.
    OPENSSL_memset(digest_single_shot.get(), 0, RAND_OUT_BYTES);
    OPENSSL_memset(digest_stream.get(), 0, RAND_OUT_BYTES);
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestFinalXOF(ctx.get(), digest_single_shot.get(), RAND_OUT_BYTES));
    ASSERT_TRUE(EVP_DigestInit(ctx.get(), algorithm));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get(), RAND_OUT_BYTES/2));
    ASSERT_TRUE(EVP_DigestSqueeze(ctx.get(), digest_stream.get() + RAND_OUT_BYTES/2,
                                  RAND_OUT_BYTES/2));
    EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_stream.get(), RAND_OUT_BYTES)),
              EncodeHex(bssl::MakeConstSpan(digest_single_shot.get(), RAND_OUT_BYTES)));
  }

 private:
  uint32_t len_;       // Message length in bits (NIST "Len" attribute).
  uint32_t out_len_;   // XOF output length in bits (NIST "Outputlen" attribute).
  std::vector<uint8_t> msg_;
  std::vector<uint8_t> digest_;
};
// Read the |key| attribute from |file_test| and convert it to an integer.
// Returns true on success; on parse failure, gtest's ParseInt32 reports the
// supplied message.
// NOTE(review): |out| is cast to int*, so this assumes T is a 32-bit
// integer type (it is uint32_t at both call sites in this file) — confirm
// before reusing with other widths.
template <typename T>
bool FileTestReadInt(FileTest *file_test, T *out, const std::string &key) {
  std::string s;
  return file_test->GetAttribute(&s, key) &&
         testing::internal::ParseInt32(
             testing::Message() << "The value " << s.data()
                                << " is not convertable to an integer.",
             s.data(), (int *)out);
}
// Parses one NIST test case from |t|. "Outputlen" (XOF vectors) and "Len"
// (fixed-output vectors) are optional; "Msg" and "MD" are required.
bool SHA3TestVector::ReadFromFileTest(FileTest *t) {
  if (t->HasAttribute("Outputlen") &&
      !FileTestReadInt(t, &out_len_, "Outputlen")) {
    return false;
  }
  if (t->HasAttribute("Len") && !FileTestReadInt(t, &len_, "Len")) {
    return false;
  }
  return t->GetBytes(&msg_, "Msg") && t->GetBytes(&digest_, "MD");
}
TEST(SHA3Test, NISTTestVectors) {
  // Each NIST response file is replayed through the Init/Update/Final EVP
  // APIs for the matching SHA3 variant.
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kTests[] = {
      {"crypto/fipsmodule/sha/testvectors/SHA3_224ShortMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256ShortMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384ShortMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512ShortMsg.txt", EVP_sha3_512},
      {"crypto/fipsmodule/sha/testvectors/SHA3_224LongMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256LongMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384LongMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512LongMsg.txt", EVP_sha3_512},
  };
  for (const auto &test : kTests) {
    FileTestGTest(test.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors(test.md());
    });
  }
}
TEST(SHA3Test, NISTTestVectors_SingleShot) {
  // Each NIST short-message file is replayed through the one-shot
  // |EVP_Digest| API for the matching SHA3 variant.
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kTests[] = {
      {"crypto/fipsmodule/sha/testvectors/SHA3_224ShortMsg.txt", EVP_sha3_224},
      {"crypto/fipsmodule/sha/testvectors/SHA3_256ShortMsg.txt", EVP_sha3_256},
      {"crypto/fipsmodule/sha/testvectors/SHA3_384ShortMsg.txt", EVP_sha3_384},
      {"crypto/fipsmodule/sha/testvectors/SHA3_512ShortMsg.txt", EVP_sha3_512},
  };
  for (const auto &test : kTests) {
    FileTestGTest(test.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SingleShot(test.md());
    });
  }
}
// Checks that |Keccak1600_Squeeze| writes exactly |out_len| bytes: a 0xff
// canary placed immediately after the requested output must be intact for
// a range of small and large (multi-block) output lengths.
TEST(KeccakInternalTest, SqueezeOutputBufferOverflow) {
  EVP_MD_unstable_sha3_enable(true);
  KECCAK1600_CTX ctx;
  std::vector<uint8_t> out;
  std::vector<uint8_t> canary(8);
  std::fill(canary.begin(), canary.end(), 0xff);
  const size_t out_lens[] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, (1 << 5), (1 << 16) + 1};
  for (auto out_len : out_lens) {
    EXPECT_TRUE(SHA3_Init(&ctx, SHA3_384_DIGEST_BITLENGTH));
    // The canary occupies the tail of the buffer, right past |out_len|.
    out.resize(out_len + canary.size());
    std::copy(canary.begin(), canary.end(), out.end() - canary.size());
    Keccak1600_Squeeze(ctx.A, out.data(), out_len, ctx.block_size, 1);
    EXPECT_TRUE(std::equal(out.end() - canary.size(), out.end(),
                           canary.begin()) == true);
  }
  EVP_MD_unstable_sha3_enable(false);
}
// Test x4 batched SHAKE against 4 consecutive SHAKE calls.
// Assert success when digest and digest_x4 values are equal.
TEST(SHAKETest_x4, RandomMessages) {
  KECCAK1600_CTX_x4 ctx;
  uint8_t random_in[BATCHED_x4][RAND_BYTES_x4];
  uint8_t digest[BATCHED_x4][RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE];
  uint8_t digest_x4[BATCHED_x4][RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE];
  // Test |SHAKE128_Init_x4|, |SHAKE128_Absorb_once_x4|, and |SHAKE128_Squeezeblocks_x4| functions.
  // Assert success when digest and digest_x4 values are equal.
  for (int i = 0; i < NUM_TESTS; i++) {
    // Compute four independent serial SHAKE128 digests as the reference.
    for (int j = 0; j < BATCHED_x4; j++) {
      OPENSSL_memset(digest[j], 0, RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE);
      OPENSSL_memset(digest_x4[j], 0, RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE);
      ASSERT_TRUE(RAND_bytes(random_in[j], RAND_BYTES_x4));
      ASSERT_TRUE(SHAKE128(random_in[j], RAND_BYTES_x4, digest[j],
                           RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE));
    }
    // Compute one batched x4 SHAKE128.
    ASSERT_TRUE(SHAKE128_Init_x4(&ctx));
    ASSERT_TRUE(SHAKE128_Absorb_once_x4(&ctx, random_in[0], random_in[1], random_in[2], random_in[3],
                                        RAND_BYTES_x4));
    ASSERT_TRUE(SHAKE128_Squeezeblocks_x4(digest_x4[0], digest_x4[1], digest_x4[2], digest_x4[3],
                                          &ctx, RAND_OUT_BLCKS));
    for (int j = 0; j < BATCHED_x4; j++) {
      EXPECT_EQ(Bytes(digest_x4[j], RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE),
                Bytes(digest[j], RAND_OUT_BLCKS * SHAKE128_BLOCKSIZE));
    }
  }
  // Test |SHAKE256_x4| function.
  // Assert success when digest and digest_x4 values are equal.
  for (int i = 0; i < NUM_TESTS; i++) {
    // Compute four independent serial SHAKE256 digests as the reference.
    for (int j = 0; j < BATCHED_x4; j++) {
      OPENSSL_memset(digest[j], 0, RAND_OUT_BLCKS);
      OPENSSL_memset(digest_x4[j], 0, RAND_OUT_BLCKS);
      ASSERT_TRUE(RAND_bytes(random_in[j], RAND_BYTES_x4));
      SHAKE256(random_in[j], RAND_BYTES_x4, digest[j], RAND_OUT_BLCKS);
    }
    // Compute one batched x4 SHAKE256.
    ASSERT_TRUE(SHAKE256_x4(random_in[0], random_in[1], random_in[2], random_in[3], RAND_BYTES_x4,
                            digest_x4[0], digest_x4[1], digest_x4[2], digest_x4[3], RAND_OUT_BLCKS));
    for (int j = 0; j < BATCHED_x4; j++) {
      EXPECT_EQ(EncodeHex(bssl::MakeConstSpan(digest_x4[j], RAND_OUT_BLCKS)),
                EncodeHex(bssl::MakeConstSpan(digest[j], RAND_OUT_BLCKS)));
    }
  }
}
TEST(SHAKETest, NISTTestVectors) {
  const struct {
    const char *path;
    const EVP_MD *(*md)(void);
  } kFiles[] = {
      {"crypto/fipsmodule/sha/testvectors/SHAKE128VariableOut.txt", EVP_shake128},
      {"crypto/fipsmodule/sha/testvectors/SHAKE256VariableOut.txt", EVP_shake256},
  };
  // Known-answer tests through the streaming and one-shot EVP APIs.
  for (const auto &file : kFiles) {
    FileTestGTest(file.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SHAKE(file.md());
    });
  }
  // Incremental squeeze tests over the same vectors.
  for (const auto &file : kFiles) {
    FileTestGTest(file.path, [&](FileTest *t) {
      SHA3TestVector test_vec;
      EXPECT_TRUE(test_vec.ReadFromFileTest(t));
      test_vec.NISTTestVectors_SHAKESqueeze(file.md());
    });
  }
}

View File

@@ -0,0 +1,663 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/sha.h>
#include <string.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// The 32-bit hash algorithms share a common byte-order neutral collector and
// padding function implementations that operate on unaligned data,
// ../digest/md32_common.h. SHA-512 is the only 64-bit hash algorithm, as of
// this writing, so there is no need for a common collector/padding
// implementation yet.
static int sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha);
// SHA384_Init sets |sha| to the SHA-384 initial hash value
// (FIPS 180-4, section 5.3.4). Always returns 1.
int SHA384_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0xcbbb9d5dc1059ed8), UINT64_C(0x629a292a367cd507),
      UINT64_C(0x9159015a3070dd17), UINT64_C(0x152fecd8f70e5939),
      UINT64_C(0x67332667ffc00b31), UINT64_C(0x8eb44a8768581511),
      UINT64_C(0xdb0c2e0d64f98fa7), UINT64_C(0x47b5481dbefa4fa4),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA384_DIGEST_LENGTH;
  return 1;
}
// SHA512_Init sets |sha| to the SHA-512 initial hash value
// (FIPS 180-4, section 5.3.5). Always returns 1.
int SHA512_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
      UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
      UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
      UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_DIGEST_LENGTH;
  return 1;
}
// SHA512_224_Init sets |sha| to the SHA-512/224 initial hash value
// (FIPS 180-4, section 5.3.6.1). Always returns 1.
int SHA512_224_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x8c3d37c819544da2), UINT64_C(0x73e1996689dcd4d6),
      UINT64_C(0x1dfab7ae32ff9c82), UINT64_C(0x679dd514582f9fcf),
      UINT64_C(0x0f6d2b697bd44da8), UINT64_C(0x77e36f7304c48942),
      UINT64_C(0x3f9d85a86a1d36c8), UINT64_C(0x1112e6ad91d692a1),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_224_DIGEST_LENGTH;
  return 1;
}
// SHA512_256_Init sets |sha| to the SHA-512/256 initial hash value
// (FIPS 180-4, section 5.3.6.2). Always returns 1.
int SHA512_256_Init(SHA512_CTX *sha) {
  static const uint64_t kIV[8] = {
      UINT64_C(0x22312194fc2bf72c), UINT64_C(0x9f555fa3c84c64c2),
      UINT64_C(0x2393b86b6f53b151), UINT64_C(0x963877195940eabd),
      UINT64_C(0x96283ee2a88effe3), UINT64_C(0xbe5e1e2553863992),
      UINT64_C(0x2b0199fc2c85b8aa), UINT64_C(0x0eb72ddc81c52ca2),
  };
  for (size_t i = 0; i < 8; i++) {
    sha->h[i] = kIV[i];
  }
  sha->Nl = 0;
  sha->Nh = 0;
  sha->num = 0;
  sha->md_len = SHA512_256_DIGEST_LENGTH;
  return 1;
}
// Every SHA-512 family variant shares the same chaining-state size, which
// is what allows them to share |sha512_init_from_state_impl| below.
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA384_CHAINING_LENGTH,
sha512_and_sha384_have_same_chaining_length)
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA512_224_CHAINING_LENGTH,
sha512_and_sha512_224_have_same_chaining_length)
OPENSSL_STATIC_ASSERT(SHA512_CHAINING_LENGTH==SHA512_256_CHAINING_LENGTH,
sha512_and_sha512_256_have_same_chaining_length)
// sha512_init_from_state_impl backs the SHA{384,512,512_224,512_256}
// _Init_from_state functions. It resumes a hash from a serialized chaining
// value |h| (always SHA512_CHAINING_LENGTH bytes, big-endian words) after |n|
// bits of input. Fails (returns zero) unless |n| is a whole number of
// 128-byte blocks, since the partial-block buffer is not serialized.
static int sha512_init_from_state_impl(SHA512_CTX *sha, int md_len,
                                       const uint8_t h[SHA512_CHAINING_LENGTH],
                                       uint64_t n) {
  if (n % ((uint64_t)SHA512_CBLOCK * 8) != 0) {
    // |n| is not a multiple of the block size in bits.
    return 0;
  }
  OPENSSL_memset(sha, 0, sizeof(SHA512_CTX));
  sha->md_len = md_len;
  for (size_t word = 0; word < SHA512_CHAINING_LENGTH / 8; word++) {
    sha->h[word] = CRYPTO_load_u64_be(h + 8 * word);
  }
  // At most 2^64 bits are representable here, so the high counter is zero.
  sha->Nh = 0;
  sha->Nl = n;
  return 1;
}
// SHA384_Init_from_state resumes a SHA-384 computation from a serialized
// chaining value |h| after |n| bits of input. Returns one on success, zero
// if |n| is not a multiple of the block size in bits.
int SHA384_Init_from_state(SHA512_CTX *sha,
                           const uint8_t h[SHA384_CHAINING_LENGTH],
                           uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA384_DIGEST_LENGTH, h, n);
}
// SHA512_Init_from_state resumes a SHA-512 computation from a serialized
// chaining value |h| after |n| bits of input. Returns one on success, zero
// if |n| is not a multiple of the block size in bits.
int SHA512_Init_from_state(SHA512_CTX *sha,
                           const uint8_t h[SHA512_CHAINING_LENGTH],
                           uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_DIGEST_LENGTH, h, n);
}
// SHA512_224_Init_from_state resumes a SHA-512/224 computation from a
// serialized chaining value |h| after |n| bits of input. Returns one on
// success, zero if |n| is not a multiple of the block size in bits.
int SHA512_224_Init_from_state(SHA512_CTX *sha,
                               const uint8_t h[SHA512_224_CHAINING_LENGTH],
                               uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_224_DIGEST_LENGTH, h, n);
}
// SHA512_256_Init_from_state resumes a SHA-512/256 computation from a
// serialized chaining value |h| after |n| bits of input. Returns one on
// success, zero if |n| is not a multiple of the block size in bits.
int SHA512_256_Init_from_state(SHA512_CTX *sha,
                               const uint8_t h[SHA512_256_CHAINING_LENGTH],
                               uint64_t n) {
  return sha512_init_from_state_impl(sha, SHA512_256_DIGEST_LENGTH, h, n);
}
// SHA384 writes the SHA-384 digest of |data| to |out| and returns |out|.
uint8_t *SHA384(const uint8_t *data, size_t len,
                uint8_t out[SHA384_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA384_Init(&ctx);
  ok = ok && SHA384_Update(&ctx, data, len);
  ok = ok && SHA384_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512 writes the SHA-512 digest of |data| to |out| and returns |out|.
uint8_t *SHA512(const uint8_t *data, size_t len,
                uint8_t out[SHA512_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_Init(&ctx);
  ok = ok && SHA512_Update(&ctx, data, len);
  ok = ok && SHA512_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512_224 writes the SHA-512/224 digest of |data| to |out| and returns
// |out|.
uint8_t *SHA512_224(const uint8_t *data, size_t len,
                    uint8_t out[SHA512_224_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_224_Init(&ctx);
  ok = ok && SHA512_224_Update(&ctx, data, len);
  ok = ok && SHA512_224_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
// SHA512_256 writes the SHA-512/256 digest of |data| to |out| and returns
// |out|.
uint8_t *SHA512_256(const uint8_t *data, size_t len,
                    uint8_t out[SHA512_256_DIGEST_LENGTH]) {
  SHA512_CTX ctx;
  // The FIPS service indicator is held locked so that only the overall
  // success of init+update+final flips it, not each sub-operation.
  FIPS_service_indicator_lock_state();
  int ok = SHA512_256_Init(&ctx);
  ok = ok && SHA512_256_Update(&ctx, data, len);
  ok = ok && SHA512_256_Final(out, &ctx);
  FIPS_service_indicator_unlock_state();
  if (ok) {
    FIPS_service_indicator_update_state();
  }
  // Scrub intermediate hash state from the stack.
  OPENSSL_cleanse(&ctx, sizeof(ctx));
  return out;
}
#if !defined(SHA512_ASM)
// Forward declaration of the compression-function dispatcher defined near the
// bottom of this file. When SHA512_ASM is set, the symbol is provided by
// assembly instead.
static void sha512_block_data_order(uint64_t state[8], const uint8_t *in,
                                    size_t num_blocks);
#endif
// SHA384_Final finishes the computation and writes SHA384_DIGEST_LENGTH
// bytes to |out|. Returns one on success, zero if |out| is NULL.
int SHA384_Final(uint8_t out[SHA384_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA384_Init|, which sets |sha->md_len|
  // to |SHA384_DIGEST_LENGTH|.
  assert(sha->md_len == SHA384_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA384_DIGEST_LENGTH, sha);
}
// SHA384_Update adds |len| bytes from |data| to the hash. SHA-384 shares the
// SHA-512 compression and buffering logic; only init and final differ.
int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_224_Update adds |len| bytes from |data| to the hash. SHA-512/224
// shares the SHA-512 compression and buffering logic.
int SHA512_224_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_224_Final finishes the computation and writes
// SHA512_224_DIGEST_LENGTH bytes to |out|. Returns one on success, zero if
// |out| is NULL.
int SHA512_224_Final(uint8_t out[SHA512_224_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA512_224_Init|, which sets
  // |sha->md_len| to |SHA512_224_DIGEST_LENGTH|.
  assert(sha->md_len == SHA512_224_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA512_224_DIGEST_LENGTH, sha);
}
// SHA512_256_Update adds |len| bytes from |data| to the hash. SHA-512/256
// shares the SHA-512 compression and buffering logic.
int SHA512_256_Update(SHA512_CTX *sha, const void *data, size_t len) {
  return SHA512_Update(sha, data, len);
}
// SHA512_256_Final finishes the computation and writes
// SHA512_256_DIGEST_LENGTH bytes to |out|. Returns one on success, zero if
// |out| is NULL.
int SHA512_256_Final(uint8_t out[SHA512_256_DIGEST_LENGTH], SHA512_CTX *sha) {
  // This function must be paired with |SHA512_256_Init|, which sets
  // |sha->md_len| to |SHA512_256_DIGEST_LENGTH|.
  assert(sha->md_len == SHA512_256_DIGEST_LENGTH);
  return sha512_final_impl(out, SHA512_256_DIGEST_LENGTH, sha);
}
// SHA512_Transform runs the compression function over exactly one 128-byte
// block, updating the chaining value in |c->h|. It does not touch the length
// counters or the partial-block buffer; callers handle padding themselves.
void SHA512_Transform(SHA512_CTX *c, const uint8_t block[SHA512_CBLOCK]) {
  sha512_block_data_order(c->h, block, 1);
}
// SHA512_Update feeds |len| bytes from |in_data| into the hash. Partial
// input is buffered in |c->p| until a full 128-byte block is available;
// whole blocks are compressed directly from the caller's buffer. Always
// returns one.
int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
  uint64_t l;
  uint8_t *p = c->p;
  const uint8_t *data = in_data;
  if (len == 0) {
    return 1;
  }
  // Update the 128-bit message bit counter Nh:Nl. The mask is a no-op on a
  // 64-bit value and kept for clarity.
  l = (c->Nl + (((uint64_t)len) << 3)) & UINT64_C(0xffffffffffffffff);
  if (l < c->Nl) {
    c->Nh++;  // carry out of the low word
  }
  if (sizeof(len) >= 8) {
    // On 64-bit platforms |len| << 3 can overflow; fold the lost high bits
    // of the bit count into Nh.
    c->Nh += (((uint64_t)len) >> 61);
  }
  c->Nl = l;
  if (c->num != 0) {
    // A partial block is pending; top it up from |data| first.
    size_t n = sizeof(c->p) - c->num;
    if (len < n) {
      // Still not a full block: stash the bytes and return.
      OPENSSL_memcpy(p + c->num, data, len);
      c->num += (unsigned int)len;
      return 1;
    } else {
      // Complete the buffered block and compress it.
      OPENSSL_memcpy(p + c->num, data, n), c->num = 0;
      len -= n;
      data += n;
      sha512_block_data_order(c->h, p, 1);
    }
  }
  if (len >= sizeof(c->p)) {
    // Compress all remaining whole blocks straight from |data|.
    sha512_block_data_order(c->h, data, len / sizeof(c->p));
    data += len;
    len %= sizeof(c->p);
    data -= len;  // |data| now points at the leftover tail
  }
  if (len != 0) {
    // Buffer the trailing partial block for the next call or final.
    OPENSSL_memcpy(p, data, len);
    c->num = (int)len;
  }
  return 1;
}
// SHA512_Final finishes the computation and writes |sha->md_len| bytes to
// |out|. Returns one on success, zero if |out| is NULL.
int SHA512_Final(uint8_t out[SHA512_DIGEST_LENGTH], SHA512_CTX *sha) {
  // Ideally we would assert |sha->md_len| is |SHA512_DIGEST_LENGTH| to match
  // the size hint, but calling code often pairs |SHA384_Init| with
  // |SHA512_Final| and expects |sha->md_len| to carry the size over.
  //
  // TODO(davidben): Add an assert and fix code to match them up.
  return sha512_final_impl(out, sha->md_len, sha);
}
// sha512_final_impl applies the FIPS 180-4 padding (0x80, zeros, then the
// 128-bit big-endian bit count), runs the final compression, and writes the
// first |md_len| bytes of the chaining value to |out| in big-endian order.
// Returns one on success, zero if |out| is NULL.
static int sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha) {
  uint8_t *p = sha->p;
  size_t n = sha->num;
  p[n] = 0x80;  // There always is a room for one
  n++;
  if (n > (sizeof(sha->p) - 16)) {
    // Not enough room in this block for the 16-byte length field: pad it
    // out, compress, and start a fresh all-padding block.
    OPENSSL_memset(p + n, 0, sizeof(sha->p) - n);
    n = 0;
    sha512_block_data_order(sha->h, p, 1);
  }
  // Zero-fill up to the length field, then append the bit count (Nh:Nl).
  OPENSSL_memset(p + n, 0, sizeof(sha->p) - 16 - n);
  CRYPTO_store_u64_be(p + sizeof(sha->p) - 16, sha->Nh);
  CRYPTO_store_u64_be(p + sizeof(sha->p) - 8, sha->Nl);
  sha512_block_data_order(sha->h, p, 1);
  if (out == NULL) {
    // TODO(davidben): This NULL check is absent in other low-level hash 'final'
    // functions and is one of the few places one can fail.
    return 0;
  }
  // Emit whole 8-byte words of the chaining value, big-endian.
  const size_t out_words = md_len / 8;
  assert(md_len % 8 == 0 || md_len == SHA512_224_DIGEST_LENGTH);
  for (size_t i = 0; i < out_words; i++) {
    CRYPTO_store_u64_be(out, sha->h[i]);
    out += 8;
  }
  // SHA-512 and SHA-512/256 are aligned to 8-byte words, SHA-512/224 is not.
  // If the digest size is not aligned to 8-byte words, we need to process the
  // non-word-aligned "trailer".
  if (md_len == SHA512_224_DIGEST_LENGTH) {
    uint64_t trailer;
    CRYPTO_store_u64_be(&trailer, sha->h[out_words]);
    OPENSSL_memcpy(out, &trailer, SHA512_224_DIGEST_LENGTH % 8);
  }
  FIPS_service_indicator_update_state();
  return 1;
}
// sha512_get_state_impl backs the SHA{384,512,512_224,512_256}_get_state
// functions. It serializes the chaining value into |out_h| (always
// SHA512_CHAINING_LENGTH bytes, big-endian words) and the processed bit
// count into |*out_n|. Fails (returns zero) if input is not block-aligned
// (the partial-block buffer is not serialized) or if more than 2^64 bits
// have been hashed.
static int sha512_get_state_impl(SHA512_CTX *ctx,
                                 uint8_t out_h[SHA512_CHAINING_LENGTH],
                                 uint64_t *out_n) {
  if (ctx->Nl % ((uint64_t)SHA512_CBLOCK * 8) != 0) {
    // The bit count is not a whole number of blocks.
    return 0;
  }
  if (ctx->Nh != 0) {
    // A non-zero high counter means more than 2^64 bits were processed,
    // which |*out_n| cannot represent.
    return 0;
  }
  for (size_t word = 0; word < SHA512_CHAINING_LENGTH / 8; word++) {
    CRYPTO_store_u64_be(out_h + 8 * word, ctx->h[word]);
  }
  *out_n = ctx->Nl;  // Nh is known to be zero here.
  return 1;
}
// SHA384_get_state serializes the SHA-384 chaining value and bit count.
// Returns one on success, zero if the state is not block-aligned or exceeds
// 2^64 bits.
int SHA384_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA384_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_get_state serializes the SHA-512 chaining value and bit count.
// Returns one on success, zero if the state is not block-aligned or exceeds
// 2^64 bits.
int SHA512_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_CHAINING_LENGTH],
                     uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_224_get_state serializes the SHA-512/224 chaining value and bit
// count. Returns one on success, zero if the state is not block-aligned or
// exceeds 2^64 bits.
int SHA512_224_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_224_CHAINING_LENGTH],
                         uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
// SHA512_256_get_state serializes the SHA-512/256 chaining value and bit
// count. Returns one on success, zero if the state is not block-aligned or
// exceeds 2^64 bits.
int SHA512_256_get_state(SHA512_CTX *ctx, uint8_t out_h[SHA512_256_CHAINING_LENGTH],
                         uint64_t *out_n) {
  return sha512_get_state_impl(ctx, out_h, out_n);
}
#if !defined(SHA512_ASM)
#if !defined(SHA512_ASM_NOHW)
// K512 holds the 80 SHA-512 round constants from FIPS 180-4, section 4.2.2.
static const uint64_t K512[80] = {
    UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd),
    UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc),
    UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019),
    UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118),
    UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe),
    UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2),
    UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1),
    UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694),
    UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3),
    UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65),
    UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483),
    UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5),
    UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210),
    UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4),
    UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725),
    UINT64_C(0x06ca6351e003826f), UINT64_C(0x142929670a0e6e70),
    UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926),
    UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df),
    UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8),
    UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b),
    UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001),
    UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30),
    UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910),
    UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8),
    UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53),
    UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8),
    UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb),
    UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3),
    UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60),
    UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec),
    UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9),
    UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b),
    UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207),
    UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178),
    UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6),
    UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b),
    UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493),
    UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c),
    UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a),
    UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817),
};
// The SHA-512 logical functions from FIPS 180-4, section 4.1.3:
// Sigma0/Sigma1 mix the working state, sigma0/sigma1 expand the message
// schedule, Ch is the bitwise choice function and Maj the bitwise majority.
#define Sigma0(x)                                        \
  (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 34) ^ \
   CRYPTO_rotr_u64((x), 39))
#define Sigma1(x)                                        \
  (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \
   CRYPTO_rotr_u64((x), 41))
#define sigma0(x) \
  (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7))
#define sigma1(x) \
  (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6))
#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
// This code should give better results on 32-bit CPU with less than
// ~24 registers, both size and performance wise...
//
// The working variables b,c,d,f,g,h live in a sliding window |F| over the
// array |X|; each round decrements |F|, so writing F[0]/F[4]/F[8] records
// this round's a, e and message word while the previous seven values remain
// addressable at fixed offsets. Only |A| and |E| are kept in locals.
static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in,
                                         size_t num) {
  uint64_t A, E, T;
  uint64_t X[9 + 80], *F;
  int i;
  while (num--) {
    F = X + 80;
    // Load the chaining value; b..d and f..h start in the window.
    A = state[0];
    F[1] = state[1];
    F[2] = state[2];
    F[3] = state[3];
    E = state[4];
    F[5] = state[5];
    F[6] = state[6];
    F[7] = state[7];
    // Rounds 0-15 consume the message block directly.
    for (i = 0; i < 16; i++, F--) {
      T = CRYPTO_load_u64_be(in + i * 8);
      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }
    // Rounds 16-79 expand the message schedule from earlier words, which sit
    // at fixed offsets behind the sliding window.
    for (; i < 80; i++, F--) {
      T = sigma0(F[8 + 16 - 1]);
      T += sigma1(F[8 + 16 - 14]);
      T += F[8 + 16] + F[8 + 16 - 9];
      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }
    // Fold the final working variables back into the chaining value.
    state[0] += A;
    state[1] += F[1];
    state[2] += F[2];
    state[3] += F[3];
    state[4] += E;
    state[5] += F[5];
    state[6] += F[6];
    state[7] += F[7];
    in += 16 * 8;
  }
}
#else
// ROUND_00_15 performs one SHA-512 round; |T1| must already hold this
// round's message-schedule word W[i] on entry.
#define ROUND_00_15(i, a, b, c, d, e, f, g, h)   \
  do {                                           \
    T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \
    h = Sigma0(a) + Maj(a, b, c);                \
    d += T1;                                     \
    h += T1;                                     \
  } while (0)
// ROUND_16_80 computes W[i+j] in-place in the 16-word circular buffer |X|
// and then performs the round.
#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X)   \
  do {                                                 \
    s0 = X[(j + 1) & 0x0f];                            \
    s0 = sigma0(s0);                                   \
    s1 = X[(j + 14) & 0x0f];                           \
    s1 = sigma1(s1);                                   \
    T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \
    ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
  } while (0)
// Portable C compression function. The 80 rounds are unrolled in groups of
// eight so the working variables rotate through the argument list instead of
// being shuffled at runtime.
static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in,
                                         size_t num) {
  uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
  uint64_t X[16];
  int i;
  while (num--) {
    // Load the chaining value into the working variables.
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];
    // Rounds 0-15: message words are read big-endian from the input block.
    T1 = X[0] = CRYPTO_load_u64_be(in);
    ROUND_00_15(0, a, b, c, d, e, f, g, h);
    T1 = X[1] = CRYPTO_load_u64_be(in + 8);
    ROUND_00_15(1, h, a, b, c, d, e, f, g);
    T1 = X[2] = CRYPTO_load_u64_be(in + 2 * 8);
    ROUND_00_15(2, g, h, a, b, c, d, e, f);
    T1 = X[3] = CRYPTO_load_u64_be(in + 3 * 8);
    ROUND_00_15(3, f, g, h, a, b, c, d, e);
    T1 = X[4] = CRYPTO_load_u64_be(in + 4 * 8);
    ROUND_00_15(4, e, f, g, h, a, b, c, d);
    T1 = X[5] = CRYPTO_load_u64_be(in + 5 * 8);
    ROUND_00_15(5, d, e, f, g, h, a, b, c);
    T1 = X[6] = CRYPTO_load_u64_be(in + 6 * 8);
    ROUND_00_15(6, c, d, e, f, g, h, a, b);
    T1 = X[7] = CRYPTO_load_u64_be(in + 7 * 8);
    ROUND_00_15(7, b, c, d, e, f, g, h, a);
    T1 = X[8] = CRYPTO_load_u64_be(in + 8 * 8);
    ROUND_00_15(8, a, b, c, d, e, f, g, h);
    T1 = X[9] = CRYPTO_load_u64_be(in + 9 * 8);
    ROUND_00_15(9, h, a, b, c, d, e, f, g);
    T1 = X[10] = CRYPTO_load_u64_be(in + 10 * 8);
    ROUND_00_15(10, g, h, a, b, c, d, e, f);
    T1 = X[11] = CRYPTO_load_u64_be(in + 11 * 8);
    ROUND_00_15(11, f, g, h, a, b, c, d, e);
    T1 = X[12] = CRYPTO_load_u64_be(in + 12 * 8);
    ROUND_00_15(12, e, f, g, h, a, b, c, d);
    T1 = X[13] = CRYPTO_load_u64_be(in + 13 * 8);
    ROUND_00_15(13, d, e, f, g, h, a, b, c);
    T1 = X[14] = CRYPTO_load_u64_be(in + 14 * 8);
    ROUND_00_15(14, c, d, e, f, g, h, a, b);
    T1 = X[15] = CRYPTO_load_u64_be(in + 15 * 8);
    ROUND_00_15(15, b, c, d, e, f, g, h, a);
    // Rounds 16-79: message schedule expansion, 16 rounds per iteration.
    for (i = 16; i < 80; i += 16) {
      ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
      ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X);
      ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X);
      ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X);
      ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X);
      ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X);
      ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X);
      ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X);
      ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X);
      ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X);
      ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X);
      ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X);
      ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X);
      ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X);
      ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X);
      ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
    }
    // Fold the working variables back into the chaining value.
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
    in += 16 * 8;
  }
}
#endif
#endif  // !SHA512_ASM_NOHW
// sha512_block_data_order dispatches to the fastest compiled-in compression
// implementation — hardware extensions, then AVX, then NEON, then the
// portable C fallback. |num| is the number of 128-byte blocks at |data|.
static void sha512_block_data_order(uint64_t state[8], const uint8_t *data,
                                    size_t num) {
#if defined(SHA512_ASM_HW)
  if (sha512_hw_capable()) {
    sha512_block_data_order_hw(state, data, num);
    return;
  }
#endif
#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
  if (sha512_avx_capable()) {
    sha512_block_data_order_avx(state, data, num);
    return;
  }
#endif
#if defined(SHA512_ASM_NEON)
  if (CRYPTO_is_NEON_capable()) {
    sha512_block_data_order_neon(state, data, num);
    return;
  }
#endif
  sha512_block_data_order_nohw(state, data, num);
}
#endif  // !SHA512_ASM
// Scrub the round-helper macros so their short names cannot collide with any
// code compiled after this file.
#undef Sigma0
#undef Sigma1
#undef sigma0
#undef sigma1
#undef Ch
#undef Maj
#undef ROUND_00_15
#undef ROUND_16_80
// ---------------------------------------------------------------------------
// NOTE(review): an extraction artifact was removed here ("View File" and the
// diff hunk header "@@ -0,0 +1,119 @@"). Everything below appears to be a
// separate C++ ABI-test source file concatenated into this one — confirm and
// split into its own file.
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/sha.h>
#include <vector>
#include <gtest/gtest.h>
#include "../../test/abi_test.h"
#include "internal.h"
#if defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC)
// Runs every compiled-in SHA-1 block function under the ABI checker for 1, 2,
// 4 and 8 blocks of zeros. Which functions exist depends on the platform's
// SHA1_ASM_* macros; runtime capability checks gate the CPU-specific ones.
TEST(SHATest, SHA1ABI) {
  SHA_CTX ctx;
  SHA1_Init(&ctx);
  static const uint8_t kBuf[SHA_CBLOCK * 8] = {0};
  for (size_t blocks : {1, 2, 4, 8}) {
#if defined(SHA1_ASM)
    CHECK_ABI(sha1_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA1_ASM_HW)
    if (sha1_hw_capable()) {
      CHECK_ABI(sha1_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha1_avx2_capable()) {
      CHECK_ABI(sha1_block_data_order_avx2, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha1_avx_capable()) {
      CHECK_ABI(sha1_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_SSSE3)
    if (sha1_ssse3_capable()) {
      CHECK_ABI(sha1_block_data_order_ssse3, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha1_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA1_ASM_NOHW)
    CHECK_ABI(sha1_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
// Runs every compiled-in SHA-256 block function under the ABI checker for 1,
// 2, 4 and 8 blocks of zeros, gated by the platform's SHA256_ASM_* macros and
// runtime capability checks.
TEST(SHATest, SHA256ABI) {
  SHA256_CTX ctx;
  SHA256_Init(&ctx);
  static const uint8_t kBuf[SHA256_CBLOCK * 8] = {0};
  for (size_t blocks : {1, 2, 4, 8}) {
#if defined(SHA256_ASM)
    CHECK_ABI(sha256_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA256_ASM_HW)
    if (sha256_hw_capable()) {
      CHECK_ABI(sha256_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha256_avx_capable()) {
      CHECK_ABI(sha256_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_SSSE3)
    if (sha256_ssse3_capable()) {
      CHECK_ABI(sha256_block_data_order_ssse3, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha256_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA256_ASM_NOHW)
    CHECK_ABI(sha256_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
// Runs every compiled-in SHA-512 block function under the ABI checker for 1,
// 2, 3 and 4 blocks of zeros, gated by the platform's SHA512_ASM_* macros and
// runtime capability checks.
TEST(SHATest, SHA512ABI) {
  SHA512_CTX ctx;
  SHA512_Init(&ctx);
  static const uint8_t kBuf[SHA512_CBLOCK * 4] = {0};
  for (size_t blocks : {1, 2, 3, 4}) {
#if defined(SHA512_ASM)
    CHECK_ABI(sha512_block_data_order, ctx.h, kBuf, blocks);
#endif
#if defined(SHA512_ASM_HW)
    if (sha512_hw_capable()) {
      CHECK_ABI(sha512_block_data_order_hw, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_AVX) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
    if (sha512_avx_capable()) {
      CHECK_ABI(sha512_block_data_order_avx, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_NEON)
    if (CRYPTO_is_NEON_capable()) {
      CHECK_ABI(sha512_block_data_order_neon, ctx.h, kBuf, blocks);
    }
#endif
#if defined(SHA512_ASM_NOHW)
    CHECK_ABI(sha512_block_data_order_nohw, ctx.h, kBuf, blocks);
#endif
  }
}
#endif // defined(SUPPORTS_ABI_TEST) && !defined(SHA1_ALTIVEC)