chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,257 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
// BN_add computes r = a + b, handling all four sign combinations by
// reducing them to an unsigned add or an unsigned subtract of magnitudes.
// Returns one on success and zero on allocation failure.
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  const int a_neg = a->neg;
  //  a +  b    a+b
  //  a + -b    a-b
  // -a +  b    b-a
  // -a + -b  -(a+b)
  if (!(a_neg ^ b->neg)) {
    // Same sign: |r| = |a| + |b| and r keeps the shared sign. The sign is
    // assigned unconditionally, matching the original control flow.
    int ok = BN_uadd(r, a, b);
    r->neg = a_neg;
    return ok;
  }
  // Opposite signs: the result is a difference of magnitudes. Arrange for
  // the subtraction to be a - b with a the non-negative operand.
  if (a_neg) {
    const BIGNUM *swap = a;
    a = b;
    b = swap;
  }
  if (BN_ucmp(a, b) < 0) {
    // |a| < |b|, so a - b is negative: compute |b| - |a| and negate.
    if (!BN_usub(r, b, a)) {
      return 0;
    }
    r->neg = 1;
  } else {
    if (!BN_usub(r, a, b)) {
      return 0;
    }
    r->neg = 0;
  }
  return 1;
}
// bn_uadd_consttime sets r = |a| + |b| without minimizing the result's
// width. Only the (public) widths of the inputs influence timing, not the
// word values themselves.
int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  // Normalize so |a| is the wider operand; widths are public.
  if (a->width < b->width) {
    const BIGNUM *swap = a;
    a = b;
    b = swap;
  }
  const int max_width = a->width;
  const int min_width = b->width;
  if (!bn_wexpand(r, max_width + 1)) {
    return 0;
  }
  r->width = max_width + 1;
  // Add the overlapping words, then propagate the carry through the
  // remaining high words of |a|.
  BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min_width);
  for (int j = min_width; j < max_width; j++) {
    r->d[j] = CRYPTO_addc_w(a->d[j], 0, carry, &carry);
  }
  // The final carry becomes the (possibly zero) extra top word.
  r->d[max_width] = carry;
  return 1;
}
// BN_uadd sets r = |a| + |b| and normalizes r to its minimal width.
// Returns one on success and zero on allocation failure.
int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  if (bn_uadd_consttime(r, a, b)) {
    bn_set_minimal_width(r);
    return 1;
  }
  return 0;
}
// BN_add_word adds the single word w to a in place. Returns one on success
// and zero on allocation failure.
int BN_add_word(BIGNUM *a, BN_ULONG w) {
  // Adding zero is a no-op.
  if (w == 0) {
    return 1;
  }
  // a == 0: the result is simply w.
  if (BN_is_zero(a)) {
    return BN_set_word(a, w);
  }
  // a < 0: compute |a| - w via BN_sub_word (which may itself flip the
  // sign when w > |a|), then negate unless the result is zero.
  if (a->neg) {
    a->neg = 0;
    int ok = BN_sub_word(a, w);
    if (!BN_is_zero(a)) {
      a->neg = !a->neg;
    }
    return ok;
  }
  // Positive a: ripple the carry word-by-word until it is absorbed or we
  // run off the top of a.
  int i = 0;
  while (w != 0 && i < a->width) {
    BN_ULONG sum = a->d[i] + w;
    a->d[i] = sum;
    // The carry out is one exactly when the addition wrapped.
    w = (w > sum) ? 1 : 0;
    i++;
  }
  // A carry survived past the top word; grow a by one word to hold it.
  if (w != 0 && i == a->width) {
    if (!bn_wexpand(a, a->width + 1)) {
      return 0;
    }
    a->width++;
    a->d[i] = w;
  }
  return 1;
}
// BN_sub computes r = a - b, reducing signed subtraction to unsigned
// operations on magnitudes. Returns one on success and zero on error
// (allocation failure).
int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  //  a -  b    a-b
  //  a - -b    a+b
  // -a -  b  -(a+b)
  // -a - -b    b-a
  int use_add = 0;
  int add_sign = 0;
  if (a->neg) {
    if (b->neg) {
      // -a - -b == b - a: swap operands and use the subtraction path.
      const BIGNUM *swap = a;
      a = b;
      b = swap;
    } else {
      // -a - b == -(a + b).
      use_add = 1;
      add_sign = 1;
    }
  } else if (b->neg) {
    // a - -b == a + b.
    use_add = 1;
    add_sign = 0;
  }
  if (use_add) {
    if (!BN_uadd(r, a, b)) {
      return 0;
    }
    r->neg = add_sign;
    return 1;
  }
  // Subtraction path: compute |a| - |b|, negating when |a| < |b|.
  if (BN_ucmp(a, b) < 0) {
    if (!BN_usub(r, b, a)) {
      return 0;
    }
    r->neg = 1;
  } else {
    if (!BN_usub(r, a, b)) {
      return 0;
    }
    r->neg = 0;
  }
  return 1;
}
// bn_usub_consttime sets r = |a| - |b| without minimizing the result's
// width. It fails with BN_R_ARG2_LT_ARG3 if |a| < |b|. Only the (public)
// widths influence timing, not the word values.
int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
// |b| may have more words than |a| given non-minimal inputs, but all words
// beyond |a->width| must then be zero.
int b_width = b->width;
if (b_width > a->width) {
if (!bn_fits_in_words(b, a->width)) {
OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3);
return 0;
}
// All of |b|'s extra words are zero, so only the low words matter.
b_width = a->width;
}
if (!bn_wexpand(r, a->width)) {
return 0;
}
// Subtract the overlapping words, then propagate the borrow through the
// remaining high words of |a|.
BN_ULONG borrow = bn_sub_words(r->d, a->d, b->d, b_width);
for (int i = b_width; i < a->width; i++) {
r->d[i] = CRYPTO_subc_w(a->d[i], 0, borrow, &borrow);
}
// A borrow out of the top word means |a| < |b|, which is an error here.
if (borrow) {
OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3);
return 0;
}
r->width = a->width;
r->neg = 0;
return 1;
}
// BN_usub sets r = |a| - |b| and normalizes r to its minimal width.
// Fails (leaving an error on the queue) if |a| < |b|.
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
  if (bn_usub_consttime(r, a, b)) {
    bn_set_minimal_width(r);
    return 1;
  }
  return 0;
}
// BN_sub_word subtracts the single word w from a in place, flipping the
// sign when the result crosses zero. Returns one on success and zero on
// error.
int BN_sub_word(BIGNUM *a, BN_ULONG w) {
int i;
// degenerate case: w is zero
if (!w) {
return 1;
}
// degenerate case: a is zero, so the result is exactly -w
if (BN_is_zero(a)) {
i = BN_set_word(a, w);
if (i != 0) {
BN_set_negative(a, 1);
}
return i;
}
// handle 'a' when negative: -|a| - w == -(|a| + w)
if (a->neg) {
a->neg = 0;
i = BN_add_word(a, w);
a->neg = 1;
return i;
}
// Single-word |a| smaller than w: the difference flips sign.
if ((bn_minimal_width(a) == 1) && (a->d[0] < w)) {
a->d[0] = w - a->d[0];
a->neg = 1;
return 1;
}
i = 0;
// Ripple the borrow upward. When a->d[i] < w the subtraction wraps
// (well-defined for unsigned BN_ULONG) and the borrow (w = 1) moves to the
// next word. Termination relies on |a| >= w at this point, so some higher
// word is nonzero and the loop must hit the >= case.
for (;;) {
if (a->d[i] >= w) {
a->d[i] -= w;
break;
} else {
a->d[i] -= w;
i++;
w = 1;
}
}
// Drop a top word that became zero to keep the representation minimal.
if ((a->d[i] == 0) && (i == (a->width - 1))) {
a->width--;
}
return 1;
}

View File

@@ -0,0 +1,733 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
# The first two arguments should always be the flavour and output file path.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# For a real flavour, pipe the generated source through arm-xlate.pl so it
# is translated for the target assembler; a "void" (or empty) flavour writes
# the template to the output file verbatim.
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
} else {
open OUT,">$output";
*STDOUT=*OUT;
}
# Register allocation for the integer-only bn_mul_mont_nohw code below.
# These Perl variables are interpolated into the $code template; several
# physical registers are reused for different values in different phases
# (e.g. r2 holds the bp argument, then b[i], and finally rp).
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch armv7-a
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global bn_mul_mont_nohw
.type bn_mul_mont_nohw,%function
.align 5
bn_mul_mont_nohw:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont_nohw,.-bn_mul_mont_nohw
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global bn_mul8x_mont_neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___
# Post-process the accumulated assembly line by line: evaluate any
# backquoted Perl expressions, map the q<N>#lo/q<N>#hi pseudo-register
# notation onto the underlying d registers, and (when neither applied)
# rewrite return sequences so the result can still assemble for ARMv4.
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,574 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for the x86 (i386) bignum primitive routines. Each sub below
# emits one assembly function via the perlasm x86asm.pl framework; this
# top-level driver wires up the output file and emits them all.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# SSE2 fast paths are always generated; they are gated at runtime on the
# CPUID bit read from OPENSSL_ia32cap_P.
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# Emits bn_mul_add_words(r, a, num, w): r[i] += a[i]*w with carry
# propagation, returning the final carry word. Generates an SSE2 path
# (8-way unrolled plus a 1-word loop, selected at runtime via CPUID bit 26)
# followed by a classic mul/adc fallback.
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
# 8-way unrolled SSE2 body: each iteration consumes a[0..7]/r[0..7] and
# carries a 64-bit running sum in mm1 whose high half is the carry.
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
# One-word-at-a-time SSE2 loop for the remaining (num mod 8) words.
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul/adc.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
# Fully unrolled tail for the remaining 1..7 words.
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
# Emits bn_mul_words(r, a, num, w): r[i] = a[i]*w with carry propagation,
# returning the final carry word. Same dual SSE2/integer structure as
# bn_mul_add_words, but without adding in the previous r[i].
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul/adc.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
# Fully unrolled tail for the remaining 1..7 words.
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
# Emits bn_sqr_words(r, a, num): r[2*i], r[2*i+1] = low/high halves of
# a[i]*a[i]. No cross-word carries are needed, so each word is independent.
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# Integer fallback path, 8-way unrolled mul.
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
# Fully unrolled tail for the remaining 1..7 words.
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
# Emits bn_div_words(h, l, d): returns the 32-bit quotient of the 64-bit
# value h:l (edx:eax) divided by d, using a single DIV instruction.
# NOTE(review): DIV raises #DE when the quotient does not fit in 32 bits —
# presumably callers guarantee h < d; verify against the callers.
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
# Emits bn_add_words(r, a, b, num): r[i] = a[i] + b[i] with carry
# propagation, returning the final carry (in eax, which is also $c).
# 8-way unrolled main loop plus a fully unrolled 1..7-word tail.
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
# The incoming carry word $c is materialized into the CPU carry flag by
# the add/mov/adc sequence, then combined with a[i] + b[i].
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
# Emits bn_sub_words(r, a, b, num): r[i] = a[i] - b[i] with borrow
# propagation, returning the final borrow (in eax, which is also $c).
# Structurally the mirror of bn_add_words with sub in place of add.
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
# The incoming borrow word $c is materialized into the CPU carry flag by
# the sub/mov/adc sequence, then combined with a[i] - b[i].
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env perl
# Copyright (c) 2023, Google Inc.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for the AArch64 implementations of bn_add_words and
# bn_sub_words (multi-word add/sub with carry/borrow propagation).  The
# assembly is written once below and piped through arm-xlate.pl so the
# same text serves every output flavour.
use strict;
my $flavour = shift;
my $output = shift;
# A single file-looking argument means only the output path was given.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Locate arm-xlate.pl next to this script or in the shared perlasm tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Route everything we print through the translator.
open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;
# x0-x3 carry the C arguments; x4-x8 are scratch registers.
my ($rp, $ap, $bp, $num) = ("x0", "x1", "x2", "x3");
my ($a0, $a1, $b0, $b1, $num_pairs) = ("x4", "x5", "x6", "x7", "x8");
# NOTE: the heredoc below is emitted verbatim (post-xlate); its contents,
# including the '#' comments, are part of the generated file and must not
# be edited casually.
my $code = <<____;
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.align 4
bn_add_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Ladd_tail
.Ladd_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
adcs $a0, $a0, $b0
adcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Ladd_loop
.Ladd_tail:
cbz $num, .Ladd_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
adcs $a0, $a0, $b0
str $a0, [$rp], #8
.Ladd_exit:
cset x0, cs
ret
.cfi_endproc
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.align 4
bn_sub_words:
.cfi_startproc
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# at a time. Split $num = 2 * $num_pairs + $num. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr $num_pairs, $num, #1
and $num, $num, #1
cbz $num_pairs, .Lsub_tail
.Lsub_loop:
ldp $a0, $a1, [$ap], #16
ldp $b0, $b1, [$bp], #16
sub $num_pairs, $num_pairs, #1
sbcs $a0, $a0, $b0
sbcs $a1, $a1, $b1
stp $a0, $a1, [$rp], #16
cbnz $num_pairs, .Lsub_loop
.Lsub_tail:
cbz $num, .Lsub_exit
ldr $a0, [$ap], #8
ldr $b0, [$bp], #8
sbcs $a0, $a0, $b0
str $a0, [$rp], #8
.Lsub_exit:
cset x0, cc
ret
.cfi_endproc
.size bn_sub_words,.-bn_sub_words
____
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Generator for x86 Comba-style multiplication and squaring routines
# (bn_mul_comba4/8, bn_sqr_comba4/8) via the perlasm x86 framework.
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Make x86asm.pl (and its flavour backends) reachable from this script's dir.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# Emit the four exported functions; the second argument is the fixed
# operand length in 32-bit words.
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
# Emit one Comba multiply step: (c2,c1,c0) += a[$ai]*b[$bi], interleaved
# with the loads/stores needed to pipeline the surrounding column loop.
# $pos selects the variant:
#   -1 : eax/edx already hold a[$ai]/b[$bi]; no extra loads or stores
#    0 : additionally pre-load the next operands a[$na], b[$nb]
#    1 : column complete - store r[$i] and pre-load operands for the next one
#    2 : final column - store r[$i] only
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
###
&adc($c2,0);
# when pos > 0 this step finishes a result column, so store the word
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
}
# Emit one Comba squaring step for a diagonal term:
# (c2,c1,c0) += a[$ai]*a[$bi].  Used when the term appears only once in
# the square (i.e. $ai == $bi for the actual multiply that is emitted as
# a `mul eax`).  $pos and $na/$nb have the same meaning as in mul_add_c.
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
# Square when the indices coincide, otherwise a plain multiply.
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# when pos > 0 this step finishes a result column, so store the word
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
# Emit one Comba squaring step for an off-diagonal term, which occurs
# twice in the square: (c2,c1,c0) += 2*a[$ai]*a[$bi].  The 64-bit product
# in edx:eax is doubled in place first (any overflow of the doubling is
# caught into $c2 via the adc), then accumulated.
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
# Double the product: edx:eax = 2*a[ai]*a[bi], overflow bit into $c2.
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
# Emit an x86 Comba (product-scanning) multiplication r = a*b for a fixed
# operand size of $num 32-bit words.  The 2*$num-1 result columns are
# produced in order; within column $i, all partial products a[ai]*b[bi]
# with ai+bi == $i are accumulated into the rotating register triple
# ($c0,$c1,$c2).  ($as,$ae,$bs,$be) track the moving start/end of the
# diagonal as $i advances.
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word of a
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first word of b
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
# $v: 0 = mid-column step, 1 = last step of a column,
# 2 = last step of the final column.
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
# ($na,$nb): indices of the operands to pre-load for the next step.
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
# Rotate the accumulator triple for the next column.
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
# Advance the diagonal window: widen while $i < $num-1, then shrink.
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
# Emit an x86 Comba (product-scanning) squaring r = a*a for a fixed
# operand size of $num 32-bit words.  Like bn_mul_comba, but each column
# only walks half the diagonal: off-diagonal terms are emitted once via
# sqr_add_c2 (which doubles the product), diagonal terms via sqr_add_c.
sub bn_sqr_comba
{
local($name,$num)=@_;
# NOTE(review): @_ still holds ($name,$num) here, so $r/$a briefly alias
# them; both are unconditionally overwritten with register names below,
# so this behaves as a plain local declaration.
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
# $v: non-zero once the index pair crosses the diagonal midpoint,
# meaning this is the last step emitted for column $i.
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
# ($na,$nb): operand indices to pre-load for the following step.
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
# Diagonal term counted once; off-diagonal term counted twice.
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
# Rotate the accumulator triple and move on to the next column.
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
# Advance the diagonal window: widen while $i < $num-1, then shrink.
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}

View File

@@ -0,0 +1,698 @@
# Copyright 2020-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2020, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov.
# Special thanks to Ilya Albrekht for his valuable hints.
# Intel Corporation
#
# December 2020
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+----------------------+--------------+-------------|
# |         | OpenSSL 3.0.0-alpha9 | this         | Unit        |
# |---------+----------------------+--------------+-------------|
# | rsa2048 | 2 127 659            | 1 015 625    | cycles/sign |
# |         | 611                  | 1280 / +109% | sign/s      |
# |---------+----------------------+--------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Assume AVX512IFMA support until one of the probes below says otherwise.
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Probe the GNU assembler version (>= 2.26 understands AVX512IFMA).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
# Probe nasm on Windows builds (>= 2.11.8 required).
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
# Probe clang/LLVM (and the differently-numbered Apple clang) versions.
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0) = ("%ymm3",map("%ymm$_",(16..19)));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1) = ("%ymm4",map("%ymm$_",(20..23)));
# Registers mapping for normalization.
my ($T0,$T0h,$T1,$T1h,$T2) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (25..26)));
# Append to $code one iteration of the 52x20 almost-Montgomery
# multiplication: fold b[i] into the vector accumulators $_R0..$_R2
# (five ymm registers = 20 x 52-bit limbs) plus the scalar low-limb
# accumulator $_acc, using AVX512IFMA vpmadd52{l,h}uq.
sub amm52x20_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_k0) = @_;
# xmm view of $_R0, used to extract its low qword with vmovq.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $zero, $_R2
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[bcd]x
# Append to $code the carry-normalization sequence: fold $_acc back into
# the vector accumulators and propagate all inter-limb carries so every
# qword of $_R0..$_R2 holds a normalized 52-bit limb.  Carry-propagation
# masks are merged into bytes so a scalar add-with-carry chain can ripple
# them across all 20 limbs at once.
sub amm52x20_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of R0..R2
# Save them to LSB of QWs in T0..T2
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
# "Shift left" T0..T2 by 1 QW
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..R2 QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
# Sum R0..R2 with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
# Now handle carry bits from this addition
# Get mask of QWs which 52-bit parts overflow...
vpcmpuq \$6, .Lmask52x4(%rip), $_R0, %k1 # OP=nle (i.e. gt)
vpcmpuq \$6, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$6, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$6, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$6, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r14d # k1
kmovb %k2, %r13d # k1h
kmovb %k3, %r12d # k2
kmovb %k4, %r11d # k2h
kmovb %k5, %r10d # k3
# ...or saturated
vpcmpuq \$0, .Lmask52x4(%rip), $_R0, %k1 # OP=eq
vpcmpuq \$0, .Lmask52x4(%rip), $_R0h, %k2
vpcmpuq \$0, .Lmask52x4(%rip), $_R1, %k3
vpcmpuq \$0, .Lmask52x4(%rip), $_R1h, %k4
vpcmpuq \$0, .Lmask52x4(%rip), $_R2, %k5
kmovb %k1, %r9d # k4
kmovb %k2, %r8d # k4h
kmovb %k3, %ebx # k5
kmovb %k4, %ecx # k5h
kmovb %k5, %edx # k6
# Get mask of QWs where carries shall be propagated to.
# Merge 4-bit masks to 8-bit values to use add with carry.
shl \$4, %r13b
or %r13b, %r14b
shl \$4, %r11b
or %r11b, %r12b
add %r14b, %r14b
adc %r12b, %r12b
adc %r10b, %r10b
shl \$4, %r8b
or %r8b,%r9b
shl \$4, %cl
or %cl, %bl
add %r9b, %r14b
adc %bl, %r12b
adc %dl, %r10b
xor %r9b, %r14b
xor %bl, %r12b
xor %dl, %r10b
kmovb %r14d, %k1
shr \$4, %r14b
kmovb %r14d, %k2
kmovb %r12d, %k3
shr \$4, %r12b
kmovb %r12d, %k4
kmovb %r10d, %k5
# Add carries according to the obtained mask
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x20_x1_ifma256
.type rsaz_amm52x20_x1_ifma256,\@function,5
.align 32
rsaz_amm52x20_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x1_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 20 digits unrolled by 4
mov \$5, $iter
.align 32
.Lloop5:
___
foreach my $idx (0..3) {
&amm52x20_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop5
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
# const BN_ULONG m[2][20],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x20_x2_ifma256
.type rsaz_amm52x20_x2_ifma256,\@function,5
.align 32
rsaz_amm52x20_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lrsaz_amm52x20_x2_ifma256_body:
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$20, $iter
.align 32
.Lloop20:
___
&amm52x20_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,"($k0)");
# 20*8 = offset of the next dimension in two-dimension array
&amm52x20_x1(20*8,20*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop20
___
&amm52x20_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0);
&amm52x20_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R0_1, `5*32`($res)
vmovdqu64 $R0_1h, `6*32`($res)
vmovdqu64 $R1_1, `7*32`($res)
vmovdqu64 $R1_1h, `8*32`($res)
vmovdqu64 $R2_1, `9*32`($res)
vzeroupper
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbp
.cfi_restore %rbp
mov 40(%rsp),%rbx
.cfi_restore %rbx
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lrsaz_amm52x20_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x20_x2_ifma256, .-rsaz_amm52x20_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x20_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x20_win5
.type extract_multiplier_2x20_win5,\@abi-omnipotent
extract_multiplier_2x20_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*20*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
foreach (0..9) {
my $mask = $_<5?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*20*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x20_win5, .-extract_multiplier_2x20_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_def_handler,\@abi-omnipotent
.align 16
rsaz_def_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_def_handler,.-rsaz_def_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x20_x1_ifma256
.rva .LSEH_end_rsaz_amm52x20_x1_ifma256
.rva .LSEH_info_rsaz_amm52x20_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x20_x2_ifma256
.rva .LSEH_end_rsaz_amm52x20_x2_ifma256
.rva .LSEH_info_rsaz_amm52x20_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x20_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x1_ifma256_body,.Lrsaz_amm52x20_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x20_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_def_handler
.rva .Lrsaz_amm52x20_x2_ifma256_body,.Lrsaz_amm52x20_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x20_x1_ifma256
.globl rsaz_amm52x20_x2_ifma256
.globl extract_multiplier_2x20_win5
.type rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
rsaz_amm52x20_x1_ifma256:
rsaz_amm52x20_x2_ifma256:
extract_multiplier_2x20_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,854 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637 | 2 866 593 | cycles/sign |
# | | 203.2 | 453.5 / +123% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=1;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));
# Emits one digit-step of Almost Montgomery Multiplication for a 30-digit
# (radix-2^52) operand held in eight ymm accumulators: multiplies the whole
# operand |a| by the 64-bit digit b[i], derives the reduction multiplier
# yi = (acc * k0) & mask52, folds in yi * |m|, then shifts the accumulator
# chain right by one qword.  The emitted text clobbers %r10, %r12, %r13
# and %rdx (mulx implicit source).
sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
# _acc - GPR holding the scalar low-qword accumulator;
# _R0.._R3h - eight ymm accumulators (4 digits each = 30 digits + pad);
# _k0 - Montgomery constant k0 (register or memory operand string).
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
# xmm alias of R0 so vmovq can extract its low qword.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $zero, $_R3h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
# Emits carry propagation over the eight ymm accumulators (the scalar
# accumulator $_acc is first merged into R0[0]):
#  1) the 12 high bits of each 52-bit digit are extracted, shifted one
#     digit left (valignq) and added back;
#  2) per-digit masks of "overflowed" (> 2^52-1, vpcmpuq op 6) and
#     "saturated" (== 2^52-1, vpcmpuq op 0) digits are collected into
#     byte GPRs, and the addb/adcb byte chain plus xor ripples all
#     resulting carries across the 30 digits in a few scalar ops;
#  3) masked vpsubq/vpandq apply the carries and re-mask to 52 bits.
# Uses %r8-14,%e[abcd]x plus mask registers %k1-%k7.
sub amm52x30_x1_norm {
# $_acc - scalar accumulator GPR; $_R0..$_R3h - the eight ymm accumulators.
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
shr \$4,%r11b
kmovb %r11d,%k1
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
___
}
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x30_x1_ifma256
.type rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
rsaz_amm52x30_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 30 digits unrolled by 4
mov \$7, $iter
.align 32
.Lloop7:
___
# 7 loop iterations x 4 unrolled digit-steps = 28 of the 30 b-digits.
foreach my $idx (0..3) {
&amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop7
___
# Remaining 2 digits (30 = 7*4 + 2) are processed outside the loop.
&amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
&amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
# Final carry propagation back to normalized radix-2^52 form.
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
# const BN_ULONG a[2][32],
# const BN_ULONG b[2][32],
# const BN_ULONG m[2][32],
# const BN_ULONG k0[2]);
###############################################################################
$code.=<<___;
.text
.globl rsaz_amm52x30_x2_ifma256
.type rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
rsaz_amm52x30_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$30, $iter
.align 32
.Lloop30:
___
&amm52x30_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
# 32*8 = offset of the next dimension in two-dimension array
&amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop30
___
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
&amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R0_1, `8*32`($res)
vmovdqu64 $R0_1h, `9*32`($res)
vmovdqu64 $R1_1, `10*32`($res)
vmovdqu64 $R1_1h, `11*32`($res)
vmovdqu64 $R2_1, `12*32`($res)
vmovdqu64 $R2_1h, `13*32`($res)
vmovdqu64 $R3_1, `14*32`($res)
vmovdqu64 $R3_1h, `15*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x30_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x30_x2_ifma256, .-rsaz_amm52x30_x2_ifma256
___
}
###############################################################################
# void extract_multiplier_2x30_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
{
# Emits a table-extraction routine: the loop below touches every one of the
# 2^5 table entries and selects the two wanted ones with mask-register
# blends, so the memory access pattern does not depend on the (secret)
# indices red_table_idx1/red_table_idx2.
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));
# 16 ymm destination registers: t0..t7 collect entry idx1, t8..t15 entry idx2.
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
# xmm alias of t0 for the cheap vpxor zeroing below.
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x30_win5
.type extract_multiplier_2x30_win5,\@abi-omnipotent
extract_multiplier_2x30_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*32*8`($red_tbl), %rax # holds end of the tbl
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
vmovdqa64 $t0, $cur_idx
___
# Zero the remaining 15 destination registers.
foreach (1..15) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
$code.=<<___;
.align 32
.Lloop:
vpcmpq \$0, $cur_idx, $idx1, %k1 # mask of (idx1 == cur_idx)
vpcmpq \$0, $cur_idx, $idx2, %k2 # mask of (idx2 == cur_idx)
___
# Blend each 32-byte slice of the current entry into its destination when
# the corresponding index matches (first 8 regs -> idx1, last 8 -> idx2).
foreach (0..15) {
my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
vmovdqu64 `${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{$mask} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*32*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop
___
# store t0..n
foreach (0..15) {
$code.="vmovdqu64 $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x30_win5, .-extract_multiplier_2x30_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x30_x1_ifma256
.rva .LSEH_end_rsaz_amm52x30_x1_ifma256
.rva .LSEH_info_rsaz_amm52x30_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x30_x2_ifma256
.rva .LSEH_end_rsaz_amm52x30_x2_ifma256
.rva .LSEH_info_rsaz_amm52x30_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x30_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x1_ifma256_body,.Lrsaz_amm52x30_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x30_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x30_x2_ifma256_body,.Lrsaz_amm52x30_x2_ifma256_epilogue
#endif
___
} else {
$code.="#endif";
}
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x30_x1_ifma256
.globl rsaz_amm52x30_x2_ifma256
.globl extract_multiplier_2x30_win5
.type rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
rsaz_amm52x30_x1_ifma256:
rsaz_amm52x30_x2_ifma256:
extract_multiplier_2x30_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x30_x1_ifma256, .-rsaz_amm52x30_x1_ifma256
___
}}}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,915 @@
# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# | | OpenSSL 3.0.0-alpha15 | this | Unit |
# |---------+-----------------------+---------------+-------------|
# | rsa4096 | 14 301 430 | 5 813 953 | cycles/sign |
# | | 90.9 | 223.6 / +146% | sign/s |
# |---------+-----------------------+---------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
# CLI contract: <flavour> <output-path>.  Die early if either is missing.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
# Windows x64 ABI is selected by the flavour (nasm/masm/mingw64) or an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Assume AVX512-IFMA toolchain support; the probes below clear it if too old.
$avx512ifma=1;
# Locate the x86_64 perlasm translator next to this script or in perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# GNU assembler needs >= 2.26 for the AVX512-IFMA mnemonics.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
$avx512ifma = ($1>=2.26);
}
# NASM needs >= 2.11.8 (or any >= 2.12).
if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
$avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}
# clang/LLVM needs >= 7.0 (Apple clang >= 10.0.1).
if (!$avx512ifma && `$ENV{CC} -v 2>&1`
=~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
if ($1) {
# Apple conditions, they use a different version series, see
# https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
# clang 7.0.0 is Apple clang 10.0.1
$avx512ifma = ($ver>=10.0001)
} else {
$avx512ifma = ($ver>=7.0);
}
}
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
for (@ARGV) { $avx512ifma = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
# Pipe everything we print through the perlasm translator into $output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
###############################################################################
# void rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
my $mask52 = "%rax";
my $acc0_0 = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1 = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr = "%r11";
my $iter = "%ebx";
my $zero = "%ymm0";
my $Bi = "%ymm1";
my $Yi = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h) = map("%ymm$_",(3..12));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h) = map("%ymm$_",(13..22));
# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h,$T4,$T4h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (23..29)));
# Emits one digit-step of Almost Montgomery Multiplication for a 40-digit
# (radix-2^52) operand held in ten ymm accumulators: multiplies the whole
# operand |a| by the 64-bit digit b[i], derives the reduction multiplier
# yi = (acc * k0) & mask52, folds in yi * |m|, then shifts the accumulator
# chain right by one qword.  The emitted text clobbers %r10, %r12, %r13
# and %rdx (mulx implicit source).
sub amm52x40_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
# of data for corresponding AMM operation;
# _b_offset - offset in the |b| array pointing to the next qword digit;
# _acc - GPR holding the scalar low-qword accumulator;
# _R0.._R4h - ten ymm accumulators (4 digits each = 40 digits);
# _k0 - Montgomery constant k0 (register or memory operand string).
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h,$_k0) = @_;
# xmm alias of R0 so vmovq can extract its low qword.
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
movq $_b_offset($b_ptr), %r13 # b[i]
vpbroadcastq %r13, $Bi # broadcast b[i]
movq $_data_offset($a), %rdx
mulx %r13, %r13, %r12 # a[0]*b[i] = (t0,t2)
addq %r13, $_acc # acc += t0
movq %r12, %r10
adcq \$0, %r10 # t2 += CF
movq $_k0, %r13
imulq $_acc, %r13 # acc * k0
andq $mask52, %r13 # yi = (acc * k0) & mask52
vpbroadcastq %r13, $Yi # broadcast y[i]
movq $_data_offset($m), %rdx
mulx %r13, %r13, %r12 # yi * m[0] = (t0,t1)
addq %r13, $_acc # acc += t0
adcq %r12, %r10 # t2 += (t1 + CF)
shrq \$52, $_acc
salq \$12, %r10
or %r10, $_acc # acc = ((acc >> 52) | (t2 << 12))
vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52luq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52luq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52luq `$_data_offset+64*4+32`($m), $Yi, $_R4h
# Shift accumulators right by 1 qword, zero extending the highest one
valignq \$1, $_R0, $_R0h, $_R0
valignq \$1, $_R0h, $_R1, $_R0h
valignq \$1, $_R1, $_R1h, $_R1
valignq \$1, $_R1h, $_R2, $_R1h
valignq \$1, $_R2, $_R2h, $_R2
valignq \$1, $_R2h, $_R3, $_R2h
valignq \$1, $_R3, $_R3h, $_R3
valignq \$1, $_R3h, $_R4, $_R3h
valignq \$1, $_R4, $_R4h, $_R4
valignq \$1, $_R4h, $zero, $_R4h
vmovq $_R0_xmm, %r13
addq %r13, $_acc # acc += R0[0]
vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h
vpmadd52huq `$_data_offset+64*4`($a), $Bi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($a), $Bi, $_R4h
vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
vpmadd52huq `$_data_offset+64*4`($m), $Yi, $_R4
vpmadd52huq `$_data_offset+64*4+32`($m), $Yi, $_R4h
___
}
# Normalization routine: handles carry bits and gets bignum qwords to normalized
# 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
# Emits carry propagation over the ten ymm accumulators (the scalar
# accumulator $_acc is first merged into R0[0]):
#  1) the 12 high bits of each 52-bit digit are extracted, shifted one
#     digit left (valignq) and added back;
#  2) per-digit masks of "overflowed" (> 2^52-1, vpcmpuq op 6) and
#     "saturated" (== 2^52-1, vpcmpuq op 0) digits are collected into
#     byte GPRs, and the addb/adcb byte chain plus xor ripples all
#     resulting carries across the 40 digits in a few scalar ops;
#  3) masked vpsubq/vpandq apply the carries and re-mask to 52 bits.
# Uses %r8-14,%e[abcd]x plus mask registers %k1-%k7 (reloaded for R3h/R4/R4h).
sub amm52x40_x1_norm {
# $_acc - scalar accumulator GPR; $_R0..$_R4h - the ten ymm accumulators.
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_R4,$_R4h) = @_;
$code.=<<___;
# Put accumulator to low qword in R0
vpbroadcastq $_acc, $T0
vpblendd \$3, $T0, $_R0, $_R0
# Extract "carries" (12 high bits) from each QW of the bignum
# Save them to LSB of QWs in T0..Tn
vpsrlq \$52, $_R0, $T0
vpsrlq \$52, $_R0h, $T0h
vpsrlq \$52, $_R1, $T1
vpsrlq \$52, $_R1h, $T1h
vpsrlq \$52, $_R2, $T2
vpsrlq \$52, $_R2h, $T2h
vpsrlq \$52, $_R3, $T3
vpsrlq \$52, $_R3h, $T3h
vpsrlq \$52, $_R4, $T4
vpsrlq \$52, $_R4h, $T4h
# "Shift left" T0..Tn by 1 QW
valignq \$3, $T4, $T4h, $T4h
valignq \$3, $T3h, $T4, $T4
valignq \$3, $T3, $T3h, $T3h
valignq \$3, $T2h, $T3, $T3
valignq \$3, $T2, $T2h, $T2h
valignq \$3, $T1h, $T2, $T2
valignq \$3, $T1, $T1h, $T1h
valignq \$3, $T0h, $T1, $T1
valignq \$3, $T0, $T0h, $T0h
valignq \$3, .Lzeros(%rip), $T0, $T0
# Drop "carries" from R0..Rn QWs
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
# Sum R0..Rn with corresponding adjusted carries
vpaddq $T0, $_R0, $_R0
vpaddq $T0h, $_R0h, $_R0h
vpaddq $T1, $_R1, $_R1
vpaddq $T1h, $_R1h, $_R1h
vpaddq $T2, $_R2, $_R2
vpaddq $T2h, $_R2h, $_R2h
vpaddq $T3, $_R3, $_R3
vpaddq $T3h, $_R3h, $_R3h
vpaddq $T4, $_R4, $_R4
vpaddq $T4h, $_R4h, $_R4h
# Now handle carry bits from this addition
# Get mask of QWs whose 52-bit parts overflow
vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1 # OP=nle (i.e. gt)
vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r14d
kmovb %k2,%r13d
shl \$4,%r13b
or %r13b,%r14b
vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r13d
kmovb %k2,%r12d
shl \$4,%r12b
or %r12b,%r13b
vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%r12d
kmovb %k2,%r11d
shl \$4,%r11b
or %r11b,%r12b
vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%r11d
kmovb %k2,%r10d
shl \$4,%r10b
or %r10b,%r11b
vpcmpuq \$6,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$6,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%r10d
kmovb %k2,%r9d
shl \$4,%r9b
or %r9b,%r10b
addb %r14b,%r14b
adcb %r13b,%r13b
adcb %r12b,%r12b
adcb %r11b,%r11b
adcb %r10b,%r10b
# Get mask of QWs whose 52-bit parts saturated
vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1 # OP=eq
vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
kmovb %k1,%r9d
kmovb %k2,%r8d
shl \$4,%r8b
or %r8b,%r9b
vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
kmovb %k1,%r8d
kmovb %k2,%edx
shl \$4,%dl
or %dl,%r8b
vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
kmovb %k1,%edx
kmovb %k2,%ecx
shl \$4,%cl
or %cl,%dl
vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
kmovb %k1,%ecx
kmovb %k2,%ebx
shl \$4,%bl
or %bl,%cl
vpcmpuq \$0,.Lmask52x4(%rip),${_R4},%k1
vpcmpuq \$0,.Lmask52x4(%rip),${_R4h},%k2
kmovb %k1,%ebx
kmovb %k2,%eax
shl \$4,%al
or %al,%bl
addb %r9b,%r14b
adcb %r8b,%r13b
adcb %dl,%r12b
adcb %cl,%r11b
adcb %bl,%r10b
xor %r9b,%r14b
xor %r8b,%r13b
xor %dl,%r12b
xor %cl,%r11b
xor %bl,%r10b
kmovb %r14d,%k1
shr \$4,%r14b
kmovb %r14d,%k2
kmovb %r13d,%k3
shr \$4,%r13b
kmovb %r13d,%k4
kmovb %r12d,%k5
shr \$4,%r12b
kmovb %r12d,%k6
kmovb %r11d,%k7
vpsubq .Lmask52x4(%rip), $_R0, ${_R0}{%k1}
vpsubq .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
vpsubq .Lmask52x4(%rip), $_R1, ${_R1}{%k3}
vpsubq .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
vpsubq .Lmask52x4(%rip), $_R2, ${_R2}{%k5}
vpsubq .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
vpsubq .Lmask52x4(%rip), $_R3, ${_R3}{%k7}
vpandq .Lmask52x4(%rip), $_R0, $_R0
vpandq .Lmask52x4(%rip), $_R0h, $_R0h
vpandq .Lmask52x4(%rip), $_R1, $_R1
vpandq .Lmask52x4(%rip), $_R1h, $_R1h
vpandq .Lmask52x4(%rip), $_R2, $_R2
vpandq .Lmask52x4(%rip), $_R2h, $_R2h
vpandq .Lmask52x4(%rip), $_R3, $_R3
shr \$4,%r11b
kmovb %r11d,%k1
kmovb %r10d,%k2
shr \$4,%r10b
kmovb %r10d,%k3
vpsubq .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}
vpsubq .Lmask52x4(%rip), $_R4, ${_R4}{%k2}
vpsubq .Lmask52x4(%rip), $_R4h, ${_R4h}{%k3}
vpandq .Lmask52x4(%rip), $_R3h, $_R3h
vpandq .Lmask52x4(%rip), $_R4, $_R4
vpandq .Lmask52x4(%rip), $_R4h, $_R4h
___
}
# ---------------------------------------------------------------------------
# Emit rsaz_amm52x40_x1_ifma256 -- Almost Montgomery Multiplication on
# 40x52-bit-digit operands using AVX-512 IFMA256.  Declared \@function,5,
# so it takes five arguments; presumably (res, a, b, m, k0), mirroring the
# x2 variant documented below -- confirm against the C header.
# The whole body sits under a cpp guard so assemblers too old for AVX-512
# fall through to the ud2 stubs emitted at the end of the file.
$code.=<<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
.globl rsaz_amm52x40_x1_ifma256
.type rsaz_amm52x40_x1_ifma256,\@function,5
.align 32
rsaz_amm52x40_x1_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
# Win64 ABI treats xmm6-xmm15 as callee-saved: spill them to a scratch area.
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x1_ifma256_body:
___
# Zero the ten zmm accumulators holding the 40-digit redundant result, plus
# the scalar accumulator, then loop over b's 40 digits (10 iterations x 4
# unrolled digits per iteration).
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
xorl $acc0_0_low, $acc0_0_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
# Loop over 40 digits unrolled by 4
mov \$10, $iter
.align 32
.Lloop10:
___
# Each amm52x40_x1 call consumes one 64-bit digit of b (offsets 0..3*8).
foreach my $idx (0..3) {
&amm52x40_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,$k0);
}
$code.=<<___;
lea `4*8`($b_ptr), $b_ptr
dec $iter
jne .Lloop10
___
# Final carry propagation / normalization of the 52-bit redundant digits.
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
# Win64: restore the spilled xmm registers before popping the GPRs.
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp # restore rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x1_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
# 4 x 52-bit mask constant used by the normalization helpers above.
$code.=<<___;
.section .rodata
.align 32
.Lmask52x4:
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.quad 0xfffffffffffff
.text
___
###############################################################################
# void rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
# const BN_ULONG a[2][40],
# const BN_ULONG b[2][40],
# const BN_ULONG m[2][40],
# const BN_ULONG k0[2]);
###############################################################################
# Two independent 40-digit AMMs interleaved in one pass (e.g. the p and q
# halves of an RSA CRT exponentiation), sharing the loop over b's digits.
$code.=<<___;
.text
.globl rsaz_amm52x40_x2_ifma256
.type rsaz_amm52x40_x2_ifma256,\@function,5
.align 32
rsaz_amm52x40_x2_ifma256:
.cfi_startproc
endbranch
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
___
# Win64: spill callee-saved xmm6-xmm15.
$code.=<<___ if ($win64);
lea -168(%rsp),%rsp
vmovdqa64 %xmm6, `0*16`(%rsp) # save non-volatile registers
vmovdqa64 %xmm7, `1*16`(%rsp)
vmovdqa64 %xmm8, `2*16`(%rsp)
vmovdqa64 %xmm9, `3*16`(%rsp)
vmovdqa64 %xmm10,`4*16`(%rsp)
vmovdqa64 %xmm11,`5*16`(%rsp)
vmovdqa64 %xmm12,`6*16`(%rsp)
vmovdqa64 %xmm13,`7*16`(%rsp)
vmovdqa64 %xmm14,`8*16`(%rsp)
vmovdqa64 %xmm15,`9*16`(%rsp)
.Lrsaz_amm52x40_x2_ifma256_body:
___
# Zero both sets of accumulators (suffix _0 for the first operand pair,
# _1 for the second), then iterate over all 40 digits of b, one per pass.
$code.=<<___;
# Zeroing accumulators
vpxord $zero, $zero, $zero
vmovdqa64 $zero, $R0_0
vmovdqa64 $zero, $R0_0h
vmovdqa64 $zero, $R1_0
vmovdqa64 $zero, $R1_0h
vmovdqa64 $zero, $R2_0
vmovdqa64 $zero, $R2_0h
vmovdqa64 $zero, $R3_0
vmovdqa64 $zero, $R3_0h
vmovdqa64 $zero, $R4_0
vmovdqa64 $zero, $R4_0h
vmovdqa64 $zero, $R0_1
vmovdqa64 $zero, $R0_1h
vmovdqa64 $zero, $R1_1
vmovdqa64 $zero, $R1_1h
vmovdqa64 $zero, $R2_1
vmovdqa64 $zero, $R2_1h
vmovdqa64 $zero, $R3_1
vmovdqa64 $zero, $R3_1h
vmovdqa64 $zero, $R4_1
vmovdqa64 $zero, $R4_1h
xorl $acc0_0_low, $acc0_0_low
xorl $acc0_1_low, $acc0_1_low
movq $b, $b_ptr # backup address of b
movq \$0xfffffffffffff, $mask52 # 52-bit mask
mov \$40, $iter
.align 32
.Lloop40:
___
# One digit of each of the two independent multiplications per iteration;
# the second uses k0[1] and the second plane of the 2-D arrays.
&amm52x40_x1( 0, 0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h,"($k0)");
# 40*8 = offset of the next dimension in two-dimension array
&amm52x40_x1(40*8,40*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h,"8($k0)");
$code.=<<___;
lea 8($b_ptr), $b_ptr
dec $iter
jne .Lloop40
___
# Normalize both results independently.
&amm52x40_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$R4_0,$R4_0h);
&amm52x40_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,$R4_1,$R4_1h);
$code.=<<___;
vmovdqu64 $R0_0, `0*32`($res)
vmovdqu64 $R0_0h, `1*32`($res)
vmovdqu64 $R1_0, `2*32`($res)
vmovdqu64 $R1_0h, `3*32`($res)
vmovdqu64 $R2_0, `4*32`($res)
vmovdqu64 $R2_0h, `5*32`($res)
vmovdqu64 $R3_0, `6*32`($res)
vmovdqu64 $R3_0h, `7*32`($res)
vmovdqu64 $R4_0, `8*32`($res)
vmovdqu64 $R4_0h, `9*32`($res)
vmovdqu64 $R0_1, `10*32`($res)
vmovdqu64 $R0_1h, `11*32`($res)
vmovdqu64 $R1_1, `12*32`($res)
vmovdqu64 $R1_1h, `13*32`($res)
vmovdqu64 $R2_1, `14*32`($res)
vmovdqu64 $R2_1h, `15*32`($res)
vmovdqu64 $R3_1, `16*32`($res)
vmovdqu64 $R3_1h, `17*32`($res)
vmovdqu64 $R4_1, `18*32`($res)
vmovdqu64 $R4_1h, `19*32`($res)
vzeroupper
lea (%rsp),%rax
.cfi_def_cfa_register %rax
___
$code.=<<___ if ($win64);
vmovdqa64 `0*16`(%rax),%xmm6
vmovdqa64 `1*16`(%rax),%xmm7
vmovdqa64 `2*16`(%rax),%xmm8
vmovdqa64 `3*16`(%rax),%xmm9
vmovdqa64 `4*16`(%rax),%xmm10
vmovdqa64 `5*16`(%rax),%xmm11
vmovdqa64 `6*16`(%rax),%xmm12
vmovdqa64 `7*16`(%rax),%xmm13
vmovdqa64 `8*16`(%rax),%xmm14
vmovdqa64 `9*16`(%rax),%xmm15
lea 168(%rsp),%rax
___
$code.=<<___;
mov 0(%rax),%r15
.cfi_restore %r15
mov 8(%rax),%r14
.cfi_restore %r14
mov 16(%rax),%r13
.cfi_restore %r13
mov 24(%rax),%r12
.cfi_restore %r12
mov 32(%rax),%rbp
.cfi_restore %rbp
mov 40(%rax),%rbx
.cfi_restore %rbx
lea 48(%rax),%rsp
.cfi_def_cfa %rsp,8
.Lrsaz_amm52x40_x2_ifma256_epilogue:
ret
.cfi_endproc
.size rsaz_amm52x40_x2_ifma256, .-rsaz_amm52x40_x2_ifma256
___
# Closes the lexical scope opened earlier in the file (outside this excerpt)
# that declared the register-name variables used above.
}
###############################################################################
# void extract_multiplier_2x40_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
# int red_table_idx1, int red_table_idx2);
#
###############################################################################
# Constant-time table lookup for the 5-bit fixed-window exponentiation:
# touches every one of the 2^5 table entries and blends in the matching one
# under a mask, so memory access pattern is independent of the secret index.
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9) = map("%ymm$_", (16..19));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (20..24));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;
# Emits one full sweep of the table: for each entry, compare the broadcast
# target index against $cur_idx and blend the entry into t0..t9 when equal.
# $_offset selects which of the two 40-digit planes is read; it also keys
# the loop label (.Lloop_0 / .Lloop_320) so the two sweeps don't collide.
# Precondition: %rax holds the end-of-table address (set by the caller).
sub get_table_value_consttime() {
my ($_idx,$_offset) = @_;
$code.=<<___;
vpxorq $cur_idx, $cur_idx, $cur_idx
.align 32
.Lloop_$_offset:
vpcmpq \$0, $cur_idx, $_idx, %k1 # mask of (idx == cur_idx)
___
foreach (0..9) {
$code.=<<___;
vmovdqu64 `$_offset+${_}*32`($red_tbl), $tmp # load data from red_tbl
vpblendmq $tmp, $t[$_], ${t[$_]}{%k1} # extract data when mask is not zero
___
}
$code.=<<___;
vpaddq $ones, $cur_idx, $cur_idx # increment cur_idx
addq \$`2*40*8`, $red_tbl
cmpq $red_tbl, %rax
jne .Lloop_$_offset
___
}
$code.=<<___;
.text
.align 32
.globl extract_multiplier_2x40_win5
.type extract_multiplier_2x40_win5,\@abi-omnipotent
extract_multiplier_2x40_win5:
.cfi_startproc
endbranch
vmovdqa64 .Lones(%rip), $ones # broadcast ones
vpbroadcastq $red_tbl_idx1, $idx1
vpbroadcastq $red_tbl_idx2, $idx2
leaq `(1<<5)*2*40*8`($red_tbl), %rax # holds end of the tbl
# backup red_tbl address
movq $red_tbl, %r10
# zeroing t0..n, cur_idx
vpxor $t0xmm, $t0xmm, $t0xmm
___
# Propagate the zeroed t0 into the remaining nine scratch registers.
foreach (1..9) {
$code.="vmovdqa64 $t0, $t[$_] \n";
}
# First sweep: plane 0 of the selected entry, stored at out[0..39].
&get_table_value_consttime($idx1, 0);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(0+$_)*32`($out) \n";
}
# Rewind the table pointer (the sweep advanced it to the end).
$code.="movq %r10, $red_tbl \n";
# Second sweep: plane 1 (offset 40*8), possibly a different index.
&get_table_value_consttime($idx2, 40*8);
foreach (0..9) {
$code.="vmovdqu64 $t[$_], `(10+$_)*32`($out) \n";
}
$code.=<<___;
ret
.cfi_endproc
.size extract_multiplier_2x40_win5, .-extract_multiplier_2x40_win5
___
$code.=<<___;
.section .rodata
.align 32
.Lones:
.quad 1,1,1,1
.Lzeros:
.quad 0,0,0,0
.text
___
}
# Win64 structured-exception-handling unwind handler for the two AMM
# routines above: restores the saved xmm6-15 and GPRs from the known frame
# layout so the unwinder can walk past a fault inside the function body.
# The .pdata/.xdata tables at the end register it for both functions.
# On non-Windows targets only the matching #endif for the assembler-version
# guard is emitted (the whole heredoc below is emitted verbatim, including
# its trailing #endif).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type rsaz_avx_handler,\@abi-omnipotent
.align 16
rsaz_avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
lea (%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea `48+168`(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size rsaz_avx_handler,.-rsaz_avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_amm52x40_x1_ifma256
.rva .LSEH_end_rsaz_amm52x40_x1_ifma256
.rva .LSEH_info_rsaz_amm52x40_x1_ifma256
.rva .LSEH_begin_rsaz_amm52x40_x2_ifma256
.rva .LSEH_end_rsaz_amm52x40_x2_ifma256
.rva .LSEH_info_rsaz_amm52x40_x2_ifma256
.section .xdata
.align 4
.LSEH_info_rsaz_amm52x40_x1_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x1_ifma256_body,.Lrsaz_amm52x40_x1_ifma256_epilogue
.align 4
.LSEH_info_rsaz_amm52x40_x2_ifma256:
.byte 9,0,0,0
.rva rsaz_avx_handler
.rva .Lrsaz_amm52x40_x2_ifma256_body,.Lrsaz_amm52x40_x2_ifma256_epilogue
#endif
___
# Non-Win64: just close the assembler-capability guard.
} else {
$code.="#endif";
}
# The triple braces close scopes opened earlier in the file (outside this
# excerpt).  When the build-time assembler cannot encode AVX-512 IFMA, emit
# trapping (ud2) stubs under the same three symbol names so the file still
# links; the dispatcher is expected never to call them on such builds.
}}} else {{{ # fallback for old assembler
$code.=<<___;
.text
.globl rsaz_amm52x40_x1_ifma256
.globl rsaz_amm52x40_x2_ifma256
.globl extract_multiplier_2x40_win5
.type rsaz_amm52x40_x1_ifma256,\@abi-omnipotent
rsaz_amm52x40_x1_ifma256:
rsaz_amm52x40_x2_ifma256:
extract_multiplier_2x40_win5:
.byte 0x0f,0x0b # ud2
ret
.size rsaz_amm52x40_x1_ifma256, .-rsaz_amm52x40_x1_ifma256
___
}}}
# Expand the `...` arithmetic placeholders accumulated in $code, then write
# the finished assembly to stdout (redirected to the output file).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,628 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
# Locate the shared perlasm framework relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";
$output = $ARGV[1];
open STDOUT,">$output";
&asm_init($ARGV[0]);
# SSE2 path is compiled in unconditionally; runtime CPUID check selects it.
$sse2=1;
&external_label("OPENSSL_ia32cap_P") if ($sse2);
# bn_mul_mont(rp, ap, bp, np, n0, num): i386 Montgomery multiplication.
# Returns 0 (in eax) for num < 4 so the caller falls back to generic code.
# Note $ap/$tp and $rp/$bp alias the same registers -- the input pointers
# are consumed before the temporaries are needed.
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
# SSE2 path: selected at runtime when CPUID bit 26 (SSE2) is set.  Works in
# the MMX register bank using pmuludq (32x32->64 multiply), keeping two
# 64-bit carry chains ($car0 for a*b, $car1 for the reduction by np).
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
# First pass (i=0): tp = ap*bp[0] with on-the-fly reduction.
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
# Last iteration of the first pass, then store the top two words.
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
# Outer loop (i=1..num): tp = (tp + ap*bp[i]) with reduction.
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
# Integer-only path for non-SSE2 CPUs.  The disabled if(0) arm documents a
# previous decision to bail out here instead; the else arm implements full
# Montgomery multiplication plus a dedicated squaring routine (bn_sqr_mont)
# used when ap == bp and num is even.
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
# Dispatch: if num is odd or ap != bp, do the general multiply;
# otherwise fall through to the squaring-specialized code.
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
# mull: first pass, tp = ap*bp[0] (no tp to accumulate yet).
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
# 1stmadd: outer iterations, tp += ap*bp[i].
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
# 2ndmadd: reduction pass, tp = (tp + np*m) >> 32.
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
# Dedicated squaring: exploits symmetry ap[i]*ap[j] == ap[j]*ap[i] by
# doubling cross products ($sbit carries the bit shifted out by the *2).
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
# 3rdmadd: reduction pass for the squaring path, two words per iteration.
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
# Common tail: conditionally subtract np from tp and copy the result to rp
# without a data-dependent branch (mask eax/edx selects tp vs tp-np), then
# restore the saved stack pointer and return 1 in eax.
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
# eax = borrow mask (keep difference), edx = its complement (keep tp).
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
View File

@@ -0,0 +1,534 @@
/* x86_64 BIGNUM accelerator version 0.1, December 2002.
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL project.
* SPDX-License-Identifier: Apache-2.0
*
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features own ABI which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference.
* Win64 implements different ABI for AMD64, different from Linux.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For the reference. IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
#include <openssl/bn.h>
// TODO(davidben): Get this file working on MSVC x64.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    (defined(__GNUC__) || defined(__clang__))
#include "../internal.h"
#undef mul
#undef mul_add
// "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
// mul_add: r += a * word + carry, with (carry) receiving the new high word.
// Implemented as a 64x64->128 mulq followed by two carry-propagating adds.
#define mul_add(r, a, word, carry)                                           \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc");   \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low)                                                       \
            : "cc");                                                         \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+m"(r), "+d"(high)                                            \
            : "r"(carry)                                                     \
            : "cc");                                                         \
    (carry) = high;                                                          \
  } while (0)
// mul: r = low(a * word) + carry; (carry) receives the new high word.
#define mul(r, a, word, carry)                                               \
  do {                                                                       \
    register BN_ULONG high, low;                                             \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc");   \
    __asm__("addq %2,%0; adcq $0,%1"                                         \
            : "+r"(carry), "+d"(high)                                        \
            : "a"(low)                                                       \
            : "cc");                                                         \
    (r) = (carry);                                                           \
    (carry) = high;                                                          \
  } while (0)
#undef sqr
// sqr: (r0, r1) = low and high words of a*a.
#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
// bn_mul_add_words computes rp[i] += ap[i] * w for |num| words and returns
// the final carry word. The main loop is unrolled four-wide; the tail handles
// the remaining zero to three words.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;
  if (num == 0) {
    return (c1);
  }
  // |num & ~3| is non-zero while at least four words remain.
  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  // Handle the remaining 1-3 words.
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
    return c1;
  }
  return c1;
}
// bn_mul_words computes rp[i] = ap[i] * w (overwriting, not accumulating) for
// |num| words and returns the final carry word. Unrolled four-wide like
// |bn_mul_add_words|.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  BN_ULONG c1 = 0;
  if (num == 0) {
    return c1;
  }
  // |num & ~3| is non-zero while at least four words remain.
  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  // Handle the remaining 1-3 words.
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}
// bn_sqr_words writes, for each of the |n| input words, the two-word square:
// (r[2i+1], r[2i]) = a[i] * a[i]. No cross products a[i]*a[j] are computed
// here; |r| must have room for 2*|n| words.
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  if (n == 0) {
    return;
  }
  // Four squares per iteration while at least four words remain.
  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }
  // Handle the remaining 1-3 words.
  if (n) {
    sqr(r[0], r[1], a[0]);
    if (--n == 0) {
      return;
    }
    sqr(r[2], r[3], a[1]);
    if (--n == 0) {
      return;
    }
    sqr(r[4], r[5], a[2]);
  }
}
// bn_add_words sets rp[i] = ap[i] + bp[i] for |n| words, chaining the carry
// across the whole array with ADCQ, and returns the final carry (0 or 1).
// LEA and DEC update the index and counter because neither clobbers CF, which
// must survive between loop iterations; the trailing SBBQ materializes CF as
// 0 or all-ones, masked to one bit on return.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;
  if (n == 0) {
    return 0;
  }
  __asm__ volatile (
      " subq %0,%0 \n" // clear carry
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " adcq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " dec %1 \n"
      " jnz 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");
  return ret & 1;
}
// bn_sub_words sets rp[i] = ap[i] - bp[i] for |n| words, chaining the borrow
// across the array with SBBQ, and returns the final borrow (0 or 1). As in
// |bn_add_words|, LEA/DEC are used because they do not clobber CF, which
// carries the borrow between iterations.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;
  if (n == 0) {
    return 0;
  }
  __asm__ volatile (
      " subq %0,%0 \n" // clear borrow
      " jmp 1f \n"
      ".p2align 4 \n"
      "1:"
      " movq (%4,%2,8),%0 \n"
      " sbbq (%5,%2,8),%0 \n"
      " movq %0,(%3,%2,8) \n"
      " lea 1(%2),%2 \n"
      " dec %1 \n"
      " jnz 1b \n"
      " sbbq %0,%0 \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");
  return ret & 1;
}
// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0)
// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
// Keep in mind that carrying into high part of multiplication result can not
// overflow, because it cannot be all-ones.
// (c2,c1,c0) += a * b: one MULQ, then a three-word ripple add.
#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
// (c2,c1,c0) += a[i]^2.
#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
// (c2,c1,c0) += 2 * a * b: the product is computed once and added twice.
#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG t1, t2; \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \
            : "+&r"(c0), "+r"(c1), "+r"(c2) \
            : "r"(t1), "r"(t2) \
            : "cc"); \
  } while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
// bn_mul_comba8 computes the 16-word product r = a * b for 8-word inputs with
// the Comba (column-wise) method: each stanza accumulates every partial
// product a[i]*b[j] with i + j == k into a three-word accumulator, stores the
// low word in r[k], and then rotates the accumulator roles (the cleared
// variable becomes the new top word for the next column).
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the remaining top word is column 15.
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_mul_comba4 computes the 8-word product r = a * b for 4-word inputs using
// the same column-wise Comba scheme as |bn_mul_comba8|.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the remaining top word is column 7.
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
// bn_sqr_comba8 computes the 16-word square r = a^2 of an 8-word input with
// the Comba method. Diagonal terms a[i]^2 use |sqr_add_c|; off-diagonal pairs
// a[i]*a[j] (i > j) appear twice in the square and use |sqr_add_c2|, which
// adds the product twice.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6.
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  // Column 7.
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  // Column 8.
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  // Column 9.
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  // Column 10.
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  // Column 11.
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  // Column 12.
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  // Column 13.
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  // Column 14; the remaining top word is column 15.
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_sqr_comba4 computes the 8-word square r = a^2 of a 4-word input, using
// the same diagonal/off-diagonal Comba scheme as |bn_sqr_comba8|.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  // Column 0.
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  // Column 1.
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  // Column 2.
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  // Column 3.
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  // Column 4.
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  // Column 5.
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  // Column 6; the remaining top word is column 7.
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2
#endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,407 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <limits.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../delocate.h"
// BN_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|. It is
// sized so byte and bit counts of a |BIGNUM| always fit in |int|, with room to
// spare.
#define BN_MAX_WORDS (INT_MAX / (4 * BN_BITS2))
// BN_new heap-allocates a zero-valued |BIGNUM|, or returns NULL on allocation
// failure. |BN_FLG_MALLOCED| records that |BN_free| must also release the
// structure itself.
BIGNUM *BN_new(void) {
  BIGNUM *ret = OPENSSL_zalloc(sizeof(BIGNUM));
  if (ret != NULL) {
    ret->flags = BN_FLG_MALLOCED;
  }
  return ret;
}
// BN_secure_new behaves identically to |BN_new|.
BIGNUM *BN_secure_new(void) {
  return BN_new();
}
// BN_init zeroes every field of |bn|, leaving a valid zero-valued BIGNUM with
// no flags set and no allocated limbs.
void BN_init(BIGNUM *bn) {
  OPENSSL_memset(bn, 0, sizeof(*bn));
}
// BN_free releases |bn|'s limb array unless it is static data, and releases
// the structure itself only when it was heap-allocated by |BN_new|. For
// stack- or embedded BIGNUMs, |d| is reset so the value cannot dangle.
void BN_free(BIGNUM *bn) {
  if (bn == NULL) {
    return;
  }
  const int flags = bn->flags;
  if (!(flags & BN_FLG_STATIC_DATA)) {
    OPENSSL_free(bn->d);
  }
  if (flags & BN_FLG_MALLOCED) {
    OPENSSL_free(bn);
  } else {
    bn->d = NULL;
  }
}
// BN_clear_free is equivalent to |BN_free|; both take the same release path.
void BN_clear_free(BIGNUM *bn) {
  BN_free(bn);
}
// BN_dup returns a freshly allocated copy of |src|, NULL if |src| is NULL or
// on allocation failure. The caller owns the result.
BIGNUM *BN_dup(const BIGNUM *src) {
  if (src == NULL) {
    return NULL;
  }
  BIGNUM *copy = BN_new();
  if (copy != NULL && BN_copy(copy, src) == NULL) {
    BN_free(copy);
    copy = NULL;
  }
  return copy;
}
// BN_copy copies |src|'s value (words, width, and sign — not flags) into
// |dest| and returns |dest|, or NULL on allocation failure.
BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) {
  if (src == dest) {
    return dest;
  }
  if (!bn_wexpand(dest, src->width)) {
    return NULL;
  }
  dest->width = src->width;
  dest->neg = src->neg;
  OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->width);
  return dest;
}
// BN_clear wipes the entire allocation (all |dmax| words, not just the active
// ones) and resets the value to zero.
void BN_clear(BIGNUM *bn) {
  if (bn->d != NULL) {
    OPENSSL_memset(bn->d, 0, (size_t)bn->dmax * sizeof(bn->d[0]));
  }
  bn->width = 0;
  bn->neg = 0;
}
// BN_value_one returns a shared, read-only BIGNUM with value one. The limb
// array is static const; |BN_FLG_STATIC_DATA| prevents freeing or resizing.
DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) {
  static const BN_ULONG kOneLimbs[1] = { 1 };
  out->d = (BN_ULONG*) kOneLimbs;
  out->width = 1;
  out->dmax = 1;
  out->neg = 0;
  out->flags = BN_FLG_STATIC_DATA;
}
// BN_num_bits_word returns the minimum number of bits needed to represent the
// value in |l|. This is a branchless binary search: each stage tests whether
// the upper half of the remaining window is non-zero and, via an all-ones /
// all-zeros mask, both adds the half-width to the count and selects which half
// to continue with. Do not "simplify" this into a loop or builtin — the exact
// form is what keeps it constant-time.
unsigned BN_num_bits_word(BN_ULONG l) {
  // |BN_num_bits| is often called on RSA prime factors. These have public bit
  // lengths, but all bits beyond the high bit are secret, so count bits in
  // constant time.
  BN_ULONG x, mask;
  // A non-zero input needs at least one bit.
  int bits = (l != 0);
#if BN_BITS2 > 32
  // Look at the upper half of |x|. |x| is at most 64 bits long.
  x = l >> 32;
  // Set |mask| to all ones if |x| (the top 32 bits of |l|) is non-zero and all
  // all zeros otherwise.
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  // If |x| is non-zero, the lower half is included in the bit count in full,
  // and we count the upper half. Otherwise, we count the lower half.
  bits += 32 & mask;
  l ^= (x ^ l) & mask;  // |l| is |x| if |mask| and remains |l| otherwise.
#endif
  // The remaining blocks are analogous iterations at lower powers of two.
  x = l >> 16;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 16 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 8;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 8 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 4;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 4 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 2;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 2 & mask;
  l ^= (x ^ l) & mask;
  x = l >> 1;
  mask = 0u - x;
  mask = (0u - (mask >> (BN_BITS2 - 1)));
  bits += 1 & mask;
  return bits;
}
// |BN_num_bits| and |BN_num_bytes| return |int| in OpenSSL but |unsigned|
// here, so swapping OpenSSL for AWS-LC is, in theory, not type-safe. In
// practice the result always fits in an |int|: |BN_MAX_WORDS| caps a
// |BIGNUM|'s bit count well below |INT_MAX|, and if the bit count fits in an
// |int|, so do the byte and word counts.
// BN_num_bits returns the number of significant bits in |bn|: all full words
// below the top non-zero word, plus that word's bit length. Zero has zero
// bits.
unsigned BN_num_bits(const BIGNUM *bn) {
  const int top = bn_minimal_width(bn);
  return top == 0 ? 0
                  : (top - 1) * BN_BITS2 + BN_num_bits_word(bn->d[top - 1]);
}
// BN_num_bytes returns the number of bytes needed to hold |bn|, i.e. the bit
// count rounded up to a whole byte.
unsigned BN_num_bytes(const BIGNUM *bn) {
  return (BN_num_bits(bn) + 7) >> 3;
}
// BN_get_minimal_width exposes |bn_minimal_width| publicly. It returns |int|
// rather than |unsigned| because ibmtpm compares the result directly against
// signed values.
int BN_get_minimal_width(const BIGNUM *bn) {
  return bn_minimal_width(bn);
}
// BN_zero sets |bn| to the canonical zero: empty width and non-negative sign.
void BN_zero(BIGNUM *bn) {
  bn->width = 0;
  bn->neg = 0;
}
// BN_one sets |bn| to one. Returns one on success, zero on allocation
// failure.
int BN_one(BIGNUM *bn) {
  return BN_set_word(bn, 1);
}
// BN_set_word sets |bn| to the single-word value |value|. Returns one on
// success, zero on allocation failure.
int BN_set_word(BIGNUM *bn, BN_ULONG value) {
  if (value == 0) {
    // Zero is represented with width zero, not a zero word.
    BN_zero(bn);
    return 1;
  }
  if (!bn_wexpand(bn, 1)) {
    return 0;
  }
  bn->d[0] = value;
  bn->width = 1;
  bn->neg = 0;
  return 1;
}
// BN_set_u64 sets |bn| to the 64-bit value |value|. Returns one on success,
// zero on allocation failure.
int BN_set_u64(BIGNUM *bn, uint64_t value) {
#if BN_BITS2 == 64
  // One word holds the whole value.
  return BN_set_word(bn, value);
#elif BN_BITS2 == 32
  if (value <= BN_MASK2) {
    return BN_set_word(bn, (BN_ULONG)value);
  }
  if (!bn_wexpand(bn, 2)) {
    return 0;
  }
  bn->d[0] = (BN_ULONG)value;          // low 32 bits
  bn->d[1] = (BN_ULONG)(value >> 32);  // high 32 bits
  bn->width = 2;
  bn->neg = 0;
  return 1;
#else
#error "BN_BITS2 must be 32 or 64."
#endif
}
// bn_set_words replaces |bn|'s value with the |num| little-endian words at
// |words|, clearing the sign. Returns one on success, zero on failure.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
  if (!bn_wexpand(bn, num)) {
    return 0;
  }
  // memmove rather than memcpy in case |words| aliases |bn->d|.
  OPENSSL_memmove(bn->d, words, num * sizeof(BN_ULONG));
  bn->neg = 0;
  // |bn_wexpand| already verified that |num| fits in an int.
  bn->width = (int)num;
  return 1;
}
// bn_set_static_words points |bn| at a caller-owned, immutable word array.
// Any heap buffer |bn| previously owned is released, and the BIGNUM is marked
// static so it will never be freed or resized.
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num) {
  if (!(bn->flags & BN_FLG_STATIC_DATA)) {
    OPENSSL_free(bn->d);
  }
  assert(num <= BN_MAX_WORDS);
  bn->d = (BN_ULONG *)words;
  bn->width = (int)num;
  bn->dmax = (int)num;
  bn->neg = 0;
  bn->flags |= BN_FLG_STATIC_DATA;
}
// bn_fits_in_words returns one if |bn|'s value fits in |num| words, i.e. all
// words at index |num| and above are zero. The words are OR-accumulated and
// tested once, avoiding data-dependent branches.
int bn_fits_in_words(const BIGNUM *bn, size_t num) {
  BN_ULONG acc = 0;
  for (size_t i = num; i < (size_t)bn->width; i++) {
    acc |= bn->d[i];
  }
  return acc == 0;
}
// bn_copy_words writes |bn|'s value into exactly |num| little-endian words at
// |out|, zero-padding the top. Fails on negative input or if the value does
// not fit in |num| words.
int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) {
  if (bn->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  size_t copy = (size_t)bn->width;
  if (copy > num) {
    // Truncation is only allowed when every dropped word is zero.
    if (!bn_fits_in_words(bn, num)) {
      OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
      return 0;
    }
    copy = num;
  }
  OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num);
  OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * copy);
  return 1;
}
// BN_is_negative returns one if |bn| is negative and zero otherwise.
int BN_is_negative(const BIGNUM *bn) {
  return bn->neg ? 1 : 0;
}
// BN_set_negative makes |bn| negative iff |sign| is non-zero and |bn| is
// non-zero; zero is always kept non-negative.
void BN_set_negative(BIGNUM *bn, int sign) {
  bn->neg = (sign != 0 && !BN_is_zero(bn)) ? 1 : 0;
}
// bn_wexpand grows |bn|'s allocation to at least |words| words, preserving
// the value. Returns one on success; fails (with an error pushed) if |words|
// exceeds |BN_MAX_WORDS|, if |bn| is backed by static data, or on OOM.
int bn_wexpand(BIGNUM *bn, size_t words) {
  if (words <= (size_t)bn->dmax) {
    // Already large enough.
    return 1;
  }
  if (words > BN_MAX_WORDS) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  if (bn->flags & BN_FLG_STATIC_DATA) {
    OPENSSL_PUT_ERROR(BN, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
    return 0;
  }
  // Allocate zeroed storage, migrate the active words, and release the old
  // buffer.
  BN_ULONG *fresh = OPENSSL_calloc(words, sizeof(BN_ULONG));
  if (fresh == NULL) {
    return 0;
  }
  OPENSSL_memcpy(fresh, bn->d, sizeof(BN_ULONG) * bn->width);
  OPENSSL_free(bn->d);
  bn->d = fresh;
  bn->dmax = (int)words;
  return 1;
}
// bn_expand grows |bn| to hold at least |bits| bits, rejecting a |bits| so
// large that rounding up to a whole word would overflow.
int bn_expand(BIGNUM *bn, size_t bits) {
  if (bits + BN_BITS2 - 1 < bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  const size_t words = (bits + BN_BITS2 - 1) / BN_BITS2;
  return bn_wexpand(bn, words);
}
// bn_resize_words sets |bn->width| to exactly |words|: zero-extending when
// growing, and truncating only when every dropped word is zero. Returns one
// on success, zero (with an error pushed) otherwise.
int bn_resize_words(BIGNUM *bn, size_t words) {
#if (defined(OPENSSL_PPC64LE) || defined(OPENSSL_PPC64BE)) && defined(__clang__) && __clang_major__ < 10
  // This is a workaround for a miscompilation bug in Clang 7.0.1 on POWER.
  // The unittests catch the miscompilation, if it occurs, and it manifests
  // as a crash in |bn_fits_in_words|.
  //
  // The bug only triggers if building in FIPS mode and with -O3. Clang 8.0.1
  // has the same bug but this workaround is not effective there---I've not
  // been able to find a workaround for 8.0.1.
  //
  // At the time of writing (2019-08-08), Clang git does *not* have this bug
  // and does not need this workaround. The current git version should go on to
  // be Clang 10 thus, once we can depend on that, this can be removed.
  // |value_barrier_w| hides the comparison from the optimizer; do not
  // "simplify" this check away.
  if (value_barrier_w((size_t)bn->width == words)) {
    return 1;
  }
#endif
  if ((size_t)bn->width <= words) {
    // Growing: zero the new high words.
    if (!bn_wexpand(bn, words)) {
      return 0;
    }
    OPENSSL_memset(bn->d + bn->width, 0,
                   (words - bn->width) * sizeof(BN_ULONG));
    bn->width = (int)words;
    return 1;
  }
  // All words beyond the new width must be zero.
  if (!bn_fits_in_words(bn, words)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  bn->width = (int)words;
  return 1;
}
// bn_select_words sets r[i] = a[i] if |mask| is all-ones and r[i] = b[i] if
// |mask| is zero, word by word in constant time.
void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
                     const BN_ULONG *b, size_t num) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  for (size_t i = 0; i < num; i++) {
    r[i] = constant_time_select_w(mask, a[i], b[i]);
  }
}
// bn_minimal_width returns |bn|'s width with high zero words stripped. Note
// the loop's exit condition depends on word values, so this is not
// constant-time.
int bn_minimal_width(const BIGNUM *bn) {
  int width = bn->width;
  while (width > 0 && bn->d[width - 1] == 0) {
    width--;
  }
  return width;
}
// bn_set_minimal_width shrinks |bn->width| to drop high zero words,
// canonicalizing zero as non-negative.
void bn_set_minimal_width(BIGNUM *bn) {
  const int width = bn_minimal_width(bn);
  bn->width = width;
  if (width == 0) {
    bn->neg = 0;
  }
}
// BN_get_flags returns the subset of |bn|'s flag bits selected by |flags|.
int BN_get_flags(const BIGNUM *bn, int flags) {
  const int selected = bn->flags & flags;
  return selected;
}
// BN_set_flags is intentionally a no-op: the requested flag bits are ignored.
// NOTE(review): presumably because flags such as |BN_FLG_CONSTTIME| are
// unnecessary here — confirm against the public bn.h documentation.
void BN_set_flags(BIGNUM *b, int n) { }

View File

@@ -0,0 +1,76 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#include <openssl/bn.h>
#include <openssl/rand.h>
#include "./internal.h"
#include <gtest/gtest.h>
// A 255-byte value whose top byte is non-zero: every length >= 255 must pass
// the assertion; shorter lengths must abort in debug builds.
TEST(BNAssertTest, Assert_fits_in_bytes_large) {
  // TODO: Update Android test harness
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[255] = {0};
  buf[0] = 0xaa;
  buf[1] = 0x01;
  buf[254] = 0x01;  // Highest byte set, so all 255 bytes are significant.
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 255; len < 260; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 247; len < 255; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}
// A value with three significant bytes: any length >= 3 must pass, and any
// shorter length must trip the debug assertion.
TEST(BNAssertTest, Assert_fits_in_bytes_small) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[8] = {0xaa, 0xbb, 0xcc};  // Remaining bytes zero-initialized.
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 3; len < 10; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 0; len < 3; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}
// Zero fits in any number of bytes, including zero bytes.
TEST(BNAssertTest, Assert_fits_in_bytes_zero) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t zeros[8] = {0};
  ASSERT_TRUE(BN_le2bn(zeros, sizeof(zeros), x.get()));
  for (size_t len = 0; len < 10; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
#endif
}
// A full 8-byte value with a non-zero top byte: exactly the buffer size is
// the boundary between passing and aborting.
TEST(BNAssertTest, Assert_fits_in_bytes_boundary) {
#if !defined(NDEBUG) && !defined(OPENSSL_ANDROID)
  bssl::UniquePtr<BIGNUM> x(BN_new());
  uint8_t buf[8];
  for (size_t i = 0; i < sizeof(buf); i++) {
    buf[i] = i * (i + 1) & 0xff;  // Every byte assigned; top byte non-zero.
  }
  ASSERT_TRUE(BN_le2bn(buf, sizeof(buf), x.get()));
  for (size_t len = 8; len < 18; len++) {
    bn_assert_fits_in_bytes(x.get(), len);
  }
  for (size_t len = 0; len < 8; len++) {
    EXPECT_DEATH_IF_SUPPORTED(bn_assert_fits_in_bytes(x.get(), len), "");
  }
#endif
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,292 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <limits.h>
#include "internal.h"
// bn_big_endian_to_words decodes the |in_len| big-endian bytes at |in| into
// |out_len| little-endian words at |out|, zero-filling the high words.
void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
                            size_t in_len) {
  // The caller should have sized |out| to fit |in| without truncating. This
  // condition ensures we do not overflow |out|, so use a runtime check.
  BSSL_CHECK(in_len <= out_len * sizeof(BN_ULONG));
  size_t filled = 0;
  // Peel whole words off the little end of the big-endian input.
  while (in_len >= sizeof(BN_ULONG)) {
    in_len -= sizeof(BN_ULONG);
    out[filled++] = CRYPTO_load_word_be(in + in_len);
  }
  // Assemble the final, partial word byte by byte.
  if (in_len != 0) {
    BN_ULONG word = 0;
    for (size_t i = 0; i < in_len; i++) {
      word = (word << 8) | in[i];
    }
    out[filled++] = word;
  }
  // Zero the remaining high words.
  OPENSSL_memset(out + filled, 0, (out_len - filled) * sizeof(BN_ULONG));
}
// BN_bin2bn decodes |len| big-endian bytes from |in| into |ret|, allocating a
// fresh BIGNUM when |ret| is NULL. The result is non-negative. Returns the
// BIGNUM or NULL on allocation failure.
BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *fresh = NULL;
  if (ret == NULL) {
    fresh = BN_new();
    if (fresh == NULL) {
      return NULL;
    }
    ret = fresh;
  }
  if (len == 0) {
    ret->width = 0;
    return ret;
  }
  // Round the byte count up to whole words; this form cannot overflow.
  size_t num_words = ((len - 1) / BN_BYTES) + 1;
  if (!bn_wexpand(ret, num_words)) {
    BN_free(fresh);
    return NULL;
  }
  // |bn_wexpand| bounds |num_words|, so the cast below is safe.
  assert(num_words <= INT_MAX);
  ret->width = (int)num_words;
  ret->neg = 0;
  bn_big_endian_to_words(ret->d, ret->width, in, len);
  return ret;
}
// BN_le2bn decodes |len| little-endian bytes from |in| into |ret|, allocating
// a fresh BIGNUM when |ret| is NULL. The result is always non-negative.
// Returns the BIGNUM or NULL on allocation failure.
BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
  BIGNUM *bn = NULL;
  if (ret == NULL) {
    bn = BN_new();
    if (bn == NULL) {
      return NULL;
    }
    ret = bn;
  }
  if (len == 0) {
    ret->width = 0;
    ret->neg = 0;
    return ret;
  }
  // Reserve enough space in |ret|.
  size_t num_words = ((len - 1) / BN_BYTES) + 1;
  if (!bn_wexpand(ret, num_words)) {
    BN_free(bn);
    return NULL;
  }
  ret->width = (int)num_words;
  // The decoded value is non-negative. Previously only the |len == 0| path
  // cleared |neg|, so decoding into a reused, negative |BIGNUM| kept the
  // stale sign bit; clear it unconditionally, matching |BN_bin2bn|.
  ret->neg = 0;
  bn_little_endian_to_words(ret->d, ret->width, in, len);
  return ret;
}
// bn_little_endian_to_words decodes the |in_len| little-endian bytes at |in|
// into |out_len| little-endian words at |out|, zero-filling the high words.
// The caller must size |out| so the input is not truncated.
void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, const size_t in_len) {
  assert(out_len > 0);
  // Mirror the truncation check in |bn_big_endian_to_words|: without it, an
  // oversized |in_len| makes the little-endian |OPENSSL_memcpy| below write
  // past the end of |out| (the big-endian branch only caught this via the
  // trailing assert).
  assert(in_len <= out_len * sizeof(BN_ULONG));
#ifdef OPENSSL_BIG_ENDIAN
  size_t in_index = 0;
  for (size_t i = 0; i < out_len; i++) {
    if ((in_len-in_index) < sizeof(BN_ULONG)) {
      // Load the last partial word.
      BN_ULONG word = 0;
      // size_t is unsigned, so j >= 0 is always true.
      for (size_t j = in_len-1; j >= in_index && j < in_len; j--) {
        word = (word << 8) | in[j];
      }
      in_index = in_len;
      out[i] = word;
      // Fill the remainder with zeros.
      OPENSSL_memset(out + i + 1, 0, (out_len - i - 1) * sizeof(BN_ULONG));
      break;
    }
    out[i] = CRYPTO_load_word_le(in + in_index);
    in_index += sizeof(BN_ULONG);
  }
  // The caller should have sized the output to avoid truncation.
  assert(in_index == in_len);
#else
  // Little-endian host: the byte layout already matches.
  OPENSSL_memcpy(out, in, in_len);
  // Fill the remainder with zeros.
  OPENSSL_memset( ((uint8_t*)out) + in_len, 0, sizeof(BN_ULONG)*out_len - in_len);
#endif
}
// fits_in_bytes returns one if the |num_words| words in |words| can be
// represented in |num_bytes| bytes. All excess bytes are OR-accumulated into
// |mask| and tested once, so there are no data-dependent branches.
static int fits_in_bytes(const BN_ULONG *words, size_t num_words,
                         size_t num_bytes) {
  uint8_t mask = 0;
#ifdef OPENSSL_BIG_ENDIAN
  // Start at the word containing byte |num_bytes|; within that word, only the
  // bytes at or above the limit count.
  for (size_t i = num_bytes / BN_BYTES; i < num_words; i++) {
    BN_ULONG word = words[i];
    for (size_t j = 0; j < BN_BYTES; j++) {
      if ((i * BN_BYTES) + j < num_bytes) {
        // For the first word we don't need to check any bytes shorter than len
        continue ;
      } else {
        mask |= (word >> (j * 8)) & 0xff;
      }
    }
  }
#else
  // Little-endian host: the in-memory byte order is the little-endian byte
  // string, so scan raw bytes past the limit directly.
  const uint8_t *bytes = (const uint8_t *)words;
  size_t tot_bytes = num_words * sizeof(BN_ULONG);
  for (size_t i = num_bytes; i < tot_bytes; i++) {
    mask |= bytes[i];
  }
#endif
  return mask == 0;
}
// Asserts that the BIGNUM can be represented within |num| bytes.
// The logic is consistent with `fits_in_bytes` but assertions will fail when
// false. The excess bytes are declassified first so constant-time
// instrumentation permits inspecting them.
void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num) {
  const uint8_t *bytes = (const uint8_t *)bn->d;
  size_t tot_bytes = bn->width * sizeof(BN_ULONG);
  if (tot_bytes > num) {
    CONSTTIME_DECLASSIFY(bytes + num, tot_bytes - num);
    // Avoids compiler error: unused variable 'byte' or 'word'
    // The assert statements below are only effective in DEBUG builds
#ifndef NDEBUG
#ifdef OPENSSL_BIG_ENDIAN
    // Mirror the big-endian byte walk in |fits_in_bytes|, asserting each
    // excess byte individually.
    for (int i = num / BN_BYTES; i < bn->width; i++) {
      BN_ULONG word = bn->d[i];
      for (size_t j = 0; j < BN_BYTES; j++) {
        if ((i * BN_BYTES) + j < num) {
          // For the first word we don't need to check any bytes shorter than len
          continue;
        } else {
          uint8_t byte = (word >> (j * 8)) & 0xff;
          assert(byte == 0);
        }
      }
    }
#else
    // Little-endian host: excess bytes are contiguous at the end.
    for (size_t i = num; i < tot_bytes; i++) {
      assert(bytes[i] == 0);
    }
#endif
#endif
    (void)bytes;
  }
}
// bn_words_to_big_endian encodes the |in_len| little-endian words at |in| as
// an |out_len|-byte big-endian byte string at |out|, zero-padding the front.
// The caller must have chosen |out_len| large enough to avoid truncation.
void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
                            size_t in_len) {
  // The caller should have selected an output length without truncation.
  declassify_assert(fits_in_bytes(in, in_len, out_len));
  size_t num_bytes = in_len * sizeof(BN_ULONG);
  if (out_len < num_bytes) {
    num_bytes = out_len;
  }
#ifdef OPENSSL_BIG_ENDIAN
  // Extract byte |i| (counting from the little end of the value) and place it
  // |i| bytes from the end of the output.
  for (size_t i = 0; i < num_bytes; i++) {
    BN_ULONG l = in[i / BN_BYTES];
    out[out_len - i - 1] = (uint8_t)(l >> (8 * (i % BN_BYTES))) & 0xff;
  }
#else
  // Little-endian host: the value's bytes are already in little-endian order
  // in memory, so reverse them into |out|.
  const uint8_t *bytes = (const uint8_t *)in;
  for (size_t i = 0; i < num_bytes; i++) {
    out[out_len - i - 1] = bytes[i];
  }
#endif
  // Pad out the rest of the buffer with zeroes.
  OPENSSL_memset(out, 0, out_len - num_bytes);
}
// BN_bn2bin writes |in| as a minimal-length big-endian byte string to |out|
// and returns the number of bytes written.
size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
  const size_t num_bytes = BN_num_bytes(in);
  bn_words_to_big_endian(out, num_bytes, in->d, in->width);
  return num_bytes;
}
// bn_words_to_little_endian encodes the |in_len| little-endian words at |in|
// as an |out_len|-byte little-endian byte string at |out|, zero-padding the
// end. The caller must have chosen |out_len| large enough to avoid
// truncation.
void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len) {
  // The caller should have selected an output length without truncation.
  assert(fits_in_bytes(in, in_len, out_len));
  size_t num_bytes = in_len * sizeof(BN_ULONG);
  if (out_len < num_bytes) {
    num_bytes = out_len;
  }
#ifdef OPENSSL_BIG_ENDIAN
  // Emit each word least-significant byte first.
  size_t byte_idx = 0;
  for (size_t word_idx = 0; word_idx < in_len; word_idx++) {
    BN_ULONG l = in[word_idx];
    for(size_t j = 0; j < BN_BYTES && byte_idx < num_bytes; j++) {
      out[byte_idx] = (uint8_t)(l & 0xff);
      l >>= 8;
      byte_idx++;
    }
  }
#else
  // Little-endian host: the in-memory layout is already the wire format.
  const uint8_t *bytes = (const uint8_t *)in;
  OPENSSL_memcpy(out, bytes, num_bytes);
#endif
  // Fill the remainder with zeros.
  OPENSSL_memset(out + num_bytes, 0, out_len - num_bytes);
}
// BN_bn2le_padded writes |in| as exactly |len| little-endian bytes,
// zero-padded. Returns one on success, or zero (writing nothing) if the value
// needs more than |len| bytes.
int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) {
  if (fits_in_bytes(in->d, in->width, len)) {
    bn_words_to_little_endian(out, len, in->d, in->width);
    return 1;
  }
  return 0;
}
// BN_bn2bin_padded writes |in| as exactly |len| big-endian bytes,
// zero-padded. Returns one on success, or zero (writing nothing) if the value
// needs more than |len| bytes.
int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) {
  if (fits_in_bytes(in->d, in->width, len)) {
    bn_words_to_big_endian(out, len, in->d, in->width);
    return 1;
  }
  return 0;
}
// BN_get_word returns |bn|'s value as a single word: zero for zero, the word
// itself when it fits, and BN_MASK2 (all-ones) when it does not.
BN_ULONG BN_get_word(const BIGNUM *bn) {
  const int width = bn_minimal_width(bn);
  if (width == 0) {
    return 0;
  }
  if (width == 1) {
    return bn->d[0];
  }
  return BN_MASK2;  // Does not fit in one word; saturate.
}
// BN_get_u64 writes |bn|'s magnitude to |*out| and returns one if it fits in
// 64 bits, and returns zero otherwise.
int BN_get_u64(const BIGNUM *bn, uint64_t *out) {
  const int width = bn_minimal_width(bn);
  if (width == 0) {
    *out = 0;
    return 1;
  }
  if (width == 1) {
    *out = bn->d[0];
    return 1;
  }
#if defined(OPENSSL_32_BIT)
  // Two 32-bit words still fit in a uint64_t.
  if (width == 2) {
    *out = (uint64_t)bn->d[0] | ((uint64_t)bn->d[1] << 32);
    return 1;
  }
#endif
  return 0;
}

View File

@@ -0,0 +1,147 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/mem.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// bn_cmp_words_consttime returns -1, 0, or 1 as (a, a_len) is less than,
// equal to, or greater than (b, b_len). The word values are processed in
// constant time; only the lengths are treated as public.
static int bn_cmp_words_consttime(const BN_ULONG *a, size_t a_len,
                                  const BN_ULONG *b, size_t b_len) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  int ret = 0;
  // Process the common words in little-endian order.
  // Later (more significant) words overwrite the verdict, so the final value
  // reflects the most significant differing word.
  size_t min = a_len < b_len ? a_len : b_len;
  for (size_t i = 0; i < min; i++) {
    crypto_word_t eq = constant_time_eq_w(a[i], b[i]);
    crypto_word_t lt = constant_time_lt_w(a[i], b[i]);
    ret =
        constant_time_select_int(eq, ret, constant_time_select_int(lt, -1, 1));
  }
  // If |a| or |b| has non-zero words beyond |min|, they take precedence.
  if (a_len < b_len) {
    crypto_word_t mask = 0;
    for (size_t i = a_len; i < b_len; i++) {
      mask |= b[i];
    }
    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, -1);
  } else if (b_len < a_len) {
    crypto_word_t mask = 0;
    for (size_t i = b_len; i < a_len; i++) {
      mask |= a[i];
    }
    ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1);
  }
  return ret;
}
// BN_ucmp compares the magnitudes of |a| and |b|, ignoring signs, returning
// -1, 0, or 1. Word values are compared in constant time.
int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
  return bn_cmp_words_consttime(a->d, a->width, b->d, b->width);
}
// BN_cmp compares |a| and |b| as signed values, returning -1, 0, or 1. A
// non-NULL value sorts before NULL; two NULLs compare equal.
int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
  if (a == NULL || b == NULL) {
    if (a == b) {
      return 0;  // Both NULL.
    }
    return a != NULL ? -1 : 1;
  }
  // We do not attempt to process the sign bit in constant time. Negative
  // |BIGNUM|s should never occur in crypto, only calculators.
  if (a->neg != b->neg) {
    return a->neg ? -1 : 1;
  }
  const int ucmp = BN_ucmp(a, b);
  // For negative values, the larger magnitude is the smaller number.
  return a->neg ? -ucmp : ucmp;
}
// bn_less_than_words returns one if (a, len) < (b, len), comparing word
// values in constant time.
int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) {
  const int cmp = bn_cmp_words_consttime(a, len, b, len);
  return cmp < 0;
}
// BN_abs_is_word returns one if |bn|'s magnitude equals the word |w|. The
// difference for word zero and all higher words is folded into a single
// accumulator and tested once, avoiding data-dependent branches.
int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
  if (bn->width == 0) {
    return w == 0;
  }
  BN_ULONG diff = bn->d[0] ^ w;
  for (int i = bn->width - 1; i >= 1; i--) {
    diff |= bn->d[i];
  }
  return diff == 0;
}
// BN_cmp_word compares |a| against the single word |b| by wrapping |b| in a
// stack-allocated, static-data BIGNUM and delegating to |BN_cmp|.
int BN_cmp_word(const BIGNUM *a, BN_ULONG b) {
  BIGNUM wrapped;
  BN_init(&wrapped);
  wrapped.d = &b;
  wrapped.width = (b != 0) ? 1 : 0;  // Zero is width zero.
  wrapped.dmax = 1;
  wrapped.flags = BN_FLG_STATIC_DATA;
  return BN_cmp(a, &wrapped);
}
// BN_is_zero returns one if |bn| is zero, i.e. it has no non-zero words.
int BN_is_zero(const BIGNUM *bn) {
  return bn_fits_in_words(bn, 0);
}
// BN_is_one returns one if |bn| equals positive one.
int BN_is_one(const BIGNUM *bn) {
  return !bn->neg && BN_abs_is_word(bn, 1);
}
// BN_is_word returns one if |bn| equals the non-negative word |w|. The sign
// only matters for non-zero |w|, since zero is never negative.
int BN_is_word(const BIGNUM *bn, BN_ULONG w) {
  if (!BN_abs_is_word(bn, w)) {
    return 0;
  }
  return w == 0 || bn->neg == 0;
}
// BN_is_odd returns one if |bn| is odd. Zero (width zero) is even.
int BN_is_odd(const BIGNUM *bn) {
  if (bn->width == 0) {
    return 0;
  }
  return (int)(bn->d[0] & 1);
}
int BN_is_pow2(const BIGNUM *bn) {
int width = bn_minimal_width(bn);
if (width == 0 || bn->neg) {
return 0;
}
for (int i = 0; i < width - 1; i++) {
if (bn->d[i] != 0) {
return 0;
}
}
return 0 == (bn->d[width-1] & (bn->d[width-1] - 1));
}
// BN_equal_consttime returns one if |a| and |b| are equal (including sign),
// accumulating all differences into a single mask so the comparison takes the
// same time regardless of where the values differ.
int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) {
  BN_ULONG mask = 0;
  // If |a| or |b| has more words than the other, all those words must be zero.
  for (int i = a->width; i < b->width; i++) {
    mask |= b->d[i];
  }
  for (int i = b->width; i < a->width; i++) {
    mask |= a->d[i];
  }
  // Common words must match.
  int min = a->width < b->width ? a->width : b->width;
  for (int i = 0; i < min; i++) {
    mask |= (a->d[i] ^ b->d[i]);
  }
  // The sign bit must match.
  mask |= (a->neg ^ b->neg);
  return mask == 0;
}

View File

@@ -0,0 +1,182 @@
// Written by Ulf Moeller for the OpenSSL project.
// Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "../../internal.h"
// Initial capacity used when the resizable stack of frames first expands.
#define BN_CTX_START_FRAMES 32

// BN_STACK

// A |BN_STACK| is a stack of |size_t| values.
typedef struct {
  // Array of indexes into |ctx->bignums|.
  size_t *indexes;
  // Number of stack frames, and the size of the allocated array
  size_t depth, size;
} BN_STACK;

static void BN_STACK_init(BN_STACK *);
static void BN_STACK_cleanup(BN_STACK *);
static int BN_STACK_push(BN_STACK *, size_t idx);
static size_t BN_STACK_pop(BN_STACK *);

// BN_CTX

DEFINE_STACK_OF(BIGNUM)

// The opaque BN_CTX type
struct bignum_ctx {
  // bignums is the stack of |BIGNUM|s managed by this |BN_CTX|.
  STACK_OF(BIGNUM) *bignums;
  // stack is the stack of |BN_CTX_start| frames. It is the value of |used| at
  // the time |BN_CTX_start| was called.
  BN_STACK stack;
  // used is the number of |BIGNUM|s from |bignums| that have been used.
  size_t used;
  // error is one if any operation on this |BN_CTX| failed. All subsequent
  // operations will fail.
  char error;
  // defer_error is one if an operation on this |BN_CTX| has failed, but no
  // error has been pushed to the queue yet. This is used to defer errors from
  // |BN_CTX_start| to |BN_CTX_get|.
  char defer_error;
};
// BN_CTX_new returns a fresh, empty |BN_CTX|, or NULL on allocation failure.
BN_CTX *BN_CTX_new(void) {
  // Zero-allocation leaves |bignums|, |used|, and the error flags cleared.
  BN_CTX *ctx = OPENSSL_zalloc(sizeof(BN_CTX));
  if (ctx == NULL) {
    return NULL;
  }
  BN_STACK_init(&ctx->stack);
  return ctx;
}
// There is no separate secure heap here, so a "secure" context is simply an
// ordinary one.
BN_CTX *BN_CTX_secure_new(void) {
  return BN_CTX_new();
}
// BN_CTX_free releases |ctx| and every |BIGNUM| it manages. A NULL |ctx| is a
// no-op.
void BN_CTX_free(BN_CTX *ctx) {
  if (ctx == NULL) {
    return;
  }
  // All |BN_CTX_start| calls must be matched with |BN_CTX_end|, otherwise the
  // function may use more memory than expected, potentially without bound if
  // done in a loop. Unless an operation failed, every borrowed |BIGNUM| must
  // have been returned by now.
  assert(ctx->used == 0 || ctx->error);
  sk_BIGNUM_pop_free(ctx->bignums, BN_free);
  BN_STACK_cleanup(&ctx->stack);
  OPENSSL_free(ctx);
}
// BN_CTX_start opens a new frame in |ctx|. Each call must be matched with
// |BN_CTX_end|. Failures are reported from the next |BN_CTX_get|.
void BN_CTX_start(BN_CTX *ctx) {
  // Once an operation has failed, |ctx->stack| no longer matches the number
  // of |BN_CTX_end| calls to come, so leave it untouched.
  if (ctx->error) {
    return;
  }
  if (BN_STACK_push(&ctx->stack, ctx->used)) {
    return;
  }
  // |BN_CTX_start| has no return value, so record the failure here and defer
  // reporting to |BN_CTX_get|.
  ctx->error = 1;
  ctx->defer_error = 1;
}
// BN_CTX_get returns a zeroed |BIGNUM| scoped to the current |BN_CTX_start|
// frame, or NULL on error. The |BIGNUM| is owned by |ctx| and is reclaimed by
// the matching |BN_CTX_end|; the caller must not free it.
BIGNUM *BN_CTX_get(BN_CTX *ctx) {
  // Once any operation has failed, they all do.
  if (ctx->error) {
    if (ctx->defer_error) {
      // Report the error deferred from an earlier |BN_CTX_start|, exactly
      // once.
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
      ctx->defer_error = 0;
    }
    return NULL;
  }
  // Lazily allocate the backing stack on first use.
  if (ctx->bignums == NULL) {
    ctx->bignums = sk_BIGNUM_new_null();
    if (ctx->bignums == NULL) {
      ctx->error = 1;
      return NULL;
    }
  }
  // If every existing |BIGNUM| is in use, grow the pool by one.
  if (ctx->used == sk_BIGNUM_num(ctx->bignums)) {
    BIGNUM *bn = BN_new();
    if (bn == NULL || !sk_BIGNUM_push(ctx->bignums, bn)) {
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
      BN_free(bn);
      ctx->error = 1;
      return NULL;
    }
  }
  BIGNUM *ret = sk_BIGNUM_value(ctx->bignums, ctx->used);
  BN_zero(ret);
  // This is bounded by |sk_BIGNUM_num|, so it cannot overflow.
  ctx->used++;
  return ret;
}
// BN_CTX_end closes the current frame, returning every |BIGNUM| handed out
// since the matching |BN_CTX_start| to the pool.
void BN_CTX_end(BN_CTX *ctx) {
  // Once an operation has failed, |ctx->stack| no longer matches the number
  // of |BN_CTX_end| calls to come, so do nothing.
  if (!ctx->error) {
    ctx->used = BN_STACK_pop(&ctx->stack);
  }
}
// BN_STACK
// BN_STACK_init makes |st| an empty stack with no allocation; the index array
// is created on demand by |BN_STACK_push|.
static void BN_STACK_init(BN_STACK *st) {
  st->indexes = NULL;
  st->depth = 0;
  st->size = 0;
}
// BN_STACK_cleanup releases |st|'s storage. Only the index array is
// heap-allocated.
static void BN_STACK_cleanup(BN_STACK *st) {
  OPENSSL_free(st->indexes);
}
// BN_STACK_push appends |idx| to |st|, growing the array if needed. It returns
// one on success and zero on allocation failure or overflow.
static int BN_STACK_push(BN_STACK *st, size_t idx) {
  if (st->depth == st->size) {
    // This function intentionally does not push to the error queue on error.
    // Error-reporting is deferred to |BN_CTX_get|.
    size_t new_size;
    if (st->size == 0) {
      new_size = BN_CTX_START_FRAMES;
    } else {
      new_size = st->size * 3 / 2;  // grow by 1.5x
    }
    // Reject overflow of either the growth factor or the byte count.
    if (new_size <= st->size || new_size > SIZE_MAX / sizeof(size_t)) {
      return 0;
    }
    size_t *grown = OPENSSL_realloc(st->indexes, new_size * sizeof(size_t));
    if (grown == NULL) {
      return 0;
    }
    st->indexes = grown;
    st->size = new_size;
  }
  st->indexes[st->depth] = idx;
  st->depth++;
  return 1;
}
// BN_STACK_pop removes and returns the top of |st|. The caller must not pop
// an empty stack.
static size_t BN_STACK_pop(BN_STACK *st) {
  assert(st->depth > 0);
  size_t top = st->indexes[st->depth - 1];
  st->depth -= 1;
  return top;
}

View File

@@ -0,0 +1,856 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <limits.h>
#include <openssl/err.h>
#include "internal.h"
// bn_div_words divides a double-width |h|,|l| by |d| and returns the result,
// which must fit in a |BN_ULONG|.
// bn_div_words divides the double-width value |h|:|l| by |d| and returns the
// quotient, which must fit in a |BN_ULONG|. It works in half-word steps so it
// needs no wider integer type. Used only when |BN_ULLONG| division is
// unavailable.
OPENSSL_UNUSED static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l,
                                            BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;
  if (d == 0) {
    // Division by zero; saturate rather than trap.
    return BN_MASK2;
  }
  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }
  // Normalize so |d|'s top bit is set, shifting |h|:|l| to match.
  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  // Split the divisor into half-words for the estimate-and-correct loop.
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  // Two iterations produce the high and low half-words of the quotient.
  for (;;) {
    // Estimate the next quotient half-word from the top half-words.
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }
    th = q * dh;
    tl = dl * q;
    // Correct the estimate downward until q * d no longer exceeds the
    // remaining numerator.
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    // Subtract q * d from |h|:|l|, propagating borrows.
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;
    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;
    if (--count == 0) {
      break;
    }
    // Shift the remainder up a half-word for the second iteration.
    ret = q << BN_BITS4;
    h = (h << BN_BITS4) | (l >> BN_BITS4);
    l = (l & BN_MASK2l) << BN_BITS4;
  }
  ret |= q;
  return ret;
}
// bn_div_rem_words divides the double-width value |n0|:|n1| by |d0|, writing
// the quotient to |*quotient_out| and the remainder to |*rem_out|. The
// quotient must fit in a |BN_ULONG|, i.e. the caller must ensure |n0| < |d0|.
static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
                                    BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
  // GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when
  // the |BN_ULLONG|-based C code is used.
  //
  // GCC bugs:
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897
  //   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668
  //
  // Clang bugs:
  //   * https://llvm.org/bugs/show_bug.cgi?id=6397
  //   * https://llvm.org/bugs/show_bug.cgi?id=12418
  //
  // These issues aren't specific to x86 and x86_64, so it might be worthwhile
  // to add more assembly language implementations.
#if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86)
  // x86's div takes the high word in edx and the low word in eax.
  __asm__ volatile("divl %4"
                   : "=a"(*quotient_out), "=d"(*rem_out)
                   : "a"(n1), "d"(n0), "rm"(d0)
                   : "cc");
#elif defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86_64)
  __asm__ volatile("divq %4"
                   : "=a"(*quotient_out), "=d"(*rem_out)
                   : "a"(n1), "d"(n0), "rm"(d0)
                   : "cc");
#else
#if defined(BN_CAN_DIVIDE_ULLONG)
  // A double-width integer type is available; let the compiler divide.
  BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
  *quotient_out = (BN_ULONG)(n / d0);
#else
  *quotient_out = bn_div_words(n0, n1, d0);
#endif
  // Derive the remainder from the quotient rather than computing it twice.
  *rem_out = n1 - (*quotient_out * d0);
#endif
}
// BN_div computes "quotient := numerator / divisor", rounding towards zero,
// and sets up |rem| such that "quotient * divisor + rem = numerator" holds.
//
// Thus:
//
// quotient->neg == numerator->neg ^ divisor->neg
// (unless the result is zero)
// rem->neg == numerator->neg
// (unless the remainder is zero)
//
// If |quotient| or |rem| is NULL, the respective value is not returned.
//
// This was specifically designed to contain fewer branches that may leak
// sensitive information; see "New Branch Prediction Vulnerabilities in OpenSSL
// and Necessary Software Countermeasures" by Onur Acıçmez, Shay Gueron, and
// Jean-Pierre Seifert.
int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
           const BIGNUM *divisor, BN_CTX *ctx) {
  int norm_shift, loop;
  BIGNUM wnum;
  BN_ULONG *resp, *wnump;
  BN_ULONG d0, d1;
  int num_n, div_n;
  // This function relies on the historical minimal-width |BIGNUM| invariant.
  // It is already not constant-time (constant-time reductions should use
  // Montgomery logic), so we shrink all inputs and intermediate values to
  // retain the previous behavior.
  // Invalid zero-padding would have particularly bad consequences.
  int numerator_width = bn_minimal_width(numerator);
  int divisor_width = bn_minimal_width(divisor);
  if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) ||
      (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED);
    return 0;
  }
  if (BN_is_zero(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  BN_CTX_start(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  BIGNUM *snum = BN_CTX_get(ctx);
  BIGNUM *sdiv = BN_CTX_get(ctx);
  BIGNUM *res = NULL;
  // If the caller does not want the quotient, compute it into a scratch value.
  if (quotient == NULL) {
    res = BN_CTX_get(ctx);
  } else {
    res = quotient;
  }
  // |sdiv| was the last |BN_CTX_get|, so checking it covers |tmp| and |snum|.
  if (sdiv == NULL || res == NULL) {
    goto err;
  }
  // First we normalise the numbers: shift the divisor so its top bit is set,
  // and shift the numerator by the same amount (plus a whole extra word).
  norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2);
  if (!BN_lshift(sdiv, divisor, norm_shift)) {
    goto err;
  }
  bn_set_minimal_width(sdiv);
  sdiv->neg = 0;
  norm_shift += BN_BITS2;
  if (!BN_lshift(snum, numerator, norm_shift)) {
    goto err;
  }
  bn_set_minimal_width(snum);
  snum->neg = 0;
  // Since we don't want to have special-case logic for the case where snum is
  // larger than sdiv, we pad snum with enough zeroes without changing its
  // value.
  if (snum->width <= sdiv->width + 1) {
    if (!bn_wexpand(snum, sdiv->width + 2)) {
      goto err;
    }
    for (int i = snum->width; i < sdiv->width + 2; i++) {
      snum->d[i] = 0;
    }
    snum->width = sdiv->width + 2;
  } else {
    if (!bn_wexpand(snum, snum->width + 1)) {
      goto err;
    }
    snum->d[snum->width] = 0;
    snum->width++;
  }
  div_n = sdiv->width;
  num_n = snum->width;
  loop = num_n - div_n;
  // Let's set up a 'window' into snum.
  // This is the part that corresponds to the current
  // 'area' being divided.
  wnum.neg = 0;
  wnum.d = &(snum->d[loop]);
  wnum.width = div_n;
  // only needed when BN_ucmp messes up the values between width and max
  wnum.dmax = snum->dmax - loop;  // so we don't step out of bounds
  // Get the top 2 words of sdiv; |d0| drives the quotient estimate and |d1|
  // refines it.
  d0 = sdiv->d[div_n - 1];
  d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
  // pointer to the 'top' of snum
  wnump = &(snum->d[num_n - 1]);
  // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg|
  // for later.
  const int numerator_neg = numerator->neg;
  res->neg = (numerator_neg ^ divisor->neg);
  if (!bn_wexpand(res, loop + 1)) {
    goto err;
  }
  // The quotient has at most |loop - 1| words; the loop below emits one word
  // per iteration, most-significant first.
  res->width = loop - 1;
  resp = &(res->d[loop - 1]);
  // space for temp
  if (!bn_wexpand(tmp, div_n + 1)) {
    goto err;
  }
  // if res->width == 0 then clear the neg value otherwise decrease
  // the resp pointer
  if (res->width == 0) {
    res->neg = 0;
  } else {
    resp--;
  }
  for (int i = 0; i < loop - 1; i++, wnump--, resp--) {
    BN_ULONG q, l0;
    // the first part of the loop uses the top two words of snum and sdiv to
    // calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv
    BN_ULONG n0, n1, rm = 0;
    n0 = wnump[0];
    n1 = wnump[-1];
    if (n0 == d0) {
      q = BN_MASK2;
    } else {
      // n0 < d0
      bn_div_rem_words(&q, &rm, n0, n1, d0);
#ifdef BN_ULLONG
      // Refine the estimate using the divisor's second word; |q| may be at
      // most two too large here.
      BN_ULLONG t2 = (BN_ULLONG)d1 * q;
      for (;;) {
        if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) {
          break;
        }
        q--;
        rm += d0;
        if (rm < d0) {
          break;  // don't let rm overflow
        }
        t2 -= d1;
      }
#else  // !BN_ULLONG
      // Same refinement without a double-width type: track d1*q as a
      // high/low word pair.
      BN_ULONG t2l, t2h;
      BN_UMULT_LOHI(t2l, t2h, d1, q);
      for (;;) {
        if (t2h < rm ||
            (t2h == rm && t2l <= wnump[-2])) {
          break;
        }
        q--;
        rm += d0;
        if (rm < d0) {
          break;  // don't let rm overflow
        }
        if (t2l < d1) {
          t2h--;
        }
        t2l -= d1;
      }
#endif  // !BN_ULLONG
    }
    // Subtract q * sdiv from the current window of snum.
    l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
    tmp->d[div_n] = l0;
    wnum.d--;
    // ignore top values of the bignums just sub the two
    // BN_ULONG arrays with bn_sub_words
    if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
      // Note: As we have considered only the leading
      // two BN_ULONGs in the calculation of q, sdiv * q
      // might be greater than wnum (but then (q-1) * sdiv
      // is less or equal than wnum)
      q--;
      if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) {
        // we can't have an overflow here (assuming
        // that q != 0, but if q == 0 then tmp is
        // zero anyway)
        (*wnump)++;
      }
    }
    // store part of the result
    *resp = q;
  }
  // What remains of |snum| is the (shifted) remainder.
  bn_set_minimal_width(snum);
  if (rem != NULL) {
    // Undo the normalization shift to recover the true remainder.
    if (!BN_rshift(rem, snum, norm_shift)) {
      goto err;
    }
    if (!BN_is_zero(rem)) {
      rem->neg = numerator_neg;
    }
  }
  bn_set_minimal_width(res);
  BN_CTX_end(ctx);
  return 1;
err:
  BN_CTX_end(ctx);
  return 0;
}
// BN_nnmod computes |m| mod |d| like |BN_mod|, but canonicalizes the result
// into the range [0, |d|).
int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) {
  if (!BN_mod(r, m, d, ctx)) {
    return 0;
  }
  if (r->neg) {
    // Here -|d| < r < 0, so bring it into range by adding |d|'s absolute
    // value: |BN_sub| when d is negative, |BN_add| otherwise.
    return (d->neg ? BN_sub : BN_add)(r, r, d);
  }
  return 1;
}
// bn_reduce_once computes |r| = |a| mod |m| in constant time, given
// 0 <= |a| + |carry|*2^(num*BN_BITS2) < 2*|m|. |r| and |a| must not alias.
// The return value is all-ones if the subtraction was not needed (i.e. |a|
// was already reduced) and zero otherwise.
BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry,
                        const BN_ULONG *m, size_t num) {
  assert(r != a);
  // |r| = |a| - |m|. |bn_sub_words| performs the bulk of the subtraction, and
  // then we apply the borrow to |carry|.
  carry -= bn_sub_words(r, a, m, num);
  // We know 0 <= |a| < 2*|m|, so -|m| <= |r| < |m|.
  //
  // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then
  // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to
  // return |r| + |m|, or |a|. |carry| must then be -1 or all ones. In both
  // cases, |carry| is a suitable input to |bn_select_words|.
  //
  // Although |carry| may be one if it was one on input and |bn_sub_words|
  // returns zero, this would give |r| > |m|, violating our input assumptions.
  declassify_assert(carry + 1 <= 1);
  bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num);
  return carry;
}
// bn_reduce_once_in_place behaves like |bn_reduce_once|, but reduces |r| in
// place, using |tmp| as scratch space. |tmp| must not alias |r| or |m|.
BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m,
                                 BN_ULONG *tmp, size_t num) {
  // See |bn_reduce_once| for why this logic works.
  carry -= bn_sub_words(tmp, r, m, num);
  declassify_assert(carry + 1 <= 1);
  bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num);
  return carry;
}
// bn_mod_sub_words computes |r| = |a| - |b| mod |m| in constant time, given
// reduced inputs 0 <= |a|, |b| < |m|. |tmp| is scratch space of |num| words.
void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
  // r = a - b
  BN_ULONG borrow = bn_sub_words(r, a, b, num);
  // tmp = a - b + m
  bn_add_words(tmp, r, m, num);
  // If the subtraction borrowed, the true result is the wrapped-around value
  // plus |m|; select in constant time using the borrow as a mask.
  bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num);
}
// bn_mod_add_words computes |r| = |a| + |b| mod |m| in constant time, given
// reduced inputs 0 <= |a|, |b| < |m|. |tmp| is scratch space of |num| words.
void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      const BN_ULONG *m, BN_ULONG *tmp, size_t num) {
  // The sum is below 2*|m|, so a single conditional subtraction reduces it.
  BN_ULONG carry = bn_add_words(r, a, b, num);
  bn_reduce_once_in_place(r, carry, m, tmp, num);
}
// bn_div_consttime computes |quotient| and |remainder| of |numerator| divided
// by |divisor| in time independent of both values' contents, provided their
// widths are public. |divisor_min_bits| is a public lower bound on the bit
// length of |divisor| used to skip reductions; pass zero if unknown. Either
// output may be NULL. Both inputs must be non-negative.
int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
                     const BIGNUM *numerator, const BIGNUM *divisor,
                     unsigned divisor_min_bits, BN_CTX *ctx) {
  if (BN_is_negative(numerator) || BN_is_negative(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (BN_is_zero(divisor)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  // This function implements long division in binary. It is not very efficient,
  // but it is simple, easy to make constant-time, and performant enough for RSA
  // key generation.
  int ret = 0;
  BN_CTX_start(ctx);
  // Work in scratch values when an output is NULL or aliases an input.
  BIGNUM *q = quotient, *r = remainder;
  if (quotient == NULL || quotient == numerator || quotient == divisor) {
    q = BN_CTX_get(ctx);
  }
  if (remainder == NULL || remainder == numerator || remainder == divisor) {
    r = BN_CTX_get(ctx);
  }
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (q == NULL || r == NULL || tmp == NULL ||
      !bn_wexpand(q, numerator->width) ||
      !bn_wexpand(r, divisor->width) ||
      !bn_wexpand(tmp, divisor->width)) {
    goto err;
  }
  OPENSSL_memset(q->d, 0, numerator->width * sizeof(BN_ULONG));
  q->width = numerator->width;
  q->neg = 0;
  OPENSSL_memset(r->d, 0, divisor->width * sizeof(BN_ULONG));
  r->width = divisor->width;
  r->neg = 0;
  // Incorporate |numerator| into |r|, one bit at a time, reducing after each
  // step. We maintain the invariant that |0 <= r < divisor| and
  // |q * divisor + r = n| where |n| is the portion of |numerator| incorporated
  // so far.
  //
  // First, we short-circuit the loop: if we know |divisor| has at least
  // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated
  // without reductions. This significantly speeds up |RSA_check_key|. For
  // simplicity, we round down to a whole number of words.
  declassify_assert(divisor_min_bits <= BN_num_bits(divisor));
  int initial_words = 0;
  if (divisor_min_bits > 0) {
    initial_words = (divisor_min_bits - 1) / BN_BITS2;
    if (initial_words > numerator->width) {
      initial_words = numerator->width;
    }
    OPENSSL_memcpy(r->d, numerator->d + numerator->width - initial_words,
                   initial_words * sizeof(BN_ULONG));
  }
  for (int i = numerator->width - initial_words - 1; i >= 0; i--) {
    for (int bit = BN_BITS2 - 1; bit >= 0; bit--) {
      // Incorporate the next bit of the numerator, by computing
      // r = 2*r or 2*r + 1. Note the result fits in one more word. We store the
      // extra word in |carry|.
      BN_ULONG carry = bn_add_words(r->d, r->d, r->d, divisor->width);
      r->d[0] |= (numerator->d[i] >> bit) & 1;
      // |r| was previously fully-reduced, so we know:
      //      2*0 <= r <= 2*(divisor-1) + 1
      //        0 <= r <= 2*divisor - 1 < 2*divisor.
      // Thus |r| satisfies the preconditions for |bn_reduce_once_in_place|.
      BN_ULONG subtracted = bn_reduce_once_in_place(r->d, carry, divisor->d,
                                                    tmp->d, divisor->width);
      // The corresponding bit of the quotient is set iff we needed to subtract.
      q->d[i] |= (~subtracted & 1) << bit;
    }
  }
  // Copy out of scratch values if the caller wanted the results.
  if ((quotient != NULL && !BN_copy(quotient, q)) ||
      (remainder != NULL && !BN_copy(remainder, r))) {
    goto err;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_scratch_space_from_ctx returns a |BIGNUM| of exactly |width| words,
// scoped to the current |ctx| frame, or NULL on error. The word contents are
// not initialized.
static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) {
  BIGNUM *bn = BN_CTX_get(ctx);
  if (bn == NULL || !bn_wexpand(bn, width)) {
    return NULL;
  }
  bn->neg = 0;
  bn->width = (int)width;
  return bn;
}
// bn_resized_from_ctx returns |bn| zero-extended to at least |width| words, or
// NULL on error, so it may be used with low-level "words" functions. If
// necessary, it returns a copy with a lifetime of the current scope in |ctx|,
// so the caller does not need to explicitly free it. |bn| must fit in |width|
// words.
static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width,
                                         BN_CTX *ctx) {
  if (width <= (size_t)bn->width) {
    // Already wide enough; any excess words must be zero.
    assert(bn_fits_in_words(bn, width));
    return bn;
  }
  BIGNUM *copy = bn_scratch_space_from_ctx(width, ctx);
  if (copy == NULL || !BN_copy(copy, bn) || !bn_resize_words(copy, width)) {
    return NULL;
  }
  return copy;
}
// BN_mod_add computes |r| = |a| + |b| mod |m|, with the result in [0, |m|).
// Not constant-time; see |bn_mod_add_consttime| for secret values.
int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  return BN_add(r, a, b) && BN_nnmod(r, r, m, ctx);
}
// BN_mod_add_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_add_consttime(r, a, b, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// bn_mod_add_consttime computes |r| = |a| + |b| mod |m| in constant time.
// Inputs must already be reduced, i.e. 0 <= |a|, |b| < |m|.
int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         const BIGNUM *m, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Normalize both operands to the modulus width, then add at the word level.
  a = bn_resized_from_ctx(a, m->width, ctx);
  b = bn_resized_from_ctx(b, m->width, ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = a != NULL && b != NULL && scratch != NULL &&
           bn_wexpand(r, m->width);
  if (ok) {
    bn_mod_add_words(r->d, a->d, b->d, m->d, scratch->d, m->width);
    r->width = m->width;
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_sub computes |r| = |a| - |b| mod |m|, with the result in [0, |m|).
// Not constant-time; see |bn_mod_sub_consttime| for secret values.
int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  return BN_sub(r, a, b) && BN_nnmod(r, r, m, ctx);
}
// bn_mod_sub_consttime computes |r| = |a| - |b| mod |m| in constant time.
// Inputs must already be reduced, i.e. 0 <= |a|, |b| < |m|.
int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         const BIGNUM *m, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Normalize both operands to the modulus width, then subtract at the word
  // level.
  a = bn_resized_from_ctx(a, m->width, ctx);
  b = bn_resized_from_ctx(b, m->width, ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = a != NULL && b != NULL && scratch != NULL &&
           bn_wexpand(r, m->width);
  if (ok) {
    bn_mod_sub_words(r->d, a->d, b->d, m->d, scratch->d, m->width);
    r->width = m->width;
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_sub_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                     const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_sub_consttime(r, a, b, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_mod_mul computes |r| = |a| * |b| mod |m|, with the result in [0, |m|).
// Not constant-time; Montgomery multiplication should be used for secrets.
int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
               BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *product = BN_CTX_get(ctx);
  if (product != NULL) {
    // Prefer squaring when the operands alias; it is faster.
    int mul_ok = (a == b) ? BN_sqr(product, a, ctx)
                          : BN_mul(product, a, b, ctx);
    ret = mul_ok && BN_nnmod(r, product, m, ctx);
  }
  BN_CTX_end(ctx);
  return ret;
}
// BN_mod_sqr computes |r| = |a|^2 mod |m|. A square is never negative, so a
// plain |BN_mod| suffices instead of |BN_nnmod|.
int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  return BN_sqr(r, a, ctx) && BN_mod(r, r, m, ctx);
}
// BN_mod_lshift computes |r| = |a| << n mod |m|, with the result in [0, |m|).
// A negative modulus is handled by working with its absolute value.
int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
                  BN_CTX *ctx) {
  // Reduce |a| into [0, |m|) first.
  if (!BN_nnmod(r, a, m, ctx)) {
    return 0;
  }
  if (!m->neg) {
    return bn_mod_lshift_consttime(r, r, n, m, ctx);
  }
  // The constant-time shift requires a non-negative modulus.
  BIGNUM *abs_m = BN_dup(m);
  if (abs_m == NULL) {
    return 0;
  }
  abs_m->neg = 0;
  int ret = bn_mod_lshift_consttime(r, r, n, abs_m, ctx);
  BN_free(abs_m);
  return ret;
}
// bn_mod_lshift_consttime computes |r| = |a| << n mod |m| in constant time,
// by doubling modulo |m| once per bit of the shift. |a| must be reduced.
int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
                            BN_CTX *ctx) {
  if (!BN_copy(r, a) || !bn_resize_words(r, m->width)) {
    return 0;
  }
  BN_CTX_start(ctx);
  BIGNUM *scratch = bn_scratch_space_from_ctx(m->width, ctx);
  int ok = scratch != NULL;
  if (ok) {
    for (int shift = 0; shift < n; shift++) {
      // r = 2*r mod m via a constant-time modular self-addition.
      bn_mod_add_words(r->d, r->d, r->d, m->d, scratch->d, m->width);
    }
    r->neg = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}
// BN_mod_lshift_quick is a legacy entry point; it allocates a scratch |BN_CTX|
// and defers to the constant-time implementation.
int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_lshift_consttime(r, a, n, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_mod_lshift1 computes |r| = 2*|a| mod |m|, with the result in [0, |m|).
int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
  return BN_lshift1(r, a) && BN_nnmod(r, r, m, ctx);
}
// bn_mod_lshift1_consttime computes |r| = 2*|a| mod |m| in constant time.
// Doubling is simply a modular self-addition.
int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
                             BN_CTX *ctx) {
  return bn_mod_add_consttime(r, a, a, m, ctx);
}
// BN_mod_lshift1_quick is a legacy entry point; it allocates a scratch
// |BN_CTX| and defers to the constant-time implementation.
int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
  int ret = 0;
  BN_CTX *ctx = BN_CTX_new();
  if (ctx != NULL) {
    ret = bn_mod_lshift1_consttime(r, a, m, ctx);
  }
  BN_CTX_free(ctx);
  return ret;
}
// BN_div_word divides |a| by |w| in place and returns the remainder, or
// (BN_ULONG)-1 on error (including division by zero).
BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
  BN_ULONG ret = 0;
  int i, j;
  if (!w) {
    // actually this an error (division by zero)
    return (BN_ULONG) - 1;
  }
  if (a->width == 0) {
    return 0;
  }
  // normalize input for |bn_div_rem_words|: shift both |a| and |w| left so
  // |w|'s top bit is set; the shift is undone on the remainder at the end.
  j = BN_BITS2 - BN_num_bits_word(w);
  w <<= j;
  if (!BN_lshift(a, a, j)) {
    return (BN_ULONG) - 1;
  }
  // Schoolbook division, one word at a time from the top. |ret| carries the
  // running remainder into the next word's division.
  for (i = a->width - 1; i >= 0; i--) {
    BN_ULONG l = a->d[i];
    BN_ULONG d;
    BN_ULONG unused_rem;
    bn_div_rem_words(&d, &unused_rem, ret, l, w);
    ret = l - (d * w);
    a->d[i] = d;
  }
  bn_set_minimal_width(a);
  // Undo the normalization shift to recover the true remainder.
  ret >>= j;
  return ret;
}
// BN_mod_word returns |a| mod |w|, or (BN_ULONG)-1 on error (division by
// zero). |a| is not modified.
BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
#ifndef BN_CAN_DIVIDE_ULLONG
  BN_ULONG ret = 0;
#else
  // With double-width division available, the accumulator can hold a full
  // word shifted up by BN_BITS2.
  BN_ULLONG ret = 0;
#endif
  int i;
  if (w == 0) {
    return (BN_ULONG) -1;
  }
#ifndef BN_CAN_DIVIDE_ULLONG
  // If |w| is too long and we don't have |BN_ULLONG| division then we need to
  // fall back to using |BN_div_word|.
  if (w > ((BN_ULONG)1 << BN_BITS4)) {
    // |BN_div_word| modifies its argument, so work on a copy.
    BIGNUM *tmp = BN_dup(a);
    if (tmp == NULL) {
      return (BN_ULONG)-1;
    }
    ret = BN_div_word(tmp, w);
    BN_free(tmp);
    return ret;
  }
#endif
  // Fold each word of |a| into the running remainder, most-significant first.
  for (i = a->width - 1; i >= 0; i--) {
#ifndef BN_CAN_DIVIDE_ULLONG
    // Reduce in two half-word steps so intermediate values fit in a word.
    ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
    ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
#else
    ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w);
#endif
  }
  return (BN_ULONG)ret;
}
// BN_mod_pow2 sets |r| to |a| mod 2^|e|, preserving |a|'s sign. It returns one
// on success and zero on error.
int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  if (e == 0 || a->width == 0) {
    BN_zero(r);
    return 1;
  }
  size_t num_words = 1 + ((e - 1) / BN_BITS2);
  // If |a| definitely has less than |e| bits, just BN_copy.
  if ((size_t) a->width < num_words) {
    return BN_copy(r, a) != NULL;
  }
  // Otherwise, first make sure we have enough space in |r|.
  // Note that this will fail if num_words > INT_MAX.
  if (!bn_wexpand(r, num_words)) {
    return 0;
  }
  // Copy the content of |a| into |r|.
  OPENSSL_memcpy(r->d, a->d, num_words * sizeof(BN_ULONG));
  // If |e| isn't word-aligned, we have to mask off some of our bits.
  size_t top_word_exponent = e % (sizeof(BN_ULONG) * 8);
  if (top_word_exponent != 0) {
    r->d[num_words - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
  }
  // Fill in the remaining fields of |r|.
  r->neg = a->neg;
  r->width = (int) num_words;
  // Restore the minimal-width invariant after masking may have cleared the
  // top word.
  bn_set_minimal_width(r);
  return 1;
}
// BN_nnmod_pow2 sets |r| to |a| mod 2^|e|, canonicalized to be non-negative.
// It returns one on success and zero on error.
int BN_nnmod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) {
  if (!BN_mod_pow2(r, a, e)) {
    return 0;
  }
  // If the returned value was non-negative, we're done.
  if (BN_is_zero(r) || !r->neg) {
    return 1;
  }
  size_t num_words = 1 + (e - 1) / BN_BITS2;
  // Expand |r| to the size of our modulus.
  if (!bn_wexpand(r, num_words)) {
    return 0;
  }
  // Clear the upper words of |r|.
  OPENSSL_memset(&r->d[r->width], 0, (num_words - r->width) * BN_BYTES);
  // Set parameters of |r|.
  r->neg = 0;
  r->width = (int) num_words;
  // Now, invert every word. The idea here is that we want to compute 2^e-|x|,
  // which is actually equivalent to the twos-complement representation of |x|
  // in |e| bits, which is -x = ~x + 1.
  for (int i = 0; i < r->width; i++) {
    r->d[i] = ~r->d[i];
  }
  // If our exponent doesn't span the top word, we have to mask the rest.
  size_t top_word_exponent = e % BN_BITS2;
  if (top_word_exponent != 0) {
    r->d[r->width - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1;
  }
  // Keep the minimal-width invariant for |BIGNUM|.
  bn_set_minimal_width(r);
  // Finally, add one, for the reason described above.
  return BN_add(r, r, BN_value_one());
}

View File

@@ -0,0 +1,76 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
// The following functions use a Barrett reduction variant to avoid leaking the
// numerator. See http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
//
// We use 32-bit numerator and 16-bit divisor for simplicity. This allows
// computing |m| and |q| without architecture-specific code.
// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see
// reference). For proof of correctness in Coq, see
// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v
// Note the Coq version of |mod_u16| additionally includes the computation of
// |p| and |m| from |bn_mod_u16_consttime| below.
// mod_u16 returns |n| mod |d|, where |p| and |m| are the precomputed "magic
// numbers" for |d| (see the referenced Barrett-style algorithm). Constant-time
// in |n|. For proof of correctness in Coq, see the reference above the
// enclosing file's comment; do not modify without re-checking the proof.
static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) {
  // Compute floor(n/d) per steps 3 through 5.
  uint32_t q = ((uint64_t)m * n) >> 32;
  // Note there is a typo in the reference. We right-shift by one, not two.
  uint32_t t = ((n - q) >> 1) + q;
  t = t >> (p - 1);
  // Multiply and subtract to get the remainder.
  n -= d * t;
  declassify_assert(n < d);
  return n;
}
// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
// "magic numbers" for |d| (see reference).
// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the
// "magic numbers" for |d| (see reference).
static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d,
                                      uint32_t p, uint32_t m) {
  // Fold in |a| in two 16-bit halves, reducing mod |d| after each shift so
  // every intermediate fits in 32 bits.
  uint32_t acc = ((uint32_t)r << 16) | (a >> 16);
  acc = mod_u16(acc, d, p, m);
  acc = (acc << 16) | (a & 0xffff);
  return mod_u16(acc, d, p, m);
}
// bn_mod_u16_consttime returns |bn| mod |d|, treating |d| as public and the
// value of |bn| (but not its width) as secret. Returns zero when |d| <= 1.
uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) {
  if (d <= 1) {
    return 0;
  }
  // Compute the "magic numbers" for |d|. See steps 1 and 2.
  // This computes p = ceil(log_2(d)).
  uint32_t p = BN_num_bits_word(d - 1);
  // This operation is not constant-time, but |p| and |d| are public values.
  // Note that |p| is at most 16, so the computation fits in |uint64_t|.
  assert(p <= 16);
  uint32_t m = (uint32_t)(((UINT64_C(1) << (32 + p)) + d - 1) / d);
  // Fold in each word of |bn| in 32-bit pieces, most-significant first.
  uint16_t ret = 0;
  for (int i = bn->width - 1; i >= 0; i--) {
#if BN_BITS2 == 32
    ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m);
#elif BN_BITS2 == 64
    ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m);
    ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m);
#else
#error "Unknown BN_ULONG size"
#endif
  }
  return ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,293 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
// BN_mod_inverse_odd computes |out| = |a|^-1 mod |n| for odd |n| using the
// binary extended-Euclid algorithm. |a| must satisfy 0 <= |a| < |n|. On
// success it returns one. If no inverse exists it returns zero and sets
// |*out_no_inverse| to one. This function is not constant-time; callers with
// secret inputs must blind them first (see |BN_mod_inverse_blinded|).
int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                       const BIGNUM *n, BN_CTX *ctx) {
  *out_no_inverse = 0;
  if (!BN_is_odd(n)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return 0;
  }
  if (BN_is_negative(a) || BN_cmp(a, n) >= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  BIGNUM *A, *B, *X, *Y;
  int ret = 0;
  int sign;
  BN_CTX_start(ctx);
  A = BN_CTX_get(ctx);
  B = BN_CTX_get(ctx);
  X = BN_CTX_get(ctx);
  Y = BN_CTX_get(ctx);
  // |Y| was the last |BN_CTX_get|, so checking it covers the others too.
  if (Y == NULL) {
    goto err;
  }
  BIGNUM *R = out;
  BN_zero(Y);
  if (!BN_one(X) || BN_copy(B, a) == NULL || BN_copy(A, n) == NULL) {
    goto err;
  }
  A->neg = 0;
  sign = -1;
  // From  B = a mod |n|,  A = |n|  it follows that
  //
  //      0 <= B < A,
  //     -sign*X*a  ==  B   (mod |n|),
  //      sign*Y*a  ==  A   (mod |n|).
  // Binary inversion algorithm; requires odd modulus. This is faster than the
  // general algorithm if the modulus is sufficiently small (about 400 .. 500
  // bits on 32-bit systems, but much more on 64-bit systems)
  int shift;
  while (!BN_is_zero(B)) {
    //      0 < B < |n|,
    //      0 < A <= |n|,
    // (1) -sign*X*a  ==  B   (mod |n|),
    // (2)  sign*Y*a  ==  A   (mod |n|)
    // Now divide B by the maximum possible power of two in the integers,
    // and divide X by the same value mod |n|.
    // When we're done, (1) still holds.
    shift = 0;
    while (!BN_is_bit_set(B, shift)) {
      // note that 0 < B
      shift++;
      if (BN_is_odd(X)) {
        // Make X even before halving it; adding |n| does not change X mod
        // |n|.
        if (!BN_uadd(X, X, n)) {
          goto err;
        }
      }
      // now X is even, so we can easily divide it by two
      if (!BN_rshift1(X, X)) {
        goto err;
      }
    }
    if (shift > 0) {
      if (!BN_rshift(B, B, shift)) {
        goto err;
      }
    }
    // Same for A and Y. Afterwards, (2) still holds.
    shift = 0;
    while (!BN_is_bit_set(A, shift)) {
      // note that 0 < A
      shift++;
      if (BN_is_odd(Y)) {
        if (!BN_uadd(Y, Y, n)) {
          goto err;
        }
      }
      // now Y is even
      if (!BN_rshift1(Y, Y)) {
        goto err;
      }
    }
    if (shift > 0) {
      if (!BN_rshift(A, A, shift)) {
        goto err;
      }
    }
    // We still have (1) and (2).
    // Both  A  and  B  are odd.
    // The following computations ensure that
    //
    //     0 <= B < |n|,
    //      0 < A < |n|,
    // (1) -sign*X*a  ==  B   (mod |n|),
    // (2)  sign*Y*a  ==  A   (mod |n|),
    //
    // and that either  A  or  B  is even in the next iteration.
    if (BN_ucmp(B, A) >= 0) {
      // -sign*(X + Y)*a == B - A  (mod |n|)
      if (!BN_uadd(X, X, Y)) {
        goto err;
      }
      // NB: we could use BN_mod_add_quick(X, X, Y, n), but that
      // actually makes the algorithm slower
      if (!BN_usub(B, B, A)) {
        goto err;
      }
    } else {
      //  sign*(X + Y)*a == A - B  (mod |n|)
      if (!BN_uadd(Y, Y, X)) {
        goto err;
      }
      // as above, BN_mod_add_quick(Y, Y, X, n) would slow things down
      if (!BN_usub(A, A, B)) {
        goto err;
      }
    }
  }
  // gcd(a, n) != 1, so no inverse exists.
  if (!BN_is_one(A)) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    goto err;
  }
  // The while loop (Euclid's algorithm) ends when
  //      A == gcd(a,n);
  // we have
  //       sign*Y*a  ==  A  (mod |n|),
  // where  Y  is non-negative.
  if (sign < 0) {
    if (!BN_sub(Y, n, Y)) {
      goto err;
    }
  }
  // Now  Y*a  ==  A  (mod |n|).
  // Y*a == 1  (mod |n|)
  if (Y->neg || BN_ucmp(Y, n) >= 0) {
    if (!BN_nnmod(Y, Y, n, ctx)) {
      goto err;
    }
  }
  if (!BN_copy(R, Y)) {
    goto err;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_mod_inverse computes |a|^-1 mod |n| and returns it, allocating a fresh
// |BIGNUM| when |out| is NULL. It returns NULL on error or when no inverse
// exists. This function is not constant-time.
BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
                       BN_CTX *ctx) {
  BIGNUM *new_out = NULL;
  if (out == NULL) {
    new_out = BN_new();
    if (new_out == NULL) {
      return NULL;
    }
    out = new_out;
  }
  int ok = 0;
  BIGNUM *a_reduced = NULL;
  // The core routines require 0 <= |a| < |n|; reduce a copy if needed.
  if (a->neg || BN_ucmp(a, n) >= 0) {
    a_reduced = BN_dup(a);
    if (a_reduced == NULL) {
      goto err;
    }
    if (!BN_nnmod(a_reduced, a_reduced, n, ctx)) {
      goto err;
    }
    a = a_reduced;
  }
  int no_inverse;
  // The fast binary algorithm only handles odd moduli; fall back to the
  // general routine for even |n|.
  if (!BN_is_odd(n)) {
    if (!bn_mod_inverse_consttime(out, &no_inverse, a, n, ctx)) {
      goto err;
    }
  } else if (!BN_mod_inverse_odd(out, &no_inverse, a, n, ctx)) {
    goto err;
  }
  ok = 1;
err:
  if (!ok) {
    // Only free |new_out| if we allocated it; a caller-supplied |out| is the
    // caller's to free.
    BN_free(new_out);
    out = NULL;
  }
  BN_free(a_reduced);
  return out;
}
// BN_mod_inverse_blinded computes |out| = |a|^-1 mod |mont->N| for a secret
// |a|, hiding |a| from the leaky inversion routine behind a random blinding
// factor. |a| must satisfy 0 <= |a| < |mont->N|. On failure with no inverse,
// |*out_no_inverse| is set to one.
int BN_mod_inverse_blinded(BIGNUM *out, int *out_no_inverse, const BIGNUM *a,
                           const BN_MONT_CTX *mont, BN_CTX *ctx) {
  *out_no_inverse = 0;
  // |a| is secret, but it is required to be in range, so these comparisons may
  // be leaked.
  if (BN_is_negative(a) ||
      constant_time_declassify_int(BN_cmp(a, &mont->N) >= 0)) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  int ret = 0;
  BIGNUM blinding_factor;
  BN_init(&blinding_factor);
  // |BN_mod_inverse_odd| is leaky, so generate a secret blinding factor and
  // blind |a|. This works because (ar)^-1 * r = a^-1, supposing r is
  // invertible. If r is not invertible, this function will fail. However, we
  // only use this in RSA, where stumbling on an uninvertible element means
  // stumbling on the key's factorization. That is, if this function fails, the
  // RSA key was not actually a product of two large primes.
  //
  // TODO(crbug.com/boringssl/677): When the PRNG output is marked secret by
  // default, the explicit |bn_secret| call can be removed.
  if (!BN_rand_range_ex(&blinding_factor, 1, &mont->N)) {
    goto err;
  }
  bn_secret(&blinding_factor);
  if (!BN_mod_mul_montgomery(out, &blinding_factor, a, mont, ctx)) {
    goto err;
  }
  // Once blinded, |out| is no longer secret, so it may be passed to a leaky
  // mod inverse function. Note |blinding_factor| is secret, so |out| will be
  // secret again after multiplying.
  bn_declassify(out);
  if (!BN_mod_inverse_odd(out, out_no_inverse, out, &mont->N, ctx) ||
      !BN_mod_mul_montgomery(out, &blinding_factor, out, mont, ctx)) {
    goto err;
  }
  ret = 1;
err:
  BN_free(&blinding_factor);
  return ret;
}
// bn_mod_inverse_prime computes |out| = |a|^-1 mod |p| for prime |p|, via
// Fermat's little theorem: a^(p-2) = a^-1 (mod p). Not constant-time in |a|.
int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
                         BN_CTX *ctx, const BN_MONT_CTX *mont_p) {
  int ok = 0;
  BN_CTX_start(ctx);
  BIGNUM *exponent = BN_CTX_get(ctx);
  if (exponent != NULL && BN_copy(exponent, p) && BN_sub_word(exponent, 2)) {
    ok = BN_mod_exp_mont(out, a, exponent, p, ctx, mont_p);
  }
  BN_CTX_end(ctx);
  return ok;
}
// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| (computes
// |a|^-1 mod prime |p| as a^(p-2) mod p) but uses the constant-time
// exponentiation, |BN_mod_exp_mont_consttime|. Returns one on success and
// zero on error.
int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
                                BN_CTX *ctx, const BN_MONT_CTX *mont_p) {
  int ret = 0;
  BN_CTX_start(ctx);
  // Form the exponent p - 2, then exponentiate in constant time.
  BIGNUM *exponent = BN_CTX_get(ctx);
  if (exponent != NULL &&
      BN_copy(exponent, p) != NULL &&
      BN_sub_word(exponent, 2) &&
      BN_mod_exp_mont_consttime(out, a, exponent, p, ctx, mont_p)) {
    ret = 1;
  }
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,320 @@
// Copyright (c) 2018, Google Inc.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include <openssl/err.h>
#include "internal.h"
// word_is_odd_mask returns all-ones if |a| is odd and all-zeros otherwise,
// computed without branching.
static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); }
// maybe_rshift1_words shifts the |num|-word value |a| right by one bit if
// |mask| is all-ones, and leaves it unchanged if |mask| is all-zeros. |tmp| is
// |num| words of scratch. Both candidates are always computed and one is
// selected, so the operation is constant-time in the value of |mask|.
static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp,
                                size_t num) {
  bn_rshift1_words(tmp, a, num);
  bn_select_words(a, mask, tmp, a, num);
}
// maybe_rshift1_words_carry behaves like |maybe_rshift1_words| but, when the
// shift is applied, additionally shifts in |carry| (zero or one) as the new
// most significant bit.
static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry,
                                      BN_ULONG mask, BN_ULONG *tmp,
                                      size_t num) {
  maybe_rshift1_words(a, mask, tmp, num);
  if (num != 0) {
    // Zero the carry when the shift was skipped, so the top word is untouched.
    carry &= mask;
    a[num - 1] |= carry << (BN_BITS2-1);
  }
}
// maybe_add_words adds the |num|-word value |b| into |a| if |mask| is
// all-ones, or leaves |a| unchanged if |mask| is all-zeros, and returns the
// (masked) carry out of the addition. |tmp| is |num| words of scratch. The sum
// is always computed, then conditionally selected, so this is constant-time in
// |mask|.
static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b,
                                BN_ULONG *tmp, size_t num) {
  BN_ULONG carry = bn_add_words(tmp, a, b, num);
  bn_select_words(a, mask, tmp, a, num);
  return carry & mask;
}
// bn_gcd_consttime computes the GCD of |x| and |y| in the factored form
// gcd(x, y) = |r| * 2^|*out_shift|, in time independent of the values (though
// not the public widths) of |x| and |y|. It returns one on success and zero on
// error.
static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x,
                            const BIGNUM *y, BN_CTX *ctx) {
  size_t width = x->width > y->width ? x->width : y->width;
  if (width == 0) {
    *out_shift = 0;
    BN_zero(r);
    return 1;
  }
  // This is a constant-time implementation of Stein's algorithm (binary GCD).
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *u = BN_CTX_get(ctx);
  BIGNUM *v = BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (u == NULL || v == NULL || tmp == NULL ||
      !BN_copy(u, x) ||
      !BN_copy(v, y) ||
      !bn_resize_words(u, width) ||
      !bn_resize_words(v, width) ||
      !bn_resize_words(tmp, width)) {
    goto err;
  }
  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
  // most the combined bit width of inputs for at least one value to be zero.
  unsigned x_bits = x->width * BN_BITS2, y_bits = y->width * BN_BITS2;
  unsigned num_iters = x_bits + y_bits;
  if (num_iters < x_bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    goto err;
  }
  unsigned shift = 0;
  for (unsigned i = 0; i < num_iters; i++) {
    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
    // If both |u| and |v| are odd, subtract the smaller from the larger.
    // The borrow from |u| - |v|, stretched to a full mask, tells us whether
    // u < v; the selections below apply whichever difference is non-negative.
    BN_ULONG u_less_than_v =
        (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width);
    bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width);
    bn_sub_words(tmp->d, v->d, u->d, width);
    bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width);
    // At least one of |u| and |v| is now even.
    BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]);
    BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]);
    declassify_assert(!(u_is_odd & v_is_odd));
    // If both are even, the final GCD gains a factor of two.
    shift += 1 & (~u_is_odd & ~v_is_odd);
    // Halve any which are even.
    maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width);
    maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width);
  }
  // One of |u| or |v| is zero at this point. The algorithm usually makes |u|
  // zero, unless |y| was already zero on input. Fix this by combining the
  // values.
  declassify_assert(BN_is_zero(u) | BN_is_zero(v));
  for (size_t i = 0; i < width; i++) {
    v->d[i] |= u->d[i];
  }
  *out_shift = shift;
  ret = bn_set_words(r, v->d, width);
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_gcd sets |r| to gcd(x, y). The constant-time core returns the odd part of
// the gcd and the exponent of two separately; recombine them here.
int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) {
  unsigned shift;
  if (!bn_gcd_consttime(r, &shift, x, y, ctx)) {
    return 0;
  }
  return BN_lshift(r, r, shift);
}
// bn_is_relatively_prime sets |*out_relatively_prime| to one if
// gcd(x, y) == 1 and to zero otherwise. It returns one on success and zero on
// error. The comparison accumulates into a mask rather than branching on the
// (secret) gcd words.
int bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x,
                           const BIGNUM *y, BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  unsigned shift;
  BIGNUM *gcd = BN_CTX_get(ctx);
  if (gcd == NULL ||
      !bn_gcd_consttime(gcd, &shift, x, y, ctx)) {
    goto err;
  }
  // Check that 2^|shift| * |gcd| is one.
  if (gcd->width == 0) {
    *out_relatively_prime = 0;
  } else {
    // The product is one iff |shift| is zero, the low word of |gcd| is one,
    // and every higher word is zero. Fold any deviation into |mask|.
    BN_ULONG mask = shift | (gcd->d[0] ^ 1);
    for (int i = 1; i < gcd->width; i++) {
      mask |= gcd->d[i];
    }
    *out_relatively_prime = mask == 0;
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_lcm_consttime sets |r| to lcm(a, b) = a*b / gcd(a, b), in constant time.
// It returns one on success and zero on error.
int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  BN_CTX_start(ctx);
  unsigned shift;
  BIGNUM *gcd = BN_CTX_get(ctx);
  int ret = gcd != NULL && //
            bn_mul_consttime(r, a, b, ctx) &&
            bn_gcd_consttime(gcd, &shift, a, b, ctx) &&
            // |gcd| has a secret bit width.
            bn_div_consttime(r, NULL, r, gcd, /*divisor_min_bits=*/0, ctx) &&
            // The full gcd is |gcd| * 2^|shift|, so divide out the power of
            // two separately, with the shift amount treated as secret.
            bn_rshift_secret_shift(r, r, shift, ctx);
  BN_CTX_end(ctx);
  return ret;
}
// bn_mod_inverse_consttime sets |r| to |a|^-1 (mod |n|) in constant time. |a|
// must already be reduced mod |n|, and at least one of |a| and |n| must be
// odd. It returns one on success. On failure it returns zero, additionally
// setting |*out_no_inverse| when the failure was because no inverse exists.
int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a,
                             const BIGNUM *n, BN_CTX *ctx) {
  *out_no_inverse = 0;
  if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED);
    return 0;
  }
  if (BN_is_zero(a)) {
    if (BN_is_one(n)) {
      BN_zero(r);
      return 1;
    }
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    return 0;
  }
  // This is a constant-time implementation of the extended binary GCD
  // algorithm. It is adapted from the Handbook of Applied Cryptography, section
  // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid
  // negative numbers.
  //
  // For more details and proof of correctness, see
  // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step|
  // and |mod_inverse_consttime| for the algorithm in Gallina and see
  // |mod_inverse_consttime_spec| for the correctness result.
  if (!BN_is_odd(a) && !BN_is_odd(n)) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    return 0;
  }
  // This function exists to compute the RSA private exponent, where |a| is one
  // word. We'll thus use |a_width| when available.
  size_t n_width = n->width, a_width = a->width;
  if (a_width > n_width) {
    a_width = n_width;
  }
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *u = BN_CTX_get(ctx);
  BIGNUM *v = BN_CTX_get(ctx);
  BIGNUM *A = BN_CTX_get(ctx);
  BIGNUM *B = BN_CTX_get(ctx);
  BIGNUM *C = BN_CTX_get(ctx);
  BIGNUM *D = BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  BIGNUM *tmp2 = BN_CTX_get(ctx);
  if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL ||
      D == NULL || tmp == NULL || tmp2 == NULL ||
      !BN_copy(u, a) ||
      !BN_copy(v, n) ||
      !BN_one(A) ||
      !BN_one(D) ||
      // For convenience, size |u| and |v| equivalently.
      !bn_resize_words(u, n_width) ||
      !bn_resize_words(v, n_width) ||
      // |A| and |C| are bounded by |m|.
      !bn_resize_words(A, n_width) ||
      !bn_resize_words(C, n_width) ||
      // |B| and |D| are bounded by |a|.
      !bn_resize_words(B, a_width) ||
      !bn_resize_words(D, a_width) ||
      // |tmp| and |tmp2| may be used at either size.
      !bn_resize_words(tmp, n_width) ||
      !bn_resize_words(tmp2, n_width)) {
    goto err;
  }
  // Each loop iteration halves at least one of |u| and |v|. Thus we need at
  // most the combined bit width of inputs for at least one value to be zero.
  // |a_bits| and |n_bits| cannot overflow because |bn_wexpand| ensures bit
  // counts fit in even |int|.
  size_t a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2;
  size_t num_iters = a_bits + n_bits;
  if (num_iters < a_bits) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    goto err;
  }
  // Before and after each loop iteration, the following hold:
  //
  //   u = A*a - B*n
  //   v = D*n - C*a
  //   0 < u <= a
  //   0 <= v <= n
  //   0 <= A < n
  //   0 <= B <= a
  //   0 <= C < n
  //   0 <= D <= a
  //
  // After each loop iteration, u and v only get smaller, and at least one of
  // them shrinks by at least a factor of two.
  for (size_t i = 0; i < num_iters; i++) {
    BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]);
    // If both |u| and |v| are odd, subtract the smaller from the larger.
    BN_ULONG v_less_than_u =
        (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width);
    bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width);
    bn_sub_words(tmp->d, u->d, v->d, n_width);
    bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width);
    // If we updated one of the values, update the corresponding coefficient.
    // tmp = A + C (mod n): add, then conditionally subtract |n| based on the
    // combined carry/borrow, without branching on secret data.
    BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width);
    carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width);
    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width);
    bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width);
    bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width);
    // Similarly tmp = B + D (mod a). The same |carry| selects the reduced
    // value: by the loop invariants, A + C exceeds n exactly when B + D
    // exceeds a, so the two sums are reduced together.
    bn_add_words(tmp->d, B->d, D->d, a_width);
    bn_sub_words(tmp2->d, tmp->d, a->d, a_width);
    bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width);
    bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width);
    bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width);
    // Our loop invariants hold at this point. Additionally, exactly one of |u|
    // and |v| is now even.
    BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]);
    BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]);
    declassify_assert(u_is_even != v_is_even);
    // Halve the even one and adjust the corresponding coefficient.
    maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width);
    BN_ULONG A_or_B_is_odd =
        word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]);
    BN_ULONG A_carry =
        maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width);
    BN_ULONG B_carry =
        maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width);
    maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width);
    maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width);
    maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width);
    BN_ULONG C_or_D_is_odd =
        word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]);
    BN_ULONG C_carry =
        maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width);
    BN_ULONG D_carry =
        maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width);
    maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width);
    maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width);
  }
  declassify_assert(BN_is_zero(v));
  // While the inputs and output are secret, this function considers whether the
  // input was invertible to be public. It is used as part of RSA key
  // generation, where inputs are chosen to already be invertible.
  if (constant_time_declassify_int(!BN_is_one(u))) {
    *out_no_inverse = 1;
    OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE);
    goto err;
  }
  // u == 1 and, by the invariant, u = A*a - B*n, so A*a = 1 (mod n); |A| is
  // the inverse.
  ret = BN_copy(r, A) != NULL;
err:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,571 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
// See asm/bn-586.pl.
#define BN_ADD_ASM
#define BN_MUL_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
(defined(__GNUC__) || defined(__clang__))
// See asm/x86_64-gcc.c
#define BN_ADD_ASM
#define BN_MUL_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
// See asm/bn-armv8.pl.
#define BN_ADD_ASM
#endif
#if !defined(BN_MUL_ASM)
#ifdef BN_ULLONG
#define mul_add(r, a, w, c) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(w) * (a) + (r) + (c); \
(r) = Lw(t); \
(c) = Hw(t); \
} while (0)
#define mul(r, a, w, c) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(w) * (a) + (c); \
(r) = Lw(t); \
(c) = Hw(t); \
} while (0)
#define sqr(r0, r1, a) \
do { \
BN_ULLONG t; \
t = (BN_ULLONG)(a) * (a); \
(r0) = Lw(t); \
(r1) = Hw(t); \
} while (0)
#else
#define mul_add(r, a, w, c) \
do { \
BN_ULONG high, low, ret, tmp = (a); \
ret = (r); \
BN_UMULT_LOHI(low, high, w, tmp); \
ret += (c); \
(c) = (ret < (c)) ? 1 : 0; \
(c) += high; \
ret += low; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
} while (0)
#define mul(r, a, w, c) \
do { \
BN_ULONG high, low, ret, ta = (a); \
BN_UMULT_LOHI(low, high, w, ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret < low) ? 1 : 0; \
(r) = ret; \
} while (0)
#define sqr(r0, r1, a) \
do { \
BN_ULONG tmp = (a); \
BN_UMULT_LOHI(r0, r1, tmp, tmp); \
} while (0)
#endif // !BN_ULLONG
// bn_mul_add_words computes rp[i] += ap[i] * w for |num| words and returns the
// final carry word. The main loop is unrolled four-fold; |mul_add| threads the
// running carry through |carry|.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  BN_ULONG carry = 0;
  if (num == 0) {
    return carry;
  }
  // Process four words per iteration while at least four remain.
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul_add(rp[0], ap[0], w, carry);
    mul_add(rp[1], ap[1], w, carry);
    mul_add(rp[2], ap[2], w, carry);
    mul_add(rp[3], ap[3], w, carry);
  }
  // Mop up the remaining zero to three words.
  for (; num > 0; num--, ap++, rp++) {
    mul_add(rp[0], ap[0], w, carry);
  }
  return carry;
}
// bn_mul_words computes rp[i] = ap[i] * w for |num| words and returns the
// final carry word. The main loop is unrolled four-fold; |mul| threads the
// running carry through |carry|.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  BN_ULONG carry = 0;
  if (num == 0) {
    return carry;
  }
  // Process four words per iteration while at least four remain.
  for (; num >= 4; num -= 4, ap += 4, rp += 4) {
    mul(rp[0], ap[0], w, carry);
    mul(rp[1], ap[1], w, carry);
    mul(rp[2], ap[2], w, carry);
    mul(rp[3], ap[3], w, carry);
  }
  // Mop up the remaining zero to three words.
  for (; num > 0; num--, ap++, rp++) {
    mul(rp[0], ap[0], w, carry);
  }
  return carry;
}
// bn_sqr_words writes the double-width square of each word of |a| into |r|:
// (r[2*i+1], r[2*i]) = a[i]^2 for i in [0, n). |r| must hold 2*n words. The
// main loop is unrolled four-fold.
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  if (n == 0) {
    return;
  }
  // Square four input words (eight output words) per iteration.
  for (; n >= 4; n -= 4, a += 4, r += 8) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
  }
  // Mop up the remaining zero to three words.
  for (; n > 0; n--, a++, r += 2) {
    sqr(r[0], r[1], a[0]);
  }
}
// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0)
// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
#ifdef BN_ULLONG
// Keep in mind that additions to multiplication result can not overflow,
// because its high half cannot be all-ones.
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a) * (b); \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += (hi); \
(c2) += (c1) < hi; \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a) * (b); \
BN_ULLONG tt = t + (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(tt); \
hi = (BN_ULONG)Hw(tt); \
(c1) += hi; \
(c2) += (c1) < hi; \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += hi; \
(c2) += (c1) < hi; \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG hi; \
BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \
t += (c0); /* no carry */ \
(c0) = (BN_ULONG)Lw(t); \
hi = (BN_ULONG)Hw(t); \
(c1) += hi; \
(c2) += (c1) < hi; \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#else
// Keep in mind that additions to hi can not overflow, because the high word of
// a multiplication result cannot be all-ones.
#define mul_add_c(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo, hi, ta, tb); \
(c0) += lo; \
hi += ((c0) < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define mul_add_c2(a, b, c0, c1, c2) \
do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo, hi, ta, tb); \
(c0) += lo; \
tt = hi + (((c0) < lo) ? 1 : 0); \
(c1) += tt; \
(c2) += ((c1) < tt) ? 1 : 0; \
(c0) += lo; \
hi += (c0 < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c(a, i, c0, c1, c2) \
do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo, hi, ta, ta); \
(c0) += lo; \
hi += (c0 < lo) ? 1 : 0; \
(c1) += hi; \
(c2) += ((c1) < hi) ? 1 : 0; \
} while (0)
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif // !BN_ULLONG
// bn_mul_comba8 computes the full 16-word product |r| = |a| * |b| for 8-word
// inputs, column by column (the comba method). |c1|, |c2| and |c3| act as a
// rotating three-word accumulator: each result column sums all a[i]*b[j] with
// i + j equal to the column index, then the accumulator rotates as the column
// is emitted.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_mul_comba4 computes the full 8-word product |r| = |a| * |b| for 4-word
// inputs using the same column-wise (comba) scheme as |bn_mul_comba8|, with
// |c1|, |c2|, |c3| as the rotating three-word column accumulator.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
// bn_sqr_comba8 computes the 16-word square |r| = |a|^2 for an 8-word input,
// column by column. |sqr_add_c| accumulates the diagonal a[i]*a[i] terms and
// |sqr_add_c2| the doubled a[i]*a[j] (i > j) cross terms into the rotating
// three-word accumulator |c1|, |c2|, |c3|.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
// bn_sqr_comba4 computes the 8-word square |r| = |a|^2 for a 4-word input,
// using the same column-wise scheme as |bn_sqr_comba8|: |sqr_add_c| for the
// diagonal terms and |sqr_add_c2| for the doubled cross terms.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;
  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
#undef mul_add
#undef mul
#undef sqr
#undef mul_add_c
#undef mul_add_c2
#undef sqr_add_c
#undef sqr_add_c2
#endif // !BN_MUL_ASM
#if !defined(BN_ADD_ASM)
// bn_add_words adds the |n|-word arrays |a| and |b| into |r| and returns the
// final carry bit. The main loop handles four words per iteration;
// |CRYPTO_addc_w| propagates the carry through |carry| without branching on
// the data.
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      size_t n) {
  if (n == 0) {
    return 0;
  }
  BN_ULONG carry = 0;
  for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
    r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry);
    r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry);
    r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry);
  }
  // Handle the remaining zero to three words.
  for (; n > 0; n--, a++, b++, r++) {
    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
  }
  return carry;
}
// bn_sub_words subtracts the |n|-word array |b| from |a| into |r| and returns
// the final borrow bit. The main loop handles four words per iteration;
// |CRYPTO_subc_w| propagates the borrow through |borrow| without branching on
// the data.
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      size_t n) {
  if (n == 0) {
    return (BN_ULONG)0;
  }
  BN_ULONG borrow = 0;
  for (; n >= 4; n -= 4, a += 4, b += 4, r += 4) {
    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
    r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow);
    r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow);
    r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow);
  }
  // Handle the remaining zero to three words.
  for (; n > 0; n--, a++, b++, r++) {
    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
  }
  return borrow;
}
#endif // !BN_ADD_ASM

View File

@@ -0,0 +1,736 @@
// Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
// Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
//
// The binary polynomial arithmetic software is originally written by
// Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
// Laboratories.
//
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_BN_INTERNAL_H
#define OPENSSL_HEADER_BN_INTERNAL_H
#include <openssl/bn.h>
#include <openssl/rand.h>
#if defined(OPENSSL_X86_64) && defined(_MSC_VER)
OPENSSL_MSVC_PRAGMA(warning(push, 3))
#include <intrin.h>
OPENSSL_MSVC_PRAGMA(warning(pop))
#pragma intrinsic(__umulh, _umul128)
#endif
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(OPENSSL_64_BIT)
#if defined(BORINGSSL_HAS_UINT128)
// MSVC doesn't support two-word integers on 64-bit.
#define BN_ULLONG uint128_t
#if defined(BORINGSSL_CAN_DIVIDE_UINT128)
#define BN_CAN_DIVIDE_ULLONG
#endif
#endif
#define BN_BITS2 64
#define BN_BITS2_LG 6
#define BN_BYTES 8
#define BN_BITS4 32
#define BN_MASK2 (0xffffffffffffffffUL)
#define BN_MASK2l (0xffffffffUL)
#define BN_MASK2h (0xffffffff00000000UL)
#define BN_MASK2h1 (0xffffffff80000000UL)
#define BN_MONT_CTX_N0_LIMBS 1
#define BN_DEC_CONV (10000000000000000000UL)
#define BN_DEC_NUM 19
#define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
#elif defined(OPENSSL_32_BIT)
#define BN_ULLONG uint64_t
#define BN_CAN_DIVIDE_ULLONG
#define BN_BITS2 32
#define BN_BITS2_LG 5
#define BN_BYTES 4
#define BN_BITS4 16
#define BN_MASK2 (0xffffffffUL)
#define BN_MASK2l (0xffffUL)
#define BN_MASK2h1 (0xffff8000UL)
#define BN_MASK2h (0xffff0000UL)
// On some 32-bit platforms, Montgomery multiplication is done using 64-bit
// arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
// needs to be two words long. Only certain 32-bit platforms actually make use
// of n0[1] and shorter R value would suffice for the others. However,
// currently only the assembly files know which is which.
#define BN_MONT_CTX_N0_LIMBS 2
#define BN_DEC_CONV (1000000000UL)
#define BN_DEC_NUM 9
#define TOBN(hi, lo) (lo), (hi)
#else
#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
#endif
#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
#define BN_CAN_USE_INLINE_ASM
#endif
// MOD_EXP_CTIME_ALIGN is the alignment needed for |BN_mod_exp_mont_consttime|'s
// tables.
//
// TODO(davidben): Historically, this alignment came from cache line
// assumptions, which we've since removed. Is 64-byte alignment still necessary
// or ideal? The true alignment requirement seems to now be 32 bytes, coming
// from RSAZ's use of VMOVDQA to a YMM register. Non-x86_64 has even fewer
// requirements.
#define MOD_EXP_CTIME_ALIGN 64
// MOD_EXP_CTIME_STORAGE_LEN is the number of |BN_ULONG|s needed for the
// |BN_mod_exp_mont_consttime| stack-allocated storage buffer. The buffer is
// just the right size for the RSAZ and is about ~1KB larger than what's
// necessary (4480 bytes) for 1024-bit inputs.
#define MOD_EXP_CTIME_STORAGE_LEN \
(((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG))
#define STATIC_BIGNUM(x) \
{ \
(BN_ULONG *)(x), sizeof(x) / sizeof(BN_ULONG), \
sizeof(x) / sizeof(BN_ULONG), 0, BN_FLG_STATIC_DATA \
}
#if defined(BN_ULLONG)
#define Lw(t) ((BN_ULONG)(t))
#define Hw(t) ((BN_ULONG)((t) >> BN_BITS2))
#endif
#define BN_GENCB_UNSET 0
#define BN_GENCB_NEW_STYLE 1
#define BN_GENCB_OLD_STYLE 2
// bn_minimal_width returns the minimal number of words needed to represent
// |bn|.
int bn_minimal_width(const BIGNUM *bn);
// bn_set_minimal_width sets |bn->width| to |bn_minimal_width(bn)|. If |bn| is
// zero, |bn->neg| is set to zero.
void bn_set_minimal_width(BIGNUM *bn);
// bn_wexpand ensures that |bn| has at least |words| words of space without
// altering its value. It returns one on success or zero on allocation
// failure.
int bn_wexpand(BIGNUM *bn, size_t words);
// bn_expand acts the same as |bn_wexpand|, but takes a number of bits rather
// than a number of words.
int bn_expand(BIGNUM *bn, size_t bits);
// bn_resize_words adjusts |bn->width| to be |words|. It returns one on success
// and zero on allocation error or if |bn|'s value is too large.
OPENSSL_EXPORT int bn_resize_words(BIGNUM *bn, size_t words);
// bn_select_words sets |r| to |a| if |mask| is all ones or |b| if |mask| is
// all zeros.
void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a,
const BN_ULONG *b, size_t num);
// bn_set_words sets |bn| to the value encoded in the |num| words in |words|,
// least significant word first.
int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
// bn_set_static_words acts like |bn_set_words|, but doesn't copy the data. A
// flag is set on |bn| so that |BN_free| won't attempt to free the data.
//
// The |STATIC_BIGNUM| macro is probably a better solution for this outside of
// the FIPS module. Inside of the FIPS module that macro generates rel.ro data,
// which doesn't work with FIPS requirements.
void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
// bn_fits_in_words returns one if |bn| may be represented in |num| words, plus
// a sign bit, and zero otherwise.
int bn_fits_in_words(const BIGNUM *bn, size_t num);
// bn_copy_words copies the value of |bn| to |out| and returns one if the value
// is representable in |num| words. Otherwise, it returns zero.
int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn);
// bn_assert_fits_in_bytes asserts that |bn| fits in |num| bytes. This is a
// no-op in release builds, but triggers an assert in debug builds, and
// declassifies all bytes which are therefore known to be zero in constant-time
// validation.
OPENSSL_EXPORT void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num);
// bn_secret marks |bn|'s contents, but not its width or sign, as secret. See
// |CONSTTIME_SECRET| for details.
OPENSSL_INLINE void bn_secret(BIGNUM *bn) {
  // Tag exactly the |width| words in use; |width| and |neg| stay public.
  CONSTTIME_SECRET(bn->d, bn->width * sizeof(BN_ULONG));
}
// bn_declassify marks |bn|'s value as public. See |CONSTTIME_DECLASSIFY| for
// details.
OPENSSL_INLINE void bn_declassify(BIGNUM *bn) {
  // Covers the |width| words currently representing the value.
  CONSTTIME_DECLASSIFY(bn->d, bn->width * sizeof(BN_ULONG));
}
// bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places
// the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
// the carry word of the operation. |ap| and |rp| may be equal but otherwise may
// not alias.
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
BN_ULONG w);
// bn_mul_words multiples |ap| by |w| and places the result in |rp|. |ap| and
// |rp| must both be |num| words long. It returns the carry word of the
// operation. |ap| and |rp| may be equal but otherwise may not alias.
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, BN_ULONG w);
// bn_sqr_words sets |rp[2*i]| and |rp[2*i+1]| to |ap[i]|'s square, for all |i|
// up to |num|. |ap| is an array of |num| words and |rp| an array of |2*num|
// words. |ap| and |rp| may not alias.
//
// This gives the contribution of the |ap[i]*ap[i]| terms when squaring |ap|.
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num);
// bn_add_words adds |ap| to |bp| and places the result in |rp|, each of which
// are |num| words long. It returns the carry bit, which is one if the operation
// overflowed and zero otherwise. Any pair of |ap|, |bp|, and |rp| may be equal
// to each other but otherwise may not alias.
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_sub_words subtracts |bp| from |ap| and places the result in |rp|. It
// returns the borrow bit, which is one if the computation underflowed and zero
// otherwise. Any pair of |ap|, |bp|, and |rp| may be equal to each other but
// otherwise may not alias.
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
size_t num);
// bn_mul_comba4 sets |r| to the product of |a| and |b|.
void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]);
// bn_mul_comba8 sets |r| to the product of |a| and |b|.
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]);
// bn_sqr_comba8 sets |r| to |a|^2.
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]);
// bn_sqr_comba4 sets |r| to |a|^2.
void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);
// bn_less_than_words returns one if |a| < |b| and zero otherwise, where |a|
// and |b| both are |len| words long. It runs in constant time.
int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len);
// bn_in_range_words returns one if |min_inclusive| <= |a| < |max_exclusive|,
// where |a| and |max_exclusive| both are |len| words long. |a| and
// |max_exclusive| are treated as secret.
int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len);
// bn_rand_range_words sets |out| to a uniformly distributed random number from
// |min_inclusive| to |max_exclusive|. Both |out| and |max_exclusive| are |len|
// words long.
//
// This function runs in time independent of the result, but |min_inclusive| and
// |max_exclusive| are public data. (Information about the range is unavoidably
// leaked by how many iterations it took to select a number.)
int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,
const BN_ULONG *max_exclusive, size_t len,
const uint8_t additional_data[RAND_PRED_RESISTANCE_LEN]);
// bn_rand_secret_range behaves like |BN_rand_range_ex|, but treats
// |max_exclusive| as secret. Because of this constraint, the distribution of
// values returned is more complex.
//
// Rather than repeatedly generating values until one is in range, which would
// leak information, it generates one value. If the value is in range, it sets
// |*out_is_uniform| to one. Otherwise, it sets |*out_is_uniform| to zero,
// fixing up the value to force it in range.
//
// The subset of calls to |bn_rand_secret_range| which set |*out_is_uniform| to
// one are uniformly distributed in the target range. Calls overall are not.
// This function is intended for use in situations where the extra values are
// still usable and where the number of iterations needed to reach the target
// number of uniform outputs may be blinded for negligible probabilities of
// timing leaks.
//
// Although this function treats |max_exclusive| as secret, it treats the number
// of bits in |max_exclusive| as public.
int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive,
const BIGNUM *max_exclusive);
// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|
// used with Montgomery reduction. Ideally this limit would be applied to all
// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB
// values for other operations.
#define BN_MONTGOMERY_MAX_WORDS (8 * 1024 / sizeof(BN_ULONG))
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
#define OPENSSL_BN_ASM_MONT
// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
// long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the
// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles
// inputs of this size and zero otherwise.
//
// If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
// If neither is fully-reduced, the output may not be either.
//
// This function allocates |num| words on the stack, so |num| should be at most
// |BN_MONTGOMERY_MAX_WORDS|.
//
// TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
// off upper bits. The aarch64 implementation expects a 64-bit input and does
// not. |size_t| is the safer option but not strictly correct for x86_64. But
// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot.
//
// See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
// inputs.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#if defined(OPENSSL_X86_64)
OPENSSL_INLINE int bn_mulx_adx_capable(void) {
// MULX is in BMI2.
return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
}
int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
OPENSSL_INLINE int bn_mul4x_mont_capable(size_t num) {
return (num >= 8) && ((num & 3) == 0);
}
int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
OPENSSL_INLINE int bn_mulx4x_mont_capable(size_t num) {
return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
}
int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
OPENSSL_INLINE int bn_sqr8x_mont_capable(size_t num) {
return (num >= 8) && ((num & 7) == 0);
}
int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#endif // !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#elif defined(OPENSSL_ARM)
OPENSSL_INLINE int bn_mul8x_mont_neon_capable(size_t num) {
return (num & 7) == 0 && CRYPTO_is_NEON_capable();
}
int bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, size_t num);
#endif // defined(OPENSSL_X86_64)
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
#define OPENSSL_BN_ASM_MONT5
// bn_mul_mont_gather5 loads index |power| of |table|, multiplies it
// by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
// words long and represented in Montgomery form. |n0| is a pointer to the
// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
// 16 bytes. |power| must be less than 32 and is treated as secret.
//
// WARNING: This function implements Almost Montgomery Multiplication from
// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
// However, even if they are fully reduced, the output may not be.
void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
const BN_ULONG *table, const BN_ULONG *np,
const BN_ULONG *n0, int num, int power);
// bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
// |table| are |num| words long. |power| must be less than 32 and is treated as
// public. |table| must be 32*|num| words long. |table| must be aligned to at
// least 16 bytes.
void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table,
size_t power);
// bn_gather5 loads index |power| of |table| and stores it in |out|. |out| and
// each entry of |table| are |num| words long. |power| must be less than 32 and
// is treated as secret. |table| must be aligned to at least 16 bytes.
void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power);
// bn_power5 squares |ap| five times and multiplies it by the value stored at
// index |power| of |table|, modulo |np|. It stores the result in |rp|. The
// values are |num| words long and represented in Montgomery form. |n0| is a
// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
// by 8. |power| must be less than 32 and is treated as secret.
//
// WARNING: This function implements Almost Montgomery Multiplication from
// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
// However, even if they are fully reduced, the output may not be.
void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table,
const BN_ULONG *np, const BN_ULONG *n0, int num, int power);
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
uint64_t bn_mont_n0(const BIGNUM *n);
// bn_mont_ctx_set_RR_consttime initializes |mont->RR|. It returns one on
// success and zero on error. |mont->N| and |mont->n0| must have been
// initialized already. The bit width of |mont->N| is assumed public, but
// |mont->N| is otherwise treated as secret.
int bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx);
#if defined(_MSC_VER)
#if defined(OPENSSL_X86_64)
#define BN_UMULT_LOHI(low, high, a, b) ((low) = _umul128((a), (b), &(high)))
#elif defined(OPENSSL_AARCH64)
#define BN_UMULT_LOHI(low, high, a, b) \
do { \
const BN_ULONG _a = (a); \
const BN_ULONG _b = (b); \
(low) = _a * _b; \
(high) = __umulh(_a, _b); \
} while (0)
#endif
#endif // _MSC_VER
#if !defined(BN_ULLONG) && !defined(BN_UMULT_LOHI)
#error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
#endif
// bn_jacobi returns the Jacobi symbol of |a| and |b| (which is -1, 0 or 1), or
// -2 on error.
int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_is_bit_set_words returns one if bit |bit| is set in |a| and zero
// otherwise.
int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit);
// bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on
// success and zero on error. This function treats the bit width of the modulus
// as public.
int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_less_than_montgomery_R returns one if |bn| is less than the Montgomery R
// value for |mont| and zero otherwise.
int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont);
// bn_mod_u16_consttime returns |bn| mod |d|, ignoring |bn|'s sign bit. It runs
// in time independent of the value of |bn|, but it treats |d| as public.
OPENSSL_EXPORT uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d);
// bn_odd_number_is_obviously_composite returns one if |bn| is divisible by one
// of the first several odd primes and zero otherwise.
int bn_odd_number_is_obviously_composite(const BIGNUM *bn);
// A BN_MILLER_RABIN stores state common to each Miller-Rabin iteration. It is
// initialized within an existing |BN_CTX| scope and may not be used after
// that scope is released with |BN_CTX_end|. Field names match those in FIPS
// 186-4, section C.3.1.
// BN_MILLER_RABIN holds the per-candidate precomputed values shared by every
// Miller-Rabin iteration for a fixed |w| = |mont->N|. It is initialized by
// |bn_miller_rabin_init| within an existing |BN_CTX| scope; the |BIGNUM|
// fields are owned by that scope and become invalid after |BN_CTX_end|.
typedef struct {
  // w1 is w-1.
  BIGNUM *w1;
  // m is (w-1)/2^a, the odd part of w-1.
  BIGNUM *m;
  // one_mont is 1 (mod w) in Montgomery form.
  BIGNUM *one_mont;
  // w1_mont is w-1 (mod w) in Montgomery form.
  BIGNUM *w1_mont;
  // w_bits is BN_num_bits(w).
  int w_bits;
  // a is the largest integer such that 2^a divides w-1.
  int a;
} BN_MILLER_RABIN;
// bn_miller_rabin_init initializes |miller_rabin| for testing if |mont->N| is
// prime. It returns one on success and zero on error.
OPENSSL_EXPORT int bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin,
const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_miller_rabin_iteration performs one Miller-Rabin iteration, checking if
// |b| is a composite witness for |mont->N|. |miller_rabin| must have been
// initialized with |bn_miller_rabin_setup|. On success, it returns one and sets
// |*out_is_possibly_prime| to one if |mont->N| may still be prime or zero if
// |b| shows it is composite. On allocation or internal failure, it returns
// zero.
OPENSSL_EXPORT int bn_miller_rabin_iteration(
const BN_MILLER_RABIN *miller_rabin, int *out_is_possibly_prime,
const BIGNUM *b, const BN_MONT_CTX *mont, BN_CTX *ctx);
// bn_rshift1_words sets |r| to |a| >> 1, where both arrays are |num| words wide.
void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num);
// bn_rshift_words sets |r| to |a| >> |shift|, where both arrays are |num| words
// wide.
void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift,
size_t num);
// bn_rshift_secret_shift behaves like |BN_rshift| but runs in time independent
// of both |a| and |n|.
OPENSSL_EXPORT int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a,
unsigned n, BN_CTX *ctx);
// bn_reduce_once sets |r| to |a| mod |m| where 0 <= |a| < 2*|m|. It returns
// zero if |a| < |m| and a mask of all ones if |a| >= |m|. Each array is |num|
// words long, but |a| has an additional word specified by |carry|. |carry| must
// be zero or one, as implied by the bounds on |a|.
//
// |r|, |a|, and |m| may not alias. Use |bn_reduce_once_in_place| if |r| and |a|
// must alias.
BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry,
const BN_ULONG *m, size_t num);
// bn_reduce_once_in_place behaves like |bn_reduce_once| but acts in-place on
// |r|, using |tmp| as scratch space. |r|, |tmp|, and |m| may not alias.
BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m,
BN_ULONG *tmp, size_t num);
// Constant-time non-modular arithmetic.
//
// The following functions implement non-modular arithmetic in constant-time
// and pessimally set |r->width| to the largest possible word size.
//
// Note this means that, e.g., repeatedly multiplying by one will cause widths
// to increase without bound. The corresponding public API functions minimize
// their outputs to avoid regressing calculator consumers.
// bn_uadd_consttime behaves like |BN_uadd|, but it pessimally sets
// |r->width| = |a->width| + |b->width| + 1.
int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_usub_consttime behaves like |BN_usub|, but it pessimally sets
// |r->width| = |a->width|.
int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
// bn_abs_sub_consttime sets |r| to the absolute value of |a| - |b|, treating
// both inputs as secret. It returns one on success and zero on error.
OPENSSL_EXPORT int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a,
const BIGNUM *b, BN_CTX *ctx);
// bn_mul_consttime behaves like |BN_mul|, but it rejects negative inputs and
// pessimally sets |r->width| to |a->width| + |b->width|, to avoid leaking
// information about |a| and |b|.
int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
// bn_sqr_consttime behaves like |BN_sqr|, but it pessimally sets |r->width|
// to 2*|a->width|, to avoid leaking information about |a|.
int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
// bn_div_consttime behaves like |BN_div|, but it rejects negative inputs and
// treats both inputs, including their magnitudes, as secret. It is, as a
// result, much slower than |BN_div| and should only be used for rare operations
// where Montgomery reduction is not available. |divisor_min_bits| is a
// public lower bound for |BN_num_bits(divisor)|. When |divisor|'s bit width is
// public, this can speed up the operation.
//
// Note that |quotient->width| will be set pessimally to |numerator->width|.
OPENSSL_EXPORT int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder,
const BIGNUM *numerator,
const BIGNUM *divisor,
unsigned divisor_min_bits, BN_CTX *ctx);
// bn_is_relatively_prime checks whether GCD(|x|, |y|) is one. On success, it
// returns one and sets |*out_relatively_prime| to one if the GCD was one and
// zero otherwise. On error, it returns zero.
OPENSSL_EXPORT int bn_is_relatively_prime(int *out_relatively_prime,
const BIGNUM *x, const BIGNUM *y,
BN_CTX *ctx);
// bn_lcm_consttime sets |r| to LCM(|a|, |b|). It returns one on success and
// zero on error. |a| and |b| are both treated as secret.
OPENSSL_EXPORT int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_CTX *ctx);
// bn_mont_ctx_init zero-initializes |mont|.
void bn_mont_ctx_init(BN_MONT_CTX *mont);
// bn_mont_ctx_cleanup releases memory associated with |mont|, without freeing
// |mont| itself.
void bn_mont_ctx_cleanup(BN_MONT_CTX *mont);
// Constant-time modular arithmetic.
//
// The following functions implement basic constant-time modular arithmetic.
// bn_mod_add_words sets |r| to |a| + |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_add_consttime acts like |BN_mod_add_quick| but takes a |BN_CTX|.
int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m, BN_CTX *ctx);
// bn_mod_sub_words sets |r| to |a| - |b| (mod |m|), using |tmp| as scratch
// space. Each array is |num| words long. |a| and |b| must be < |m|. Any pair of
// |r|, |a|, and |b| may alias.
void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
const BN_ULONG *m, BN_ULONG *tmp, size_t num);
// bn_mod_sub_consttime acts like |BN_mod_sub_quick| but takes a |BN_CTX|.
int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m, BN_CTX *ctx);
// bn_mod_lshift1_consttime acts like |BN_mod_lshift1_quick| but takes a
// |BN_CTX|.
int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m,
BN_CTX *ctx);
// bn_mod_lshift_consttime acts like |BN_mod_lshift_quick| but takes a |BN_CTX|.
int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
BN_CTX *ctx);
// bn_mod_inverse_consttime sets |r| to |a|^-1, mod |n|. |a| must be non-
// negative and less than |n|. It returns one on success and zero on error. On
// failure, if the failure was caused by |a| having no inverse mod |n| then
// |*out_no_inverse| will be set to one; otherwise it will be set to zero.
//
// This function treats both |a| and |n| as secret, provided they are both non-
// zero and the inverse exists. It should only be used for even moduli where
// none of the less general implementations are applicable.
OPENSSL_EXPORT int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx);
// bn_mod_inverse_prime sets |out| to the modular inverse of |a| modulo |p|,
// computed with Fermat's Little Theorem. It returns one on success and zero on
// error. If |mont_p| is NULL, one will be computed temporarily.
int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
// bn_mod_inverse_secret_prime behaves like |bn_mod_inverse_prime| but uses
// |BN_mod_exp_mont_consttime| instead of |BN_mod_exp_mont| in hopes of
// protecting the exponent.
int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p,
BN_CTX *ctx, const BN_MONT_CTX *mont_p);
// BN_MONT_CTX_set_locked takes |lock| and checks whether |*pmont| is NULL. If
// so, it creates a new |BN_MONT_CTX| and sets the modulus for it to |mod|. It
// then stores it as |*pmont|. It returns one on success and zero on error. Note
// this function assumes |mod| is public.
//
// If |*pmont| is already non-NULL then it does nothing and returns one.
int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
const BIGNUM *mod, BN_CTX *bn_ctx);
// Low-level operations for small numbers.
//
// The following functions implement algorithms suitable for use with scalars
// and field elements in elliptic curves. They rely on the number being small
// both to stack-allocate various temporaries and because they do not implement
// optimizations useful for the larger values used in RSA.
// BN_SMALL_MAX_WORDS is the largest size input these functions handle. This
// limit allows temporaries to be more easily stack-allocated. This limit is set
// to accommodate P-521.
#if defined(OPENSSL_32_BIT)
#define BN_SMALL_MAX_WORDS 17
#else
#define BN_SMALL_MAX_WORDS 9
#endif
// bn_mul_small sets |r| to |a|*|b|. |num_r| must be |num_a| + |num_b|. |r| may
// not alias with |a| or |b|.
void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
const BN_ULONG *b, size_t num_b);
// bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|.
// |num_r| must be |num_a|*2. |r| and |a| may not alias.
void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a);
// In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS|
// words long.
// bn_to_montgomery_small sets |r| to |a| translated to the Montgomery domain.
// |r| and |a| are |num| words long, which must be |mont->N.width|. |a| must be
// fully reduced and may alias |r|.
void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_MONT_CTX *mont);
// bn_from_montgomery_small sets |r| to |a| translated out of the Montgomery
// domain. |r| and |a| are |num_r| and |num_a| words long, respectively. |num_r|
// must be |mont->N.width|. |a| must be at most |mont->N|^2 and may alias |r|.
//
// Unlike most of these functions, only |num_r| is bounded by
// |BN_SMALL_MAX_WORDS|. |num_a| may exceed it, but must be at most 2 * |num_r|.
void bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
size_t num_a, const BN_MONT_CTX *mont);
// bn_mod_mul_montgomery_small sets |r| to |a| * |b| mod |mont->N|. Both inputs
// and outputs are in the Montgomery domain. Each array is |num| words long,
// which must be |mont->N.width|. Any two of |r|, |a|, and |b| may alias. |a|
// and |b| must be reduced on input.
void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, size_t num,
const BN_MONT_CTX *mont);
// bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. It returns one on
// success and zero on programmer or internal error. Both inputs and outputs are
// in the Montgomery domain. |r| and |a| are |num| words long, which must be
// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |num_p|, measured in bits,
// must fit in |size_t|. |a| must be fully-reduced. This function runs in time
// independent of |a|, but |p| and |mont->N| are public values. |a| must be
// fully-reduced and may alias with |r|.
//
// Note this function differs from |BN_mod_exp_mont| which uses Montgomery
// reduction but takes input and output outside the Montgomery domain. Combine
// this function with |bn_from_montgomery_small| and |bn_to_montgomery_small|
// if necessary.
void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
const BN_ULONG *p, size_t num_p,
const BN_MONT_CTX *mont);
// bn_mod_inverse0_prime_mont_small sets |r| to |a|^-1 mod |mont->N|. If |a| is
// zero, |r| is set to zero. |mont->N| must be a prime. |r| and |a| are |num|
// words long, which must be |mont->N.width| and at most |BN_SMALL_MAX_WORDS|.
// |a| must be fully-reduced and may alias |r|. This function runs in time
// independent of |a|, but |mont->N| is a public value.
void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a,
size_t num, const BN_MONT_CTX *mont);
// Word-based byte conversion functions.
// bn_big_endian_to_words interprets |in_len| bytes from |in| as a big-endian,
// unsigned integer and writes the result to |out_len| words in |out|. The output
// is in little-endian word order with |out[0]| being the least-significant word.
// |out_len| must be large enough to represent any |in_len|-byte value. That is,
// |in_len| must be at most |BN_BYTES * out_len|.
void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
size_t in_len);
// bn_words_to_big_endian represents |in_len| words from |in| (in little-endian
// word order) as a big-endian, unsigned integer in |out_len| bytes. It writes
// the result to |out|. |out_len| must be large enough to represent |in| without
// truncation.
//
// Note |out_len| may be less than |BN_BYTES * in_len| if |in| is known to have
// leading zeros.
void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in,
size_t in_len);
// bn_little_endian_to_words interprets |in_len| bytes from |in| as a little-endian,
// unsigned integer and writes the result to |out_len| words in |out|. The output
// is in little-endian word order with |out[0]| being the least-significant word.
// |out_len| must be large enough to represent any |in_len|-byte value. That is,
// |in_len| must be at most |BN_BYTES * out_len|.
void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, const size_t in_len);
// bn_words_to_little_endian represents |in_len| words from |in| (in little-endian
// word order) as a little-endian, unsigned integer in |out_len| bytes. It
// writes the result to |out|. |out_len| must be large enough to represent |in|
// without truncation.
//
// Note |out_len| may be less than |BN_BYTES * in_len| if |in| is known to have
// leading zeros.
void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len);
#if defined(__cplusplus)
} // extern C
#endif
#endif // OPENSSL_HEADER_BN_INTERNAL_H

View File

@@ -0,0 +1,97 @@
// Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
// BN_lsw returns the least significant word of |n|, or zero for a zero-width
// |n|.
#define BN_lsw(n) (((n)->width == 0) ? (BN_ULONG) 0 : (n)->d[0])
// bn_jacobi computes the Jacobi symbol (a/b). |b| must be odd and positive;
// otherwise an error is pushed and -2 is returned. On success the result is
// -1, 0, or 1. -2 is also returned on allocation or internal failure.
int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  // In 'tab', only odd-indexed entries are relevant:
  // For any odd BIGNUM n,
  //     tab[BN_lsw(n) & 7]
  // is $(-1)^{(n^2-1)/8}$ (using TeX notation).
  // Note that the sign of n does not matter.
  static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
  // The Jacobi symbol is only defined for odd modulus.
  if (!BN_is_odd(b)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return -2;
  }
  // Require b be positive.
  if (BN_is_negative(b)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return -2;
  }
  int ret = -2;
  BN_CTX_start(ctx);
  BIGNUM *A = BN_CTX_get(ctx);
  BIGNUM *B = BN_CTX_get(ctx);
  // |BN_CTX_get| fails for all subsequent calls once one call fails, so
  // checking the last result covers |A| as well.
  if (B == NULL) {
    goto end;
  }
  // Work on copies so |a| and |b| are left untouched.
  if (!BN_copy(A, a) ||
      !BN_copy(B, b)) {
    goto end;
  }
  // Adapted from logic to compute the Kronecker symbol, originally implemented
  // according to Henri Cohen, "A Course in Computational Algebraic Number
  // Theory" (algorithm 1.4.10).
  ret = 1;
  while (1) {
    // Cohen's step 3:
    // B is positive and odd
    if (BN_is_zero(A)) {
      // (0/B) is 1 when B == 1 and 0 otherwise.
      ret = BN_is_one(B) ? ret : 0;
      goto end;
    }
    // now A is non-zero
    // Strip the factors of two from A; |i| counts them.
    int i = 0;
    while (!BN_is_bit_set(A, i)) {
      i++;
    }
    if (!BN_rshift(A, A, i)) {
      ret = -2;
      goto end;
    }
    if (i & 1) {
      // i is odd
      // multiply 'ret' by $(-1)^{(B^2-1)/8}$
      ret = ret * tab[BN_lsw(B) & 7];
    }
    // Cohen's step 4 (quadratic reciprocity):
    // multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$
    // For negative A, ~BN_lsw(A) adjusts the low bits so bit 1 reflects
    // A mod 4 correctly.
    if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) {
      ret = -ret;
    }
    // (A, B) := (B mod |A|, |A|)
    if (!BN_nnmod(B, B, A, ctx)) {
      ret = -2;
      goto end;
    }
    // Swap the roles of A and B and force the new B positive.
    BIGNUM *tmp = A;
    A = B;
    B = tmp;
    tmp->neg = 0;
  }
end:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,550 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/thread.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../cpucap/internal.h"
#include "../../internal.h"
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE) || \
defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD) || \
defined(OPENSSL_NETBSD) ) && \
defined(OPENSSL_AARCH64) && defined(OPENSSL_BN_ASM_MONT)
#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h"
#define BN_MONTGOMERY_S2N_BIGNUM_CAPABLE 1
// montgomery_use_s2n_bignum returns one if the s2n-bignum Montgomery routines
// should be used for a modulus of |num| words, and zero otherwise.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  // Use s2n-bignum's functions only if
  // (1) The ARM architecture has slow multipliers, and
  // (2) num (which is the number of words) is a multiple of 8, because
  //     s2n-bignum's bignum_emontredc_8n requires it, and
  // (3) The word size is 64 bits.
  // (4) CPU has NEON.
  // The temporary-buffer sizing in montgomery_s2n_bignum_mul_mont relies on
  // KMUL_32_64 needing the largest scratch area; assert that here.
  assert(S2NBIGNUM_KSQR_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KSQR_32_64_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS &&
         S2NBIGNUM_KMUL_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS);
  assert(BN_BITS2 == 64);
  return !CRYPTO_is_ARMv8_wide_multiplier_capable() &&
         (num % 8 == 0) &&
         CRYPTO_is_NEON_capable();
}
#else
// Fallback when s2n-bignum is unavailable on this platform: never use it.
OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) {
  return 0;
}
#endif
// bn_mont_ctx_init zero-initializes |mont| and puts its embedded BIGNUMs into
// a valid empty state.
void bn_mont_ctx_init(BN_MONT_CTX *mont) {
  OPENSSL_memset(mont, 0, sizeof(*mont));
  BN_init(&mont->RR);
  BN_init(&mont->N);
}
// bn_mont_ctx_cleanup releases the BIGNUMs embedded in |mont|. |mont| itself
// is owned by the caller and is not freed.
void bn_mont_ctx_cleanup(BN_MONT_CTX *mont) {
  BN_free(&mont->N);
  BN_free(&mont->RR);
}
// BN_MONT_CTX_new allocates and initializes an empty Montgomery context. It
// returns NULL on allocation failure; the caller frees the result with
// |BN_MONT_CTX_free|.
BN_MONT_CTX *BN_MONT_CTX_new(void) {
  BN_MONT_CTX *mont = OPENSSL_malloc(sizeof(BN_MONT_CTX));
  if (mont != NULL) {
    bn_mont_ctx_init(mont);
  }
  return mont;
}
// BN_MONT_CTX_free releases |mont|'s contents and |mont| itself. A NULL
// argument is a no-op.
void BN_MONT_CTX_free(BN_MONT_CTX *mont) {
  if (mont != NULL) {
    bn_mont_ctx_cleanup(mont);
    OPENSSL_free(mont);
  }
}
// BN_MONT_CTX_copy copies |from| into |to| and returns |to|, or NULL on
// allocation failure. Copying a context onto itself is a no-op.
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, const BN_MONT_CTX *from) {
  if (to == from) {
    return to;
  }
  if (BN_copy(&to->RR, &from->RR) == NULL ||
      BN_copy(&to->N, &from->N) == NULL) {
    return NULL;
  }
  // n0 is a plain two-word array; copy it directly.
  to->n0[0] = from->n0[0];
  to->n0[1] = from->n0[1];
  return to;
}
// bn_mont_ctx_set_N_and_n0 validates |mod| (odd, positive, non-zero, within
// the Montgomery size limit), stores it in |mont->N|, and computes |mont->n0|
// such that n0 * N == -1 (mod 2^64). It does not compute |mont->RR|. It
// returns one on success and zero on error.
static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) {
  if (BN_is_zero(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO);
    return 0;
  }
  // Montgomery reduction requires an odd modulus (R = 2^k must be invertible
  // mod N).
  if (!BN_is_odd(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS);
    return 0;
  }
  if (BN_is_negative(mod)) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (!bn_fits_in_words(mod, BN_MONTGOMERY_MAX_WORDS)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  // Save the modulus.
  if (!BN_copy(&mont->N, mod)) {
    OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
    return 0;
  }
  // |mont->N| is always stored minimally. Computing RR efficiently leaks the
  // size of the modulus. While the modulus may be private in RSA (one of the
  // primes), their sizes are public, so this is fine.
  bn_set_minimal_width(&mont->N);
  // Find n0 such that n0 * N == -1 (mod r).
  //
  // Only certain BN_BITS2<=32 platforms actually make use of n0[1]. For the
  // others, we could use a shorter R value and use faster |BN_ULONG|-based
  // math instead of |uint64_t|-based math, which would be double-precision.
  // However, currently only the assembler files know which is which.
  OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
                        BN_MONT_CTX_N0_LIMBS_value_is_invalid)
  OPENSSL_STATIC_ASSERT(
      sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t),
      uint64_t_is_insufficient_precision_for_n0);
  uint64_t n0 = bn_mont_n0(&mont->N);
  // Split the 64-bit n0 across the limb array; on 64-bit platforms the second
  // limb is unused and cleared.
  mont->n0[0] = (BN_ULONG)n0;
#if BN_MONT_CTX_N0_LIMBS == 2
  mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2);
#else
  mont->n0[1] = 0;
#endif
  return 1;
}
// BN_MONT_CTX_set configures |mont| for the modulus |mod|: it stores |mod| and
// n0, then computes RR = R^2 (mod N). The bit width of |mod| is treated as
// public. It returns one on success and zero on error.
int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
  if (!bn_mont_ctx_set_N_and_n0(mont, mod)) {
    return 0;
  }
  // Allocate a temporary |BN_CTX| if the caller did not supply one.
  BN_CTX *new_ctx = NULL;
  if (ctx == NULL) {
    new_ctx = BN_CTX_new();
    if (new_ctx == NULL) {
      return 0;
    }
    ctx = new_ctx;
  }
  // Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS2 such that R
  // > mod. Even though the assembly on some 32-bit platforms works with 64-bit
  // values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS *
  // BN_BITS2|, is correct because R**2 will still be a multiple of the latter
  // as |BN_MONT_CTX_N0_LIMBS| is either one or two.
  unsigned lgBigR = mont->N.width * BN_BITS2;
  BN_zero(&mont->RR);
  // RR = 2^(2 * lgBigR) mod N, then pad to exactly |N.width| words.
  int ok = BN_set_bit(&mont->RR, lgBigR * 2) &&
           BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) &&
           bn_resize_words(&mont->RR, mont->N.width);
  BN_CTX_free(new_ctx);
  return ok;
}
// BN_MONT_CTX_new_for_modulus allocates a Montgomery context configured for
// |mod|, or returns NULL on error. |mod|'s bit width is treated as public.
BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) {
  BN_MONT_CTX *mont = BN_MONT_CTX_new();
  if (mont != NULL && BN_MONT_CTX_set(mont, mod, ctx)) {
    return mont;
  }
  // |BN_MONT_CTX_free| tolerates NULL, so this covers both failure paths.
  BN_MONT_CTX_free(mont);
  return NULL;
}
// BN_MONT_CTX_new_consttime behaves like |BN_MONT_CTX_new_for_modulus| but
// computes RR with the constant-time path, treating |mod| (beyond its bit
// width) as secret. Returns NULL on error.
BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) {
  BN_MONT_CTX *mont = BN_MONT_CTX_new();
  if (mont != NULL &&
      bn_mont_ctx_set_N_and_n0(mont, mod) &&
      bn_mont_ctx_set_RR_consttime(mont, ctx)) {
    return mont;
  }
  BN_MONT_CTX_free(mont);
  return NULL;
}
// BN_MONT_CTX_set_locked lazily initializes |*pmont| for |mod| under |lock|
// using double-checked locking: the fast path takes only the read lock; the
// slow path re-checks under the write lock so exactly one thread constructs
// the context. |mod| is assumed public. Returns one on success, zero on error.
int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock,
                           const BIGNUM *mod, BN_CTX *bn_ctx) {
  // Fast path: the context already exists.
  CRYPTO_MUTEX_lock_read(lock);
  BN_MONT_CTX *ctx = *pmont;
  CRYPTO_MUTEX_unlock_read(lock);
  if (ctx) {
    return 1;
  }
  // Slow path: re-check under the write lock in case another thread won the
  // race between the unlock above and here.
  CRYPTO_MUTEX_lock_write(lock);
  if (*pmont == NULL) {
    *pmont = BN_MONT_CTX_new_for_modulus(mod, bn_ctx);
  }
  const int ok = *pmont != NULL;
  CRYPTO_MUTEX_unlock_write(lock);
  return ok;
}
// BN_to_montgomery converts |a| into the Montgomery domain: multiplying by
// RR = R^2 with Montgomery multiplication leaves a * R (mod N) in |ret|.
// Returns one on success and zero on error.
int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
                     BN_CTX *ctx) {
  return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx);
}
// bn_from_montgomery_in_place sets |r| to |a| * R^-1 (mod |mont->N|), where
// R = 2^(N.width * BN_BITS2). |a| is used as scratch and clobbered. |num_r|
// must equal |mont->N.width| and |num_a| must be twice that. Returns one on
// success and zero if the sizes are wrong.
static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a,
                                       size_t num_a, const BN_MONT_CTX *mont) {
  const BN_ULONG *n = mont->N.d;
  size_t num_n = mont->N.width;
  if (num_r != num_n || num_a != 2 * num_n) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
  // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
  // includes |carry| which is stored separately.
  BN_ULONG n0 = mont->n0[0];
  BN_ULONG carry = 0;
  for (size_t i = 0; i < num_n; i++) {
    // a[i] * n0 is the multiplier that makes word |i| of the running sum
    // zero; |v| is the product's carry word out of the low half.
    BN_ULONG v = bn_mul_add_words(a + i, n, num_n, a[i] * n0);
    v += carry + a[i + num_n];
    // Constant-time carry-out of the addition above: carry becomes one if the
    // sum wrapped (v < a[i + num_n]), zero if it clearly did not
    // (v > a[i + num_n]), and keeps its previous value when they are equal.
    carry |= (v != a[i + num_n]);
    carry &= (v <= a[i + num_n]);
    a[i + num_n] = v;
  }
  // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a|
  // includes |carry| which is stored separately.
  a += num_n;
  // |a| thus requires at most one additional subtraction |n| to be reduced.
  bn_reduce_once(r, a, carry, n, num_n);
  return 1;
}
// BN_from_montgomery_word sets |ret| to |r| * R^-1 (mod |mont->N|), taking
// |r| out of the Montgomery domain. |r| must be non-negative and is clobbered
// (it is resized and consumed as scratch). Returns one on success and zero on
// error.
static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
                                   const BN_MONT_CTX *mont) {
  if (r->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  const BIGNUM *n = &mont->N;
  // A zero-width modulus admits only the zero value.
  if (n->width == 0) {
    ret->width = 0;
    return 1;
  }
  // The in-place reduction needs |r| padded to twice the modulus width.
  int max = 2 * n->width;  // carry is stored separately
  if (!bn_resize_words(r, max) ||
      !bn_wexpand(ret, n->width)) {
    return 0;
  }
  ret->width = n->width;
  ret->neg = 0;
  return bn_from_montgomery_in_place(ret->d, ret->width, r->d, r->width, mont);
}
// BN_from_montgomery sets |r| to |a| * R^-1 (mod |mont->N|), converting |a|
// out of the Montgomery domain. |a| itself is left untouched. Returns one on
// success and zero on error.
int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont,
                       BN_CTX *ctx) {
  BN_CTX_start(ctx);
  // Reduce a scratch copy, because the word-level reduction clobbers its
  // input.
  BIGNUM *copy = BN_CTX_get(ctx);
  int ok = copy != NULL &&
           BN_copy(copy, a) != NULL &&
           BN_from_montgomery_word(r, copy, mont);
  BN_CTX_end(ctx);
  return ok;
}
// bn_one_to_montgomery sets |r| to one in Montgomery form, i.e. R (mod N).
// Returns one on success and zero on error. The bit width of the modulus is
// treated as public.
int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx) {
  // If the high bit of |n| is set, R = 2^(width*BN_BITS2) < 2 * |n|, so we
  // compute R - |n| rather than perform Montgomery reduction.
  const BIGNUM *n = &mont->N;
  if (n->width > 0 && (n->d[n->width - 1] >> (BN_BITS2 - 1)) != 0) {
    if (!bn_wexpand(r, n->width)) {
      return 0;
    }
    // R - |n| is the two's complement of |n| over width*BN_BITS2 bits:
    // negate the low word and complement the rest. (The borrow out of word 0
    // is unconditional because a Montgomery modulus is odd, so d[0] != 0.)
    r->d[0] = 0 - n->d[0];
    for (int i = 1; i < n->width; i++) {
      r->d[i] = ~n->d[i];
    }
    r->width = n->width;
    r->neg = 0;
    return 1;
  }
  // Otherwise, RR * R^-1 = R (mod N) via one Montgomery reduction.
  return BN_from_montgomery(r, &mont->RR, mont, ctx);
}
// bn_mod_mul_montgomery_fallback computes |r| = |a| * |b| * R^-1 mod |N| with
// a full-width multiply (or square) followed by Montgomery reduction. It is
// the generic path when no assembly Montgomery multiplication applies.
static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a,
                                          const BIGNUM *b,
                                          const BN_MONT_CTX *mont,
                                          BN_CTX *ctx) {
  int ok = 0;
  BN_CTX_start(ctx);
  BIGNUM *product = BN_CTX_get(ctx);
  if (product == NULL) {
    goto done;
  }

  // Compute the double-width product, using the dedicated squaring path when
  // both inputs alias.
  int mul_ok = (a == b) ? bn_sqr_consttime(product, a, ctx)
                        : bn_mul_consttime(product, a, b, ctx);
  if (!mul_ok) {
    goto done;
  }

  // Reduce from aRR down to aR.
  ok = BN_from_montgomery_word(r, product, mont);

done:
  BN_CTX_end(ctx);
  return ok;
}
#if defined(OPENSSL_BN_ASM_MONT)
// Perform montgomery multiplication using s2n-bignum functions. The arguments
// are equivalent to the arguments of bn_mul_mont.
// montgomery_s2n_bignum_mul_mont works only if num is a multiple of 8.
// montgomery_use_s2n_bignum(num) must be called in advance to check this
// condition, as well as other s2n-bignum requirements.
// For num = 32 or num = 16, this uses faster primitives in s2n-bignum.
// montgomery_s2n_bignum_mul_mont allocates S2NBIGNUM_KMUL_32_64_TEMP_NWORDS +
// 2 * BN_MONTGOMERY_MAX_WORDS uint64_t words at the stack.
static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
                                           const BN_ULONG *bp,
                                           const BN_ULONG *np,
                                           const BN_ULONG *n0, size_t num) {
#if defined(BN_MONTGOMERY_S2N_BIGNUM_CAPABLE)
  // t is a temporary buffer used by Karatsuba multiplication.
  // bignum_kmul_32_64 requires S2NBIGNUM_KMUL_32_64_TEMP_NWORDS words.
  uint64_t t[S2NBIGNUM_KMUL_32_64_TEMP_NWORDS];
  // mulres is the output buffer of big-int multiplication which uses
  // 2 * num elements of mulres. Note that num <= BN_MONTGOMERY_MAX_WORDS
  // is guaranteed by the caller (BN_mod_mul_montgomery).
  uint64_t mulres[2 * BN_MONTGOMERY_MAX_WORDS];
  // Given m the prime number stored at np, m * w = -1 mod 2^64.
  uint64_t w = n0[0];

  // Compute the full 2*num-word product of |ap| and |bp| into |mulres|,
  // selecting the specialized Karatsuba kernels for num = 32 and num = 16 and
  // the generic routines otherwise. Squaring (ap == bp) has dedicated entry
  // points.
  if (num == 32) {
    if (ap == bp) {
      bignum_ksqr_32_64(mulres, ap, t);
    } else {
      bignum_kmul_32_64(mulres, ap, bp, t);
    }
  } else if (num == 16) {
    if (ap == bp) {
      bignum_ksqr_16_32(mulres, ap, t);
    } else {
      bignum_kmul_16_32(mulres, ap, bp, t);
    }
  } else {
    if (ap == bp) {
      bignum_sqr(num * 2, mulres, num, ap);
    } else {
      bignum_mul(num * 2, mulres, num, ap, num, bp);
    }
  }

  // Do montgomery reduction. We follow the definition of montgomery reduction
  // which is:
  // 1. Calculate (mulres + ((mulres mod R) * (-m^-1 mod R) mod R) * m) / R
  //    using bignum_emontredc_8n, where R is 2^(64*num).
  //    The calculated result is stored in [mulres+num ... mulres+2*num-1]. If
  //    the result >= 2^(64*num), bignum_emontredc_8n returns 1.
  // 2. Optionally subtract the result if the (result of step 1) >= m.
  //    The comparison is true if either A or B holds:
  //    A. The result of step 1 >= 2^(64*num), meaning that bignum_emontredc_8n
  //       returned 1. Since m is less than 2^(64*num), (result of step 1) >= m
  //       holds.
  //    B. The result of step 1 fits in 2^(64*num), and the result >= m.
  uint64_t c = bignum_emontredc_8n(num, mulres, np, w);  // c: case A
  c |= bignum_ge(num, mulres + num, num, np);            // c: case B
  // Optionally subtract and store the result at rp
  bignum_optsub(num, rp, mulres + num, c, np);
#else
  // Should not call this function unless s2n-bignum is supported.
  abort();
#endif
}
#endif
// BN_mod_mul_montgomery computes |r| = |a| * |b| * R^-1 mod |mont->N| for
// non-negative inputs (typically already in Montgomery form). It dispatches
// to an assembly implementation when available and the operand widths match
// the modulus, and otherwise falls back to multiply-then-reduce. Returns one
// on success and zero on error.
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
  if (a->neg || b->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }

#if defined(OPENSSL_BN_ASM_MONT)
  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
  int num = mont->N.width;
  if (num >= (128 / BN_BITS2) &&
      a->width == num &&
      b->width == num) {
    if (!bn_wexpand(r, num)) {
      return 0;
    }
    // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
    // allocates |num| words on the stack, so |num| cannot be too large.
    assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
    if (montgomery_use_s2n_bignum(num)) {
      // Do montgomery multiplication using s2n-bignum.
      montgomery_s2n_bignum_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0,
                                     num);
    } else {
      if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
        // The check above ensures this won't happen.
        assert(0);
        OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
        return 0;
      }
    }
    r->neg = 0;
    r->width = num;
    return 1;
  }
#endif

  // No assembly path fits (small modulus, or inputs not at the modulus
  // width); use the generic fallback.
  return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx);
}
// bn_less_than_montgomery_R returns one if |bn| is non-negative and fits in
// |mont->N.width| words, i.e. is strictly below R = 2^(width * BN_BITS2), and
// zero otherwise.
int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) {
  if (BN_is_negative(bn)) {
    return 0;
  }
  return bn_fits_in_words(bn, mont->N.width);
}
// bn_to_montgomery_small converts |a| into Montgomery form by multiplying by
// RR = R^2 mod N, yielding |r| = |a| * R mod N. |r| and |a| are |num| words;
// |num| must equal |mont->N.width| (enforced, via abort, by
// |bn_mod_mul_montgomery_small|).
void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num,
                            const BN_MONT_CTX *mont) {
  bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont);
}
// bn_from_montgomery_small sets |r| = |a| * R^-1 mod N, Montgomery-reducing
// the |num_a|-word input (at most twice the modulus width) into the
// |num_r|-word output, where |num_r| must equal |mont->N.width| and be at
// most |BN_SMALL_MAX_WORDS|. Size mismatches indicate caller bugs and abort.
void bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a,
                              size_t num_a, const BN_MONT_CTX *mont) {
  const size_t width = (size_t)mont->N.width;
  if (num_r != width || num_r > BN_SMALL_MAX_WORDS || num_a > 2 * num_r) {
    abort();
  }

  // Zero-extend |a| into a double-width scratch buffer; the in-place
  // reduction consumes exactly 2 * |num_r| words and clobbers them.
  BN_ULONG scratch[BN_SMALL_MAX_WORDS * 2] = {0};
  OPENSSL_memcpy(scratch, a, num_a * sizeof(BN_ULONG));
  if (!bn_from_montgomery_in_place(r, num_r, scratch, 2 * num_r, mont)) {
    abort();
  }

  // Scrub the scratch buffer; it may hold secret intermediates.
  OPENSSL_cleanse(scratch, 2 * num_r * sizeof(BN_ULONG));
}
// bn_mod_mul_montgomery_small sets |r| = |a| * |b| * R^-1 mod N for
// fixed-width, stack-allocated operands. |r|, |a|, and |b| are all |num|
// words, where |num| must equal |mont->N.width| and be at most
// |BN_SMALL_MAX_WORDS|; any mismatch aborts.
void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a,
                                 const BN_ULONG *b, size_t num,
                                 const BN_MONT_CTX *mont) {
  if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) {
    abort();
  }

#if defined(OPENSSL_BN_ASM_MONT)
  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
  if (num >= (128 / BN_BITS2)) {
    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
      abort();  // The check above ensures this won't happen.
    }
    return;
  }
#endif

  // Compute the product.
  BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS];
  if (a == b) {
    bn_sqr_small(tmp, 2 * num, a, num);
  } else {
    bn_mul_small(tmp, 2 * num, a, num, b, num);
  }

  // Reduce.
  if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) {
    abort();
  }
  // |tmp| may hold secret intermediates; scrub it before returning.
  OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG));
}
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
// bn_mul_mont dispatches x86-64 Montgomery multiplication to the fastest
// available implementation whose requirements |num| (and the CPU) satisfy,
// falling back to the generic scalar routine. The newer kernels are gated on
// assembler support for the required instructions.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, size_t num)
{
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  // Use the dedicated squaring kernel when both operands alias.
  if (ap == bp && bn_sqr8x_mont_capable(num)) {
    return bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
  }
  if (bn_mulx4x_mont_capable(num)) {
    return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
  }
#endif  // !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
  if (bn_mul4x_mont_capable(num)) {
    return bn_mul4x_mont(rp, ap, bp, np, n0, num);
  }
  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
}
#endif
#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM)
// bn_mul_mont dispatches Montgomery multiplication on 32-bit ARM: the NEON
// kernel when it supports |num|, and the generic scalar path otherwise.
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
  if (!bn_mul8x_mont_neon_capable(num)) {
    return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
  }
  return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
}
#endif

View File

@@ -0,0 +1,212 @@
// Copyright 2016 Brian Smith.
// SPDX-License-Identifier: ISC
#include <openssl/bn.h>
#include <assert.h>
#include "internal.h"
#include "../../internal.h"
// Forward declaration; the implementation is below.
static uint64_t bn_neg_inv_mod_r_u64(uint64_t n);

// |n0| is stored in |BN_MONT_CTX_N0_LIMBS| limbs but computed here as a
// |uint64_t|, so the two representations must cover exactly the same number
// of bits.
OPENSSL_STATIC_ASSERT(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2,
                      BN_MONT_CTX_N0_LIMBS_value_is_invalid)
OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS ==
                          sizeof(uint64_t),
                      uint64_t_is_insufficient_precision_for_n0)

// LG_LITTLE_R is log_2(r).
#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2)
// bn_mont_n0 returns the Montgomery constant n0 = -1/n mod r for the odd,
// positive modulus |n|, where r = 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2).
uint64_t bn_mont_n0(const BIGNUM *n) {
  // These conditions are checked by the caller, |BN_MONT_CTX_set| or
  // |BN_MONT_CTX_new_consttime|.
  assert(!BN_is_zero(n));
  assert(!BN_is_negative(n));
  assert(BN_is_odd(n));

  // r == 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2) and LG_LITTLE_R == lg(r). This
  // ensures that we can do integer division by |r| by simply ignoring
  // |BN_MONT_CTX_N0_LIMBS| limbs. Similarly, we can calculate values modulo
  // |r| by just looking at the lowest |BN_MONT_CTX_N0_LIMBS| limbs. This is
  // what makes Montgomery multiplication efficient.
  //
  // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography
  // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a
  // multi-limb Montgomery multiplication of |a * b (mod n)|, given the
  // unreduced product |t == a * b|, we repeatedly calculate:
  //
  //    t1 := t % r       |t1| is |t|'s lowest limb (see previous paragraph).
  //    t2 := t1*n0*n
  //    t3 := t + t2
  //    t  := t3 / r      copy all limbs of |t3| except the lowest to |t|.
  //
  // In the last step, it would only make sense to ignore the lowest limb of
  // |t3| if it were zero. The middle steps ensure that this is the case:
  //
  //                      t3 ==  0 (mod r)
  //                  t + t2 ==  0 (mod r)
  //             t + t1*n0*n ==  0 (mod r)
  //                 t1*n0*n == -t (mod r)
  //                  t*n0*n == -t (mod r)
  //                    n0*n == -1 (mod r)
  //                      n0 == -1/n (mod r)
  //
  // Thus, in each iteration of the loop, we multiply by the constant factor
  // |n0|, the negative inverse of n (mod r).

  // n_mod_r = n % r. As explained above, this is done by taking the lowest
  // |BN_MONT_CTX_N0_LIMBS| limbs of |n|.
  uint64_t n_mod_r = n->d[0];
#if BN_MONT_CTX_N0_LIMBS == 2
  if (n->width > 1) {
    n_mod_r |= (uint64_t)n->d[1] << BN_BITS2;
  }
#endif

  return bn_neg_inv_mod_r_u64(n_mod_r);
}
// bn_neg_inv_mod_r_u64 calculates -1/n mod r; i.e. it calculates |v| such
// that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| must
// be odd.
//
// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery
// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf).
// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and
// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000"
// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21).
//
// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion"
// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is
// constant-time with respect to |n|. We assume uint64_t additions,
// subtractions, shifts, and bitwise operations are all constant time, which
// may be a large leap of faith on 32-bit targets. We avoid division and
// multiplication, which tend to be the most problematic in terms of timing
// leaks.
//
// Most GCD implementations return values such that |u*r + v*n == 1|, so the
// caller would have to negate the resultant |v| for the purpose of Montgomery
// multiplication. This implementation does the negation implicitly by doing
// the computations as a difference instead of a sum.
static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) {
  assert(n % 2 == 1);

  // alpha == 2**(lg r - 1) == r / 2.
  static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1);

  const uint64_t beta = n;

  uint64_t u = 1;
  uint64_t v = 0;

  // The invariant maintained from here on is:
  //   2**(lg r - i) == u*2*alpha - v*beta.
  for (size_t i = 0; i < LG_LITTLE_R; ++i) {
#if BN_BITS2 == 64 && defined(BN_ULLONG)
    assert((BN_ULLONG)(1) << (LG_LITTLE_R - i) ==
           ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

    // Delete a common factor of 2 in u and v if |u| is even. Otherwise, set
    // |u = (u + beta) / 2| and |v = (v / 2) + alpha|.

    uint64_t u_is_odd = UINT64_C(0) - (u & 1);  // Either 0xff..ff or 0.

    // The addition can overflow, so use Dietz's method for it.
    //
    // Dietz calculates (x+y)/2 by (x⊕y)>>1 + x&y. This is valid for all
    // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values
    // (embedded in 64 bits so that overflow can be ignored):
    //
    //    (declare-fun x () (_ BitVec 64))
    //    (declare-fun y () (_ BitVec 64))
    //    (assert (let (
    //       (one (_ bv1 64))
    //       (thirtyTwo (_ bv32 64)))
    //       (and
    //         (bvult x (bvshl one thirtyTwo))
    //         (bvult y (bvshl one thirtyTwo))
    //         (not (=
    //           (bvadd (bvlshr (bvxor x y) one) (bvand x y))
    //           (bvlshr (bvadd x y) one)))
    //     )))
    //    (check-sat)
    uint64_t beta_if_u_is_odd = beta & u_is_odd;  // Either |beta| or 0.
    u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd);

    uint64_t alpha_if_u_is_odd = alpha & u_is_odd;  // Either |alpha| or 0.
    v = (v >> 1) + alpha_if_u_is_odd;
  }

  // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha.
#if BN_BITS2 == 64 && defined(BN_ULLONG)
  declassify_assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta));
#endif

  return v;
}
// bn_mont_ctx_set_RR_consttime computes |mont->RR| = R^2 mod |mont->N|, where
// R = 2^(mont->N.width * BN_BITS2), taking time that depends only on the
// public width of the modulus. Returns one on success and zero on error.
int bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx) {
  assert(!BN_is_zero(&mont->N));
  assert(!BN_is_negative(&mont->N));
  assert(BN_is_odd(&mont->N));
  assert(bn_minimal_width(&mont->N) == mont->N.width);

  unsigned n_bits = BN_num_bits(&mont->N);
  assert(n_bits != 0);
  if (n_bits == 1) {
    // |N| is one (it is positive and odd), so everything is congruent to zero.
    BN_zero(&mont->RR);
    return bn_resize_words(&mont->RR, mont->N.width);
  }

  unsigned lgBigR = mont->N.width * BN_BITS2;
  assert(lgBigR >= n_bits);

  // RR is R, or 2^lgBigR, in the Montgomery domain. We can compute 2 in the
  // Montgomery domain, 2R or 2^(lgBigR+1), and then use Montgomery
  // square-and-multiply to exponentiate.
  //
  // The square steps take 2^n R to (2^n)*(2^n) R = 2^2n R. This is the same as
  // doubling 2^n R, n times (doubling any x, n times, computes 2^n * x). When n
  // is below some threshold, doubling is faster; when above, squaring is
  // faster. From benchmarking various 32-bit and 64-bit architectures, the word
  // count seems to work well as a threshold. (Doubling scales linearly and
  // Montgomery reduction scales quadratically, so the threshold should scale
  // roughly linearly.)
  //
  // The multiply steps take 2^n R to 2*2^n R = 2^(n+1) R. It is faster to
  // double the value instead, so the square-and-multiply exponentiation would
  // become square-and-double. However, when using the word count as the
  // threshold, it turns out that no multiply/double steps will be needed at
  // all, because squaring any x, i times, computes x^(2^i):
  //
  //   (2^threshold)^(2^BN_BITS2_LG) R
  //   (2^mont->N.width)^BN_BITS2 R
  // = 2^(mont->N.width*BN_BITS2) R
  // = 2^lgBigR R
  // = RR
  int threshold = mont->N.width;

  // Calculate 2^threshold R = 2^(threshold + lgBigR) by doubling. The
  // first n_bits - 1 doubles can be skipped because we don't need to reduce.
  if (!BN_set_bit(&mont->RR, n_bits - 1) ||
      !bn_mod_lshift_consttime(&mont->RR, &mont->RR,
                               threshold + (lgBigR - (n_bits - 1)),
                               &mont->N, ctx)) {
    return 0;
  }

  // The above steps are the same regardless of the threshold. The steps below
  // need to be modified if the threshold changes.
  assert(threshold == mont->N.width);
  // Square BN_BITS2_LG times, which raises 2^threshold R to the BN_BITS2
  // power, yielding RR as shown in the derivation above.
  for (unsigned i = 0; i < BN_BITS2_LG; i++) {
    if (!BN_mod_mul_montgomery(&mont->RR, &mont->RR, &mont->RR, mont, ctx)) {
      return 0;
    }
  }

  return bn_resize_words(&mont->RR, mont->N.width);
}

View File

@@ -0,0 +1,692 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
#define BN_MUL_RECURSIVE_SIZE_NORMAL 16
#define BN_SQR_RECURSIVE_SIZE_NORMAL BN_MUL_RECURSIVE_SIZE_NORMAL
// bn_abs_sub_words sets |r| to |a - b|, in absolute value, over |num| words.
// |tmp| is |num| words of scratch. Both differences are computed and the
// correct one is chosen by mask, so the memory-access pattern does not depend
// on which operand is larger.
static void bn_abs_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             size_t num, BN_ULONG *tmp) {
  // |tmp| = a - b; |borrowed| is one exactly when a < b.
  BN_ULONG borrowed = bn_sub_words(tmp, a, b, num);
  // |r| = b - a, the value we want when a < b.
  bn_sub_words(r, b, a, num);
  // All-ones mask when a < b: keep |r| in that case, otherwise take |tmp|.
  BN_ULONG mask = 0 - borrowed;
  bn_select_words(r, mask, r, tmp, num);
}
// bn_mul_normal computes the full |na| + |nb| word product of |a| (|na|
// words) and |b| (|nb| words) into |r| by schoolbook multiplication. |r| must
// not alias the inputs (callers such as |bn_mul_impl| arrange this).
static void bn_mul_normal(BN_ULONG *r, const BN_ULONG *a, size_t na,
                          const BN_ULONG *b, size_t nb) {
  // Ensure |a| is the longer operand so the inner word loops (over |a|) are
  // the long ones.
  if (na < nb) {
    size_t itmp = na;
    na = nb;
    nb = itmp;
    const BN_ULONG *ltmp = a;
    a = b;
    b = ltmp;
  }
  // |rr| points one word past the current |na|-word window of |r|; each row's
  // carry-out word lands there.
  BN_ULONG *rr = &(r[na]);
  if (nb == 0) {
    // One operand is zero-width; the |na|-word product is zero.
    OPENSSL_memset(r, 0, na * sizeof(BN_ULONG));
    return;
  }

  // Row 0 initializes |r|; subsequent rows accumulate into it. The row loop
  // is unrolled four at a time, advancing the window pointers between groups.
  rr[0] = bn_mul_words(r, a, na, b[0]);

  for (;;) {
    if (--nb == 0) {
      return;
    }
    rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
    if (--nb == 0) {
      return;
    }
    rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
    if (--nb == 0) {
      return;
    }
    rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
    if (--nb == 0) {
      return;
    }
    rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
    rr += 4;
    r += 4;
    b += 4;
  }
}
// bn_sub_part_words sets |r| to |a| - |b| and returns the borrow bit: one if
// the subtraction underflowed and zero otherwise. |cl| is the common length
// (the shorter of len(a) and len(b), non-negative) and |dl| is the delta
// length, len(a) - len(b). |r| has cl + abs(dl) words; words beyond the
// shorter operand are treated as zero.
//
// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
// is confusing.
static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                  const BN_ULONG *b, int cl, int dl) {
  assert(cl >= 0);

  // Subtract the common-length prefix.
  BN_ULONG borrow = bn_sub_words(r, a, b, cl);
  if (dl == 0) {
    return borrow;
  }

  // Continue past the prefix, padding the shorter operand with zeros.
  r += cl;
  a += cl;
  b += cl;
  if (dl > 0) {
    // |a| has |dl| extra words.
    for (int i = 0; i < dl; i++) {
      r[i] = CRYPTO_subc_w(a[i], 0, borrow, &borrow);
    }
  } else {
    // |b| has -|dl| extra words.
    for (int i = 0; i < -dl; i++) {
      r[i] = CRYPTO_subc_w(0, b[i], borrow, &borrow);
    }
  }
  return borrow;
}
// bn_abs_sub_part_words computes |r| = |a - b| in absolute value, returning a
// mask of all ones when the true difference was negative and all zeros when
// it was non-negative. |cl| and |dl| follow the |bn_sub_part_words| calling
// convention, and |tmp| is cl + abs(dl) words of scratch. Both orderings of
// the subtraction are computed and one is selected by mask, keeping the
// access pattern independent of the operand values.
//
// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
// is confusing.
static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                      const BN_ULONG *b, int cl, int dl,
                                      BN_ULONG *tmp) {
  // |tmp| = a - b; underflow means a < b.
  BN_ULONG underflow = bn_sub_part_words(tmp, a, b, cl, dl);
  // |r| = b - a, the candidate for the a < b case.
  bn_sub_part_words(r, b, a, cl, -dl);

  int out_len = cl + (dl < 0 ? -dl : dl);
  BN_ULONG sign_mask = 0 - underflow;
  // Keep |r| (b - a) when a < b; otherwise take |tmp| (a - b).
  bn_select_words(r, sign_mask, r, tmp, out_len);
  return sign_mask;
}
// bn_abs_sub_consttime sets |r| to |a - b|, in absolute value, without
// leaking which operand was larger. Returns one on success and zero on
// allocation failure.
int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                         BN_CTX *ctx) {
  int common = a->width < b->width ? a->width : b->width;
  int delta = a->width - b->width;
  int out_width = a->width > b->width ? a->width : b->width;

  BN_CTX_start(ctx);
  BIGNUM *scratch = BN_CTX_get(ctx);
  int ok = scratch != NULL &&
           bn_wexpand(r, out_width) &&
           bn_wexpand(scratch, out_width);
  if (ok) {
    bn_abs_sub_part_words(r->d, a->d, b->d, common, delta, scratch->d);
    r->width = out_width;
  }
  BN_CTX_end(ctx);
  return ok;
}
// Karatsuba recursive multiplication algorithm
// (cf. Knuth, The Art of Computer Programming, Vol. 2)

// bn_mul_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| has
// length 2*|n2|, |a| has length |n2| + |dna|, |b| has length |n2| + |dnb|, and
// |t| has length 4*|n2|. |n2| must be a power of two. Finally, we must have
// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dna| <= 0 and
// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dnb| <= 0.
//
// TODO(davidben): Simplify and |size_t| the calling convention around lengths
// here.
static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                             int n2, int dna, int dnb, BN_ULONG *t) {
  // |n2| is a power of two.
  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);
  // Check |dna| and |dnb| are in range.
  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dna && dna <= 0);
  assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dnb && dnb <= 0);

  // Only call bn_mul_comba 8 if n2 == 8 and the
  // two arrays are complete [steve]
  if (n2 == 8 && dna == 0 && dnb == 0) {
    bn_mul_comba8(r, a, b);
    return;
  }

  // Else do normal multiply
  if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
    bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
    if (dna + dnb < 0) {
      // Shortened inputs leave the top -(dna + dnb) words of the 2*|n2|-word
      // output untouched; zero them.
      OPENSSL_memset(&r[2 * n2 + dna + dnb], 0,
                     sizeof(BN_ULONG) * -(dna + dnb));
    }
    return;
  }

  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1
  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
  //
  //   a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0
  //
  // Note that we know |n| >= |BN_MUL_RECURSIVE_SIZE_NORMAL|/2 above, so
  // |tna| and |tnb| are non-negative.
  int n = n2 / 2, tna = n + dna, tnb = n + dnb;

  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
  // themselves store the absolute value.
  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);

  // Compute:
  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
  // r0,r1 = a0 * b0
  // r2,r3 = a1 * b1
  if (n == 4 && dna == 0 && dnb == 0) {
    bn_mul_comba4(&t[n2], t, &t[n]);
    bn_mul_comba4(r, a, b);
    bn_mul_comba4(&r[n2], &a[n], &b[n]);
  } else if (n == 8 && dna == 0 && dnb == 0) {
    bn_mul_comba8(&t[n2], t, &t[n]);
    bn_mul_comba8(r, a, b);
    bn_mul_comba8(&r[n2], &a[n], &b[n]);
  } else {
    BN_ULONG *p = &t[n2 * 2];
    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
    bn_mul_recursive(r, a, b, n, 0, 0, p);
    bn_mul_recursive(&r[n2], &a[n], &b[n], n, dna, dnb, p);
  }

  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
  // The second term is stored as the absolute value, so we do this with a
  // constant-time select.
  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  c = constant_time_select_w(neg, c_neg, c_pos);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (int i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The product should fit without carries.
  declassify_assert(c == 0);
}
// bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r|
// has length 4*|n|, |a| has length |n| + |tna|, |b| has length |n| + |tnb|, and
// |t| has length 8*|n|. |n| must be a power of two. Additionally, we must have
// 0 <= tna < n and 0 <= tnb < n, and |tna| and |tnb| must differ by at most
// one.
//
// TODO(davidben): Make this take |size_t| and perhaps the actual lengths of |a|
// and |b|.
static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a,
                                  const BN_ULONG *b, int n, int tna, int tnb,
                                  BN_ULONG *t) {
  // |n| is a power of two.
  assert(n != 0 && (n & (n - 1)) == 0);
  // Check |tna| and |tnb| are in range.
  assert(0 <= tna && tna < n);
  assert(0 <= tnb && tnb < n);
  assert(-1 <= tna - tnb && tna - tnb <= 1);

  int n2 = n * 2;
  if (n < 8) {
    // Schoolbook multiplication writes (n + tna) + (n + tnb) =
    // |n2| + |tna| + |tnb| words of the 2*|n2|-word output; the remaining
    // high words must be zeroed. Note |OPENSSL_memset| takes a byte count, so
    // the word count is scaled by |sizeof(BN_ULONG)|. (The unscaled count was
    // a bug: it zeroed only a fraction of the tail, leaving uninitialized
    // high words in the product.)
    bn_mul_normal(r, a, n + tna, b, n + tnb);
    OPENSSL_memset(r + n2 + tna + tnb, 0,
                   sizeof(BN_ULONG) * (n2 - tna - tnb));
    return;
  }

  // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|. |a1|
  // and |b1| have size |tna| and |tnb|, respectively.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1
  // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as:
  //
  //   a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0

  // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR
  // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1
  // themselves store the absolute value.
  BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]);
  neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]);

  // Compute:
  // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)|
  // r0,r1 = a0 * b0
  // r2,r3 = a1 * b1
  if (n == 8) {
    bn_mul_comba8(&t[n2], t, &t[n]);
    bn_mul_comba8(r, a, b);

    bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
    // |bn_mul_normal| only writes |tna| + |tnb| words. Zero the rest.
    OPENSSL_memset(&r[n2 + tna + tnb], 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
  } else {
    BN_ULONG *p = &t[n2 * 2];
    bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p);
    bn_mul_recursive(r, a, b, n, 0, 0, p);

    OPENSSL_memset(&r[n2], 0, sizeof(BN_ULONG) * n2);
    if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
        tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
      bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb);
    } else {
      int i = n;
      for (;;) {
        i /= 2;
        if (i < tna || i < tnb) {
          // E.g., n == 16, i == 8 and tna == 11. |tna| and |tnb| are within one
          // of each other, so if |tna| is larger and tna > i, then we know
          // tnb >= i, and this call is valid.
          bn_mul_part_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
          break;
        }
        if (i == tna || i == tnb) {
          // If there is only a bottom half to the number, just do it. We know
          // the larger of |tna - i| and |tnb - i| is zero. The other is zero or
          // -1 by because of |tna| and |tnb| differ by at most one.
          bn_mul_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p);
          break;
        }

        // This loop will eventually terminate when |i| falls below
        // |BN_MUL_RECURSIVE_SIZE_NORMAL| because we know one of |tna| and |tnb|
        // exceeds that.
      }
    }
  }

  // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);

  // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0.
  // The second term is stored as the absolute value, so we do this with a
  // constant-time select.
  BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2);
  BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2);
  bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2);
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  c = constant_time_select_w(neg, c_neg, c_pos);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (int i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The product should fit without carries.
  declassify_assert(c == 0);
}
// bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. Note this function
// breaks |BIGNUM| invariants and may return a negative zero. This is handled
// by the callers. It selects between comba, Karatsuba, and schoolbook
// multiplication based on the operand widths.
static int bn_mul_impl(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                       BN_CTX *ctx) {
  int al = a->width;
  int bl = b->width;
  if (al == 0 || bl == 0) {
    BN_zero(r);
    return 1;
  }

  int ret = 0;
  BIGNUM *rr;
  BN_CTX_start(ctx);
  // Multiply into a temporary when the output aliases an input; the
  // multiplication routines read the inputs while writing the product.
  if (r == a || r == b) {
    rr = BN_CTX_get(ctx);
    if (rr == NULL) {
      goto err;
    }
  } else {
    rr = r;
  }
  rr->neg = a->neg ^ b->neg;

  int i = al - bl;
  if (i == 0) {
    // Equal 8-word inputs use the dedicated comba kernel.
    if (al == 8) {
      if (!bn_wexpand(rr, 16)) {
        goto err;
      }
      rr->width = 16;
      bn_mul_comba8(rr->d, a->d, b->d);
      goto end;
    }
  }

  int top = al + bl;
  static const int kMulNormalSize = 16;
  if (al >= kMulNormalSize && bl >= kMulNormalSize) {
    // Karatsuba applies only when the widths are within one of each other.
    if (-1 <= i && i <= 1) {
      // Find the largest power of two less than or equal to the larger length.
      int j;
      if (i >= 0) {
        j = BN_num_bits_word((BN_ULONG)al);
      } else {
        j = BN_num_bits_word((BN_ULONG)bl);
      }
      j = 1 << (j - 1);
      assert(j <= al || j <= bl);
      BIGNUM *t = BN_CTX_get(ctx);
      if (t == NULL) {
        goto err;
      }
      if (al > j || bl > j) {
        // We know |al| and |bl| are at most one from each other, so if al > j,
        // bl >= j, and vice versa. Thus we can use |bn_mul_part_recursive|.
        //
        // TODO(davidben): This codepath is almost unused in standard
        // algorithms. Is this optimization necessary? See notes in
        // https://boringssl-review.googlesource.com/q/I0bd604e2cd6a75c266f64476c23a730ca1721ea6
        assert(al >= j && bl >= j);
        if (!bn_wexpand(t, j * 8) ||
            !bn_wexpand(rr, j * 4)) {
          goto err;
        }
        bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      } else {
        // al <= j && bl <= j. Additionally, we know j <= al or j <= bl, so one
        // of al - j or bl - j is zero. The other, by the bound on |i| above, is
        // zero or -1. Thus, we can use |bn_mul_recursive|.
        if (!bn_wexpand(t, j * 4) ||
            !bn_wexpand(rr, j * 2)) {
          goto err;
        }
        bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
      }
      rr->width = top;
      goto end;
    }
  }

  // Fall back to schoolbook multiplication for everything else.
  if (!bn_wexpand(rr, top)) {
    goto err;
  }
  rr->width = top;
  bn_mul_normal(rr->d, a->d, al, b->d, bl);

end:
  if (r != rr && !BN_copy(r, rr)) {
    goto err;
  }
  ret = 1;

err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_mul sets |r| = |a| * |b|, with the sign following the usual sign rules.
// Returns one on success and zero on error.
int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  int ok = bn_mul_impl(r, a, b, ctx);
  if (ok) {
    // Trim leading zero words; this also repairs any negative zero produced
    // by |bn_mul_impl|.
    bn_set_minimal_width(r);
  }
  return ok;
}
// bn_mul_consttime computes |r| = |a| * |b|, treating the operand widths (but
// not the values) as public. Negative inputs are rejected because allowing
// them would let |bn_mul_impl| produce a negative zero, which breaks |BIGNUM|
// invariants. Returns one on success and zero on error.
int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
  // Prevent negative zeros.
  if (a->neg || b->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }

  return bn_mul_impl(r, a, b, ctx);
}
// bn_mul_small writes the full product of the |num_a|-word |a| and the
// |num_b|-word |b| to |r|, where |num_r| must equal |num_a| + |num_b|.
// Aborts on a size mismatch, which indicates a caller bug.
void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a,
                  const BN_ULONG *b, size_t num_b) {
  if (num_a + num_b != num_r) {
    abort();
  }

  // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not
  // hit that code.
  if (num_a != 8 || num_b != 8) {
    bn_mul_normal(r, a, num_a, b, num_b);
  } else {
    bn_mul_comba8(r, a, b);
  }
}
// bn_sqr_normal computes the 2*|n|-word square of the |n|-word |a| into |r|
// by schoolbook multiplication, exploiting symmetry: each cross product
// a[i]*a[j] with i < j is computed once and doubled, then the diagonal
// a[i]^2 terms are added. |tmp| must have 2*|n| words, and |r| must not alias
// |a| or |tmp|.
static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n,
                          BN_ULONG *tmp) {
  if (n == 0) {
    // Zero-width input produces zero output words; nothing to do.
    return;
  }

  size_t max = n * 2;
  const BN_ULONG *ap = a;
  BN_ULONG *rp = r;
  // The lowest and highest words receive no cross-product contribution;
  // clear them up front.
  rp[0] = rp[max - 1] = 0;
  rp++;

  // Compute the contribution of a[i] * a[j] for all i < j.
  if (n > 1) {
    // Row 0: a[0] times a[1..n-1], initializing the cross-product area.
    ap++;
    rp[n - 1] = bn_mul_words(rp, ap, n - 1, ap[-1]);
    rp += 2;
  }

  if (n > 2) {
    // Rows 1..n-2: a[i] times a[i+1..n-1], accumulated two words further on
    // each time.
    for (size_t i = n - 2; i > 0; i--) {
      ap++;
      rp[i] = bn_mul_add_words(rp, ap, i, ap[-1]);
      rp += 2;
    }
  }

  // The final result fits in |max| words, so none of the following operations
  // will overflow.

  // Double |r|, giving the contribution of a[i] * a[j] for all i != j.
  bn_add_words(r, r, r, max);

  // Add in the contribution of a[i] * a[i] for all i.
  bn_sqr_words(tmp, a, n);
  bn_add_words(r, r, tmp, max);
}
// bn_sqr_recursive sets |r| to |a|^2, using |t| as scratch space. |r| has
// length 2*|n2|, |a| has length |n2|, and |t| has length 4*|n2|. |n2| must be
// a power of two. This is the Karatsuba recursion specialized for squaring.
static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, size_t n2,
                             BN_ULONG *t) {
  // |n2| is a power of two.
  assert(n2 != 0 && (n2 & (n2 - 1)) == 0);

  // Base cases: dedicated comba kernels for 4 and 8 words, schoolbook below
  // the recursion threshold.
  if (n2 == 4) {
    bn_sqr_comba4(r, a);
    return;
  }
  if (n2 == 8) {
    bn_sqr_comba8(r, a);
    return;
  }
  if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
    bn_sqr_normal(r, a, n2, t);
    return;
  }

  // Split |a| into a0,a1, each of size |n|.
  // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used
  // for recursive calls.
  // Split |r| into r0,r1,r2,r3. We must contribute a0^2 to r0,r1, 2*a0*a1 to
  // r1,r2, and a1^2 to r2,r3.
  size_t n = n2 / 2;
  BN_ULONG *t_recursive = &t[n2 * 2];

  // t0 = |a0 - a1|.
  bn_abs_sub_words(t, a, &a[n], n, &t[n]);
  // t2,t3 = t0^2 = |a0 - a1|^2 = a0^2 - 2*a0*a1 + a1^2
  bn_sqr_recursive(&t[n2], t, n, t_recursive);

  // r0,r1 = a0^2
  bn_sqr_recursive(r, a, n, t_recursive);

  // r2,r3 = a1^2
  bn_sqr_recursive(&r[n2], &a[n], n, t_recursive);

  // t0,t1,c = r0,r1 + r2,r3 = a0^2 + a1^2
  BN_ULONG c = bn_add_words(t, r, &r[n2], n2);
  // t2,t3,c = t0,t1,c - t2,t3 = 2*a0*a1
  c -= bn_sub_words(&t[n2], t, &t[n2], n2);

  // We now have our three components. Add them together.
  // r1,r2,c = r1,r2 + t2,t3,c
  c += bn_add_words(&r[n], &r[n], &t[n2], n2);

  // Propagate the carry bit to the end.
  for (size_t i = n + n2; i < n2 + n2; i++) {
    BN_ULONG old = r[i];
    r[i] = old + c;
    c = r[i] < old;
  }

  // The square should fit without carries.
  assert(c == 0);
}
// BN_mul_word multiplies |bn| by the single word |w| in place. Returns one on
// success and zero on allocation failure.
int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
  // Zero times anything is zero, and |bn| is already zero when empty.
  if (bn->width == 0) {
    return 1;
  }
  if (w == 0) {
    BN_zero(bn);
    return 1;
  }

  // Multiply in place; |carry| is the word that overflows the current width.
  BN_ULONG carry = bn_mul_words(bn->d, bn->d, bn->width, w);
  if (carry == 0) {
    return 1;
  }

  // Grow by one word to hold the carry.
  if (!bn_wexpand(bn, bn->width + 1)) {
    return 0;
  }
  bn->d[bn->width] = carry;
  bn->width++;
  return 1;
}
// bn_sqr_consttime sets |r| = |a|^2, running in time that depends only on the
// public width of |a|, not its value. The result width is exactly
// 2 * a->width and may include leading zeros. Returns one on success and zero
// on error.
int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
  int al = a->width;
  if (al <= 0) {
    r->width = 0;
    r->neg = 0;
    return 1;
  }

  int ret = 0;
  BN_CTX_start(ctx);
  // Square into a temporary when the output aliases the input.
  BIGNUM *rr = (a != r) ? r : BN_CTX_get(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (!rr || !tmp) {
    goto err;
  }

  int max = 2 * al;  // Non-zero (from above)
  if (!bn_wexpand(rr, max)) {
    goto err;
  }

  if (al == 4) {
    bn_sqr_comba4(rr->d, a->d);
  } else if (al == 8) {
    bn_sqr_comba8(rr->d, a->d);
  } else {
    if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
      // Small inputs only need a fixed-size stack scratch buffer.
      BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
      bn_sqr_normal(rr->d, a->d, al, t);
    } else {
      // If |al| is a power of two, we can use |bn_sqr_recursive|.
      if (al != 0 && (al & (al - 1)) == 0) {
        if (!bn_wexpand(tmp, al * 4)) {
          goto err;
        }
        bn_sqr_recursive(rr->d, a->d, al, tmp->d);
      } else {
        // Otherwise fall back to schoolbook squaring with heap scratch.
        if (!bn_wexpand(tmp, max)) {
          goto err;
        }
        bn_sqr_normal(rr->d, a->d, al, tmp->d);
      }
    }
  }

  rr->neg = 0;
  rr->width = max;
  if (rr != r && !BN_copy(r, rr)) {
    goto err;
  }
  ret = 1;

err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_sqr computes r = a^2 and then strips leading zero words so the result
// has minimal width. Returns one on success and zero on error.
int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
  int ok = bn_sqr_consttime(r, a, ctx);
  if (ok) {
    bn_set_minimal_width(r);
  }
  return ok;
}
// bn_sqr_small squares the |num_a|-word number at |a| into the |num_r|-word
// buffer at |r|. |num_r| must be exactly 2*|num_a| and |num_a| must not exceed
// |BN_SMALL_MAX_WORDS|; otherwise the process aborts.
void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) {
  if (num_a > BN_SMALL_MAX_WORDS || num_r != 2 * num_a) {
    abort();
  }
  switch (num_a) {
    case 4:
      bn_sqr_comba4(r, a);
      break;
    case 8:
      bn_sqr_comba8(r, a);
      break;
    default: {
      BN_ULONG scratch[2 * BN_SMALL_MAX_WORDS];
      bn_sqr_normal(r, a, num_a, scratch);
      // Scrub the scratch words that were used; |a| may be secret.
      OPENSSL_cleanse(scratch, 2 * num_a * sizeof(BN_ULONG));
      break;
    }
  }
}

View File

@@ -0,0 +1,988 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// kPrimes contains the first 1024 primes.
static const uint16_t kPrimes[] = {
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37,
41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89,
97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223,
227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359,
367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433,
439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593,
599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743,
751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827,
829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911,
919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997,
1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249,
1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439,
1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601,
1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777, 1783,
1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129, 2131, 2137, 2141, 2143,
2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347,
2351, 2357, 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543,
2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713,
2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3109, 3119,
3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323,
3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527,
3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697,
3701, 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073, 4079, 4091, 4093,
4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283,
4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493, 4507, 4513,
4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721,
4723, 4729, 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937,
4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101, 5107, 5113,
5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351,
5381, 5387, 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521, 5527, 5531,
5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743,
5749, 5779, 5783, 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939,
5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163, 6173,
6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359,
6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571, 6577, 6581,
6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779, 6781, 6791, 6793, 6803,
6823, 6827, 6829, 6833, 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997,
7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229,
7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487,
7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643, 7649, 7669,
7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853, 7867, 7873, 7877, 7879,
7883, 7901, 7907, 7919, 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111,
8117, 8123, 8147, 8161,
};
// BN_prime_checks_for_size returns the number of Miller-Rabin iterations
// necessary for generating a 'bits'-bit candidate prime.
//
//
// This table is generated using the algorithm of FIPS PUB 186-4
// Digital Signature Standard (DSS), section F.1, page 117.
// (https://doi.org/10.6028/NIST.FIPS.186-4)
// The following magma script was used to generate the output:
// securitybits:=125;
// k:=1024;
// for t:=1 to 65 do
// for M:=3 to Floor(2*Sqrt(k-1)-1) do
// S:=0;
// // Sum over m
// for m:=3 to M do
// s:=0;
// // Sum over j
// for j:=2 to m do
// s+:=(RealField(32)!2)^-(j+(k-1)/j);
// end for;
// S+:=2^(m-(m-1)*t)*s;
// end for;
// A:=2^(k-2-M*t);
// B:=8*(Pi(RealField(32))^2-6)/3*2^(k-2)*S;
// pkt:=2.00743*Log(2)*k*2^-k*(A+B);
// seclevel:=Floor(-Log(2,pkt));
// if seclevel ge securitybits then
// printf "k: %5o, security: %o bits (t: %o, M: %o)\n",k,seclevel,t,M;
// break;
// end if;
// end for;
// if seclevel ge securitybits then break; end if;
// end for;
//
// It can be run online at: http://magma.maths.usyd.edu.au/calc
// And will output:
// k: 1024, security: 129 bits (t: 6, M: 23)
// k is the number of bits of the prime, securitybits is the level we want to
// reach.
// prime length | RSA key size | # MR tests | security level
// -------------+--------------|------------+---------------
// (b) >= 6394 | >= 12788 | 3 | 256 bit
// (b) >= 3747 | >= 7494 | 3 | 192 bit
// (b) >= 1345 | >= 2690 | 4 | 128 bit
// (b) >= 1080 | >= 2160 | 5 | 128 bit
// (b) >= 852 | >= 1704 | 5 | 112 bit
// (b) >= 476 | >= 952 | 5 | 80 bit
// (b) >= 400 | >= 800 | 6 | 80 bit
// (b) >= 347 | >= 694 | 7 | 80 bit
// (b) >= 308 | >= 616 | 8 | 80 bit
// (b) >= 55 | >= 110 | 27 | 64 bit
// (b) >= 6 | >= 12 | 34 | 64 bit
// BN_prime_checks_for_size returns the number of Miller-Rabin iterations for a
// |bits|-bit candidate prime, per the table derived above: the first threshold
// that |bits| meets selects the iteration count, with 34 iterations for very
// small candidates.
static int BN_prime_checks_for_size(int bits) {
  static const struct {
    int min_bits;
    int checks;
  } kChecksTable[] = {
      {3747, 3}, {1345, 4}, {476, 5}, {400, 6},
      {347, 7},  {308, 8},  {55, 27},
  };
  for (size_t i = 0; i < sizeof(kChecksTable) / sizeof(kChecksTable[0]); i++) {
    if (bits >= kChecksTable[i].min_bits) {
      return kChecksTable[i].checks;
    }
  }
  return 34;
}
// num_trial_division_primes returns the number of primes to try with trial
// division before using more expensive checks. For larger numbers, the value
// of excluding a candidate with trial division is larger, so candidates wider
// than 1024 bits use the full |kPrimes| table and smaller ones only the first
// half.
static size_t num_trial_division_primes(const BIGNUM *n) {
  const int is_large = n->width * BN_BITS2 > 1024;
  return is_large ? OPENSSL_ARRAY_SIZE(kPrimes)
                  : OPENSSL_ARRAY_SIZE(kPrimes) / 2;
}
// BN_PRIME_CHECKS_BLINDED is the iteration count for blinding the constant-time
// primality test. See |BN_primality_test| for details. This number is selected
// so that, for a candidate N-bit RSA prime, picking |BN_PRIME_CHECKS_BLINDED|
// random N-bit numbers will have at least |BN_prime_checks_for_size(N)| values
// in range with high probability.
//
// The following Python script computes the blinding factor needed for the
// corresponding iteration count.
/*
import math
# We choose candidate RSA primes between sqrt(2)/2 * 2^N and 2^N and select
# witnesses by generating random N-bit numbers. Thus the probability of
# selecting one in range is at least sqrt(2)/2.
p = math.sqrt(2) / 2
# Target around 2^-8 probability of the blinding being insufficient given that
# key generation is a one-time, noisy operation.
epsilon = 2**-8
def choose(a, b):
r = 1
for i in xrange(b):
r *= a - i
r /= (i + 1)
return r
def failure_rate(min_uniform, iterations):
""" Returns the probability that, for |iterations| candidate witnesses, fewer
than |min_uniform| of them will be uniform. """
prob = 0.0
for i in xrange(min_uniform):
prob += (choose(iterations, i) *
p**i * (1-p)**(iterations - i))
return prob
for min_uniform in (3, 4, 5, 6, 8, 13, 19, 28):
# Find the smallest number of iterations under the target failure rate.
iterations = min_uniform
while True:
prob = failure_rate(min_uniform, iterations)
if prob < epsilon:
print min_uniform, iterations, prob
break
iterations += 1
Output:
3 9 0.00368894873911
4 11 0.00363319494662
5 13 0.00336215573898
6 15 0.00300145783158
8 19 0.00225214119331
13 27 0.00385610026955
19 38 0.0021410539126
28 52 0.00325405801769
16 iterations suffices for 400-bit primes and larger (6 uniform samples needed),
which is already well below the minimum acceptable key size for RSA.
*/
#define BN_PRIME_CHECKS_BLINDED 16
static int probable_prime(BIGNUM *rnd, int bits);
static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
const BIGNUM *rem, BN_CTX *ctx);
static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
const BIGNUM *rem, BN_CTX *ctx);
// BN_GENCB_new allocates a zero-initialized BN_GENCB. Returns NULL on
// allocation failure.
BN_GENCB *BN_GENCB_new(void) {
  BN_GENCB *callback = OPENSSL_zalloc(sizeof(BN_GENCB));
  return callback;
}
// BN_GENCB_free releases |callback|. A NULL argument is a no-op.
void BN_GENCB_free(BN_GENCB *callback) {
  OPENSSL_free(callback);
}
// BN_GENCB_set configures |callback| to invoke the new-style function |f| with
// the opaque argument |arg|.
void BN_GENCB_set(BN_GENCB *callback,
                  int (*f)(int event, int n, struct bn_gencb_st *),
                  void *arg) {
  callback->arg = arg;
  callback->callback.new_style = f;
  callback->type = BN_GENCB_NEW_STYLE;
}
// BN_GENCB_set_old configures |callback| to invoke the old-style (void-return)
// function |f| with the opaque argument |arg|.
void BN_GENCB_set_old(BN_GENCB *callback,
                      void (*f)(int, int, void *), void *arg) {
  callback->arg = arg;
  callback->callback.old_style = f;
  callback->type = BN_GENCB_OLD_STYLE;
}
// BN_GENCB_call dispatches |event| and |n| to |callback|. A NULL callback is
// treated as success. New-style callbacks return their own verdict; old-style
// callbacks cannot fail, so their result is always one. An unknown type
// returns zero.
int BN_GENCB_call(BN_GENCB *callback, int event, int n) {
  if (callback == NULL) {
    return 1;
  }
  switch (callback->type) {
    case BN_GENCB_NEW_STYLE:
      return callback->callback.new_style(event, n, callback);
    case BN_GENCB_OLD_STYLE:
      callback->callback.old_style(event, n, callback);
      return 1;
    default:
      return 0;
  }
}
// BN_GENCB_get_arg returns the opaque argument stored by |BN_GENCB_set| or
// |BN_GENCB_set_old|.
void *BN_GENCB_get_arg(const BN_GENCB *callback) {
  return callback->arg;
}
// BN_generate_prime_ex writes a probable prime of |bits| bits to |ret|,
// returning one on success and zero on error or callback abort. If |add| is
// non-NULL, the result additionally satisfies ret mod add == rem (or
// ret mod add == 1 when |rem| is NULL). If |safe| is non-zero, the result p is
// a "safe prime": (p-1)/2 is also tested for primality. |cb|, if non-NULL,
// receives BN_GENCB_GENERATED / BN_GENCB_PRIME_TEST progress events and may
// abort the search by returning zero.
int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
                         const BIGNUM *rem, BN_GENCB *cb) {
  BIGNUM *t;
  int found = 0;
  int i, j, c1 = 0;
  BN_CTX *ctx;
  int checks = BN_prime_checks_for_size(bits);
  if (bits < 2) {
    // There are no prime numbers this small.
    OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL);
    return 0;
  } else if (bits == 2 && safe) {
    // The smallest safe prime (7) is three bits.
    OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL);
    return 0;
  }
  ctx = BN_CTX_new();
  if (ctx == NULL) {
    goto err;
  }
  BN_CTX_start(ctx);
  // |t| holds (p-1)/2 in the safe-prime path below.
  t = BN_CTX_get(ctx);
  if (!t) {
    goto err;
  }
loop:
  // make a random number and set the top and bottom bits
  if (add == NULL) {
    if (!probable_prime(ret, bits)) {
      goto err;
    }
  } else {
    if (safe) {
      if (!probable_prime_dh_safe(ret, bits, add, rem, ctx)) {
        goto err;
      }
    } else {
      if (!probable_prime_dh(ret, bits, add, rem, ctx)) {
        goto err;
      }
    }
  }
  if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) {
    // aborted
    goto err;
  }
  if (!safe) {
    i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
    if (i == -1) {
      goto err;
    } else if (i == 0) {
      // Composite; draw a fresh candidate.
      goto loop;
    }
  } else {
    // for "safe prime" generation, check that (p-1)/2 is prime. Since a prime
    // is odd, we just need to divide by 2
    if (!BN_rshift1(t, ret)) {
      goto err;
    }
    // Interleave |ret| and |t|'s primality tests to avoid paying the full
    // iteration count on |ret| only to quickly discover |t| is composite.
    //
    // TODO(davidben): This doesn't quite work because an iteration count of 1
    // still runs the blinding mechanism.
    for (i = 0; i < checks; i++) {
      j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, NULL);
      if (j == -1) {
        goto err;
      } else if (j == 0) {
        goto loop;
      }
      j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, NULL);
      if (j == -1) {
        goto err;
      } else if (j == 0) {
        goto loop;
      }
      if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i)) {
        goto err;
      }
      // We have a safe prime test pass
    }
  }
  // we have a prime :-)
  found = 1;
err:
  // |ctx| may be NULL if |BN_CTX_new| failed above.
  if (ctx != NULL) {
    BN_CTX_end(ctx);
    BN_CTX_free(ctx);
  }
  return found;
}
// bn_trial_division searches the small-prime table for a divisor of |bn|. On a
// hit it writes the divisor to |*out| and returns one; it returns zero when no
// table prime divides |bn|. Index 0 (the prime two) is skipped; callers pass
// odd candidates.
static int bn_trial_division(uint16_t *out, const BIGNUM *bn) {
  const size_t num_primes = num_trial_division_primes(bn);
  for (size_t i = 1; i < num_primes; i++) {
    uint16_t p = kPrimes[i];
    // During RSA key generation, |bn| may be secret, but only if |bn| was
    // prime, so it is safe to leak failed trial divisions.
    if (constant_time_declassify_int(bn_mod_u16_consttime(bn, p) == 0)) {
      *out = p;
      return 1;
    }
  }
  return 0;
}
// bn_odd_number_is_obviously_composite returns one if the odd number |bn| has
// a small-prime divisor other than itself and zero otherwise.
int bn_odd_number_is_obviously_composite(const BIGNUM *bn) {
  uint16_t divisor;
  if (!bn_trial_division(&divisor, bn)) {
    return 0;
  }
  // A table hit only proves compositeness when |bn| is not that prime itself.
  return !BN_is_word(bn, divisor);
}
// bn_miller_rabin_init precomputes, for the modulus w captured in |mont|, the
// values shared by every Miller-Rabin iteration: w1 = w-1, the decomposition
// w-1 = m * 2^a, the bit length of w, and the Montgomery forms of 1 and w-1.
// Returns one on success and zero on error. Allocations are made in the
// caller's |BN_CTX| scope so they outlive this function.
int bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin, const BN_MONT_CTX *mont,
                         BN_CTX *ctx) {
  // This function corresponds to steps 1 through 3 of FIPS 186-4, C.3.1.
  const BIGNUM *w = &mont->N;
  // Note we do not call |BN_CTX_start| in this function. We intentionally
  // allocate values in the containing scope so they outlive this function.
  miller_rabin->w1 = BN_CTX_get(ctx);
  miller_rabin->m = BN_CTX_get(ctx);
  miller_rabin->one_mont = BN_CTX_get(ctx);
  miller_rabin->w1_mont = BN_CTX_get(ctx);
  if (miller_rabin->w1 == NULL ||
      miller_rabin->m == NULL ||
      miller_rabin->one_mont == NULL ||
      miller_rabin->w1_mont == NULL) {
    return 0;
  }
  // See FIPS 186-4, C.3.1, steps 1 through 3.
  // w1 = w - 1; a = number of trailing zero bits of w1; m = w1 >> a.
  if (!bn_usub_consttime(miller_rabin->w1, w, BN_value_one())) {
    return 0;
  }
  miller_rabin->a = BN_count_low_zero_bits(miller_rabin->w1);
  if (!bn_rshift_secret_shift(miller_rabin->m, miller_rabin->w1,
                              miller_rabin->a, ctx)) {
    return 0;
  }
  miller_rabin->w_bits = BN_num_bits(w);
  // Precompute some values in Montgomery form.
  if (!bn_one_to_montgomery(miller_rabin->one_mont, mont, ctx) ||
      // w - 1 is -1 mod w, so we can compute it in the Montgomery domain, -R,
      // with a subtraction. (|one_mont| cannot be zero.)
      !bn_usub_consttime(miller_rabin->w1_mont, w, miller_rabin->one_mont)) {
    return 0;
  }
  return 1;
}
// bn_miller_rabin_iteration runs one Miller-Rabin round for witness |b|
// against the modulus captured in |mont|, using the precomputed values in
// |miller_rabin|. It sets |*out_is_possibly_prime| to one if |b| did not prove
// the modulus composite and zero otherwise. Returns one on success and zero on
// internal error. For prime moduli the loop runs to the |w_bits| bound so
// timing does not leak |a|; composite results may exit early.
int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin,
                              int *out_is_possibly_prime, const BIGNUM *b,
                              const BN_MONT_CTX *mont, BN_CTX *ctx) {
  // This function corresponds to steps 4.3 through 4.5 of FIPS 186-4, C.3.1.
  int ret = 0;
  BN_CTX_start(ctx);
  // Step 4.3. We use Montgomery-encoding for better performance and to avoid
  // timing leaks.
  const BIGNUM *w = &mont->N;
  BIGNUM *z = BN_CTX_get(ctx);
  if (z == NULL ||
      !BN_mod_exp_mont_consttime(z, b, miller_rabin->m, w, ctx, mont) ||
      !BN_to_montgomery(z, z, mont, ctx)) {
    goto err;
  }
  // is_possibly_prime is all ones if we have determined |b| is not a composite
  // witness for |w|. This is equivalent to going to step 4.7 in the original
  // algorithm. To avoid timing leaks, we run the algorithm to the end for prime
  // inputs.
  crypto_word_t is_possibly_prime = 0;
  // Step 4.4. If z = 1 or z = w-1, b is not a composite witness and w is still
  // possibly prime.
  is_possibly_prime = BN_equal_consttime(z, miller_rabin->one_mont) |
                      BN_equal_consttime(z, miller_rabin->w1_mont);
  is_possibly_prime = 0 - is_possibly_prime;  // Make it all zeros or all ones.
  // Step 4.5.
  //
  // To avoid leaking |a|, we run the loop to |w_bits| and mask off all
  // iterations once |j| = |a|.
  for (int j = 1; j < miller_rabin->w_bits; j++) {
    if (constant_time_declassify_w(constant_time_eq_int(j, miller_rabin->a) &
                                   ~is_possibly_prime)) {
      // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the
      // value is composite and we can break in variable time.
      break;
    }
    // Step 4.5.1.
    if (!BN_mod_mul_montgomery(z, z, z, mont, ctx)) {
      goto err;
    }
    // Step 4.5.2. If z = w-1 and the loop is not done, this is not a composite
    // witness.
    crypto_word_t z_is_w1_mont = BN_equal_consttime(z, miller_rabin->w1_mont);
    z_is_w1_mont = 0 - z_is_w1_mont;  // Make it all zeros or all ones.
    is_possibly_prime |= z_is_w1_mont;  // Go to step 4.7 if |z_is_w1_mont|.
    // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z
    // was not -1. There are no non-trivial square roots of 1 modulo a prime, so
    // w is composite and we may exit in variable time.
    if (constant_time_declassify_w(
            BN_equal_consttime(z, miller_rabin->one_mont) &
            ~is_possibly_prime)) {
      break;
    }
  }
  // Reduce the all-zeros/all-ones mask to a 0/1 public verdict.
  *out_is_possibly_prime = constant_time_declassify_w(is_possibly_prime) & 1;
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// BN_primality_test sets |*out_is_probably_prime| to one if |w| is probably
// prime and zero otherwise, running |checks| Miller-Rabin iterations (or a
// size-derived count when |checks| is |BN_prime_checks_for_generation|).
// |do_trial_division| enables a small-prime pre-filter. |ctx| may be NULL, in
// which case a temporary one is created. Returns one on success (a verdict was
// produced) and zero on internal error or callback abort.
int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks,
                      BN_CTX *ctx, int do_trial_division, BN_GENCB *cb) {
  // This function's secrecy and performance requirements come from RSA key
  // generation. We generate RSA keys by selecting two large, secret primes with
  // rejection sampling.
  //
  // We thus treat |w| as secret if it turns out to be a large prime. However,
  // if |w| is composite, we treat this and |w| itself as public. (Conversely,
  // if |w| is prime, that it is prime is public. Only the value is secret.)
  // This is fine for RSA key generation, but note it is important that we use
  // rejection sampling, with each candidate prime chosen independently. This
  // would not work for, e.g., an algorithm which looked for primes in
  // consecutive integers. These assumptions allow us to discard composites
  // quickly. We additionally treat |w| as public when it is a small prime to
  // simplify trial decryption and some edge cases.
  //
  // One RSA key generation will call this function on exactly two primes and
  // many more composites. The overall cost is a combination of several factors:
  //
  // 1. Checking if |w| is divisible by a small prime is much faster than
  //    learning it is composite by Miller-Rabin (see below for details on that
  //    cost). Trial division by p saves 1/p of Miller-Rabin calls, so this is
  //    worthwhile until p exceeds the ratio of the two costs.
  //
  // 2. For a random (i.e. non-adversarial) candidate large prime and candidate
  //    witness, the probability of false witness is very low. (This is why FIPS
  //    186-4 only requires a few iterations.) Thus composites not discarded by
  //    trial decryption, in practice, cost one Miller-Rabin iteration. Only the
  //    two actual primes cost the full iteration count.
  //
  // 3. A Miller-Rabin iteration is a modular exponentiation plus |a| additional
  //    modular squares, where |a| is the number of factors of two in |w-1|. |a|
  //    is likely small (the distribution falls exponentially), but it is also
  //    potentially secret, so we loop up to its log(w) upper bound when |w| is
  //    prime. When |w| is composite, we break early, so only two calls pay this
  //    cost. (Note that all calls pay the modular exponentiation which is,
  //    itself, log(w) modular multiplications and squares.)
  //
  // 4. While there are only two prime calls, they multiplicatively pay the full
  //    costs of (2) and (3).
  //
  // 5. After the primes are chosen, RSA keys derive some values from the
  //    primes, but this cost is negligible in comparison.
  *out_is_probably_prime = 0;
  if (BN_cmp(w, BN_value_one()) <= 0) {
    // Values <= 1 are never prime.
    return 1;
  }
  if (!BN_is_odd(w)) {
    // The only even prime is two.
    *out_is_probably_prime = BN_is_word(w, 2);
    return 1;
  }
  // Miller-Rabin does not work for three.
  if (BN_is_word(w, 3)) {
    *out_is_probably_prime = 1;
    return 1;
  }
  if (do_trial_division) {
    // Perform additional trial division checks to discard small primes.
    uint16_t prime;
    if (bn_trial_division(&prime, w)) {
      *out_is_probably_prime = BN_is_word(w, prime);
      return 1;
    }
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, -1)) {
      return 0;
    }
  }
  if (checks == BN_prime_checks_for_generation) {
    checks = BN_prime_checks_for_size(BN_num_bits(w));
  }
  BN_CTX *new_ctx = NULL;
  if (ctx == NULL) {
    new_ctx = BN_CTX_new();
    if (new_ctx == NULL) {
      return 0;
    }
    ctx = new_ctx;
  }
  // See C.3.1 from FIPS 186-4.
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *b = BN_CTX_get(ctx);
  BN_MONT_CTX *mont = BN_MONT_CTX_new_consttime(w, ctx);
  BN_MILLER_RABIN miller_rabin;
  if (b == NULL || mont == NULL ||
      // Steps 1-3.
      !bn_miller_rabin_init(&miller_rabin, mont, ctx)) {
    goto err;
  }
  // The following loop performs the inner iteration of the Miller-Rabin
  // primality test (Step 4).
  //
  // The algorithm as specified in FIPS 186-4 leaks information on |w|, the RSA
  // private key. Instead, we run through each iteration unconditionally,
  // performing modular multiplications, masking off any effects to behave
  // equivalently to the specified algorithm.
  //
  // We also blind the number of values of |b| we try. Steps 4.1-4.2 say to
  // discard out-of-range values. To avoid leaking information on |w|, we use
  // |bn_rand_secret_range| which, rather than discarding bad values, adjusts
  // them to be in range. Though not uniformly selected, these adjusted values
  // are still usable as Miller-Rabin checks.
  //
  // Miller-Rabin is already probabilistic, so we could reach the desired
  // confidence levels by just suitably increasing the iteration count. However,
  // to align with FIPS 186-4, we use a more pessimal analysis: we do not count
  // the non-uniform values towards the iteration count. As a result, this
  // function is more complex and has more timing risk than necessary.
  //
  // We count both total iterations and uniform ones and iterate until we've
  // reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively.
  // If the latter is large enough, it will be the limiting factor with high
  // probability and we won't leak information.
  //
  // Note this blinding does not impact most calls when picking primes because
  // composites are rejected early. Only the two secret primes see extra work.
  crypto_word_t uniform_iterations = 0;
  // Using |constant_time_lt_w| seems to prevent the compiler from optimizing
  // this into two jumps.
  for (int i = 1; constant_time_declassify_w(
           (i <= BN_PRIME_CHECKS_BLINDED) |
           constant_time_lt_w(uniform_iterations, checks));
       i++) {
    // Step 4.1-4.2
    int is_uniform;
    if (!bn_rand_secret_range(b, &is_uniform, 2, miller_rabin.w1)) {
      goto err;
    }
    uniform_iterations += is_uniform;
    // Steps 4.3-4.5
    int is_possibly_prime = 0;
    if (!bn_miller_rabin_iteration(&miller_rabin, &is_possibly_prime, b, mont,
                                   ctx)) {
      goto err;
    }
    if (!is_possibly_prime) {
      // Step 4.6. We did not see z = w-1 before z = 1, so w must be composite.
      *out_is_probably_prime = 0;
      ret = 1;
      goto err;
    }
    // Step 4.7
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) {
      goto err;
    }
  }
  declassify_assert(uniform_iterations >= (crypto_word_t)checks);
  *out_is_probably_prime = 1;
  ret = 1;
err:
  BN_MONT_CTX_free(mont);
  BN_CTX_end(ctx);
  BN_CTX_free(new_ctx);
  return ret;
}
// BN_is_prime_ex behaves like |BN_is_prime_fasttest_ex| with trial division
// disabled: it returns one if |candidate| is probably prime, zero if it is
// composite, and -1 on error.
int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx,
                   BN_GENCB *cb) {
  const int do_trial_division = 0;
  return BN_is_prime_fasttest_ex(candidate, checks, ctx, do_trial_division, cb);
}
// BN_is_prime_fasttest_ex maps |BN_primality_test|'s out-parameter interface
// onto the legacy tri-state return: one for probably prime, zero for
// composite, and -1 on error.
int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx,
                            int do_trial_division, BN_GENCB *cb) {
  int is_probably_prime = 0;
  int ok = BN_primality_test(&is_probably_prime, a, checks, ctx,
                             do_trial_division, cb);
  return ok ? is_probably_prime : -1;
}
// BN_enhanced_miller_rabin_primality_test runs the Enhanced Miller-Rabin test
// (FIPS 186-4, C.3.2) on |w| with |checks| iterations, writing one of
// |bn_probably_prime|, |bn_composite|, or |bn_non_prime_power_composite| to
// |*out_result|. Unlike |BN_primality_test| it is not constant-time; |w| is
// treated as public. Returns one on success (a verdict was produced) and zero
// on error or invalid input.
int BN_enhanced_miller_rabin_primality_test(
    enum bn_primality_result_t *out_result, const BIGNUM *w, int checks,
    BN_CTX *ctx, BN_GENCB *cb) {
  // Enhanced Miller-Rabin is only valid on odd integers greater than 3.
  if (!BN_is_odd(w) || BN_cmp_word(w, 3) <= 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_INPUT);
    return 0;
  }
  if (checks == BN_prime_checks_for_generation) {
    checks = BN_prime_checks_for_size(BN_num_bits(w));
  }
  int ret = 0;
  BN_MONT_CTX *mont = NULL;
  BN_CTX_start(ctx);
  BIGNUM *w1 = BN_CTX_get(ctx);
  if (w1 == NULL ||
      !BN_copy(w1, w) ||
      !BN_sub_word(w1, 1)) {
    goto err;
  }
  // Write w1 as m*2^a (Steps 1 and 2).
  int a = 0;
  while (!BN_is_bit_set(w1, a)) {
    a++;
  }
  BIGNUM *m = BN_CTX_get(ctx);
  if (m == NULL ||
      !BN_rshift(m, w1, a)) {
    goto err;
  }
  BIGNUM *b = BN_CTX_get(ctx);
  BIGNUM *g = BN_CTX_get(ctx);
  BIGNUM *z = BN_CTX_get(ctx);
  BIGNUM *x = BN_CTX_get(ctx);
  BIGNUM *x1 = BN_CTX_get(ctx);
  if (b == NULL ||
      g == NULL ||
      z == NULL ||
      x == NULL ||
      x1 == NULL) {
    goto err;
  }
  // Montgomery setup for computations mod w
  mont = BN_MONT_CTX_new_for_modulus(w, ctx);
  if (mont == NULL) {
    goto err;
  }
  // The following loop performs the inner iteration of the Enhanced
  // Miller-Rabin primality test (Step 4).
  for (int i = 1; i <= checks; i++) {
    // Step 4.1-4.2: pick a random base b in [2, w-1).
    if (!BN_rand_range_ex(b, 2, w1)) {
      goto err;
    }
    // Step 4.3-4.4: a non-trivial gcd with the base proves compositeness.
    if (!BN_gcd(g, b, w, ctx)) {
      goto err;
    }
    if (BN_cmp_word(g, 1) > 0) {
      *out_result = bn_composite;
      ret = 1;
      goto err;
    }
    // Step 4.5
    if (!BN_mod_exp_mont(z, b, m, w, ctx, mont)) {
      goto err;
    }
    // Step 4.6
    if (BN_is_one(z) || BN_cmp(z, w1) == 0) {
      goto loop;
    }
    // Step 4.7: repeatedly square z, watching for w-1 (inconclusive) or a
    // non-trivial square root of 1 (composite).
    for (int j = 1; j < a; j++) {
      if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) {
        goto err;
      }
      if (BN_cmp(z, w1) == 0) {
        goto loop;
      }
      if (BN_is_one(z)) {
        goto composite;
      }
    }
    // Step 4.8-4.9
    if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) {
      goto err;
    }
    // Step 4.10-4.11
    if (!BN_is_one(z) && !BN_copy(x, z)) {
      goto err;
    }
  composite:
    // Step 4.12-4.14: gcd(x-1, w) distinguishes a plain composite from a
    // composite that is not a prime power.
    if (!BN_copy(x1, x) ||
        !BN_sub_word(x1, 1) ||
        !BN_gcd(g, x1, w, ctx)) {
      goto err;
    }
    if (BN_cmp_word(g, 1) > 0) {
      *out_result = bn_composite;
    } else {
      *out_result = bn_non_prime_power_composite;
    }
    ret = 1;
    goto err;
  loop:
    // Step 4.15
    if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) {
      goto err;
    }
  }
  // All iterations were inconclusive; w is probably prime.
  *out_result = bn_probably_prime;
  ret = 1;
err:
  BN_MONT_CTX_free(mont);
  BN_CTX_end(ctx);
  return ret;
}
// probable_prime fills |rnd| with a random odd |bits|-bit number (top two bits
// set) that survives trial division by the small-prime table. Returns one on
// success and zero on RNG failure.
static int probable_prime(BIGNUM *rnd, int bits) {
  for (;;) {
    if (!BN_rand(rnd, bits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD)) {
      return 0;
    }
    if (!bn_odd_number_is_obviously_composite(rnd)) {
      return 1;
    }
    // Obviously composite; draw again.
  }
}
// probable_prime_dh writes to |rnd| a random |bits|-bit candidate satisfying
// rnd mod add == rem (or rnd mod add == 1 when |rem| is NULL), then steps it
// by |add| until it passes trial division by the small-prime table. Returns
// one on success and zero on error.
static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
                             const BIGNUM *rem, BN_CTX *ctx) {
  int ret = 0;
  BIGNUM *t1;
  BN_CTX_start(ctx);
  if ((t1 = BN_CTX_get(ctx)) == NULL) {
    goto err;
  }
  if (!BN_rand(rnd, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) {
    goto err;
  }
  // we need ((rnd-rem) % add) == 0
  if (!BN_mod(t1, rnd, add, ctx)) {
    goto err;
  }
  if (!BN_sub(rnd, rnd, t1)) {
    goto err;
  }
  if (rem == NULL) {
    // No residue requested; force rnd mod add == 1.
    if (!BN_add_word(rnd, 1)) {
      goto err;
    }
  } else {
    if (!BN_add(rnd, rnd, rem)) {
      goto err;
    }
  }
  // we now have a random number 'rand' to test.
  const size_t num_primes = num_trial_division_primes(rnd);
loop:
  for (size_t i = 1; i < num_primes; i++) {
    // check that rnd is a prime. Reject when kPrimes[i] divides rnd
    // (mod == 0) or divides rnd-1 (mod == 1); the latter mirrors the
    // gcd(p-1, primes) == 1 requirement noted in |probable_prime_dh_safe|.
    if (bn_mod_u16_consttime(rnd, kPrimes[i]) <= 1) {
      // Stepping by |add| preserves the residue constraint above.
      if (!BN_add(rnd, rnd, add)) {
        goto err;
      }
      goto loop;
    }
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// probable_prime_dh_safe writes to |p| a |bits|-bit safe-prime candidate of
// the form p = 2q+1, where q is a random (bits-1)-bit number satisfying
// q mod (padd/2) == rem/2 (or == 1 when |rem| is NULL). Both p and q are
// stepped together by |padd| and |padd|/2 until neither has a divisor in the
// small-prime table. Returns one on success and zero on error.
static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
                                  const BIGNUM *rem, BN_CTX *ctx) {
  int ret = 0;
  BIGNUM *t1, *qadd, *q;
  // q has one bit fewer than p since p = 2q+1.
  bits--;
  BN_CTX_start(ctx);
  t1 = BN_CTX_get(ctx);
  q = BN_CTX_get(ctx);
  qadd = BN_CTX_get(ctx);
  // NOTE(review): only |qadd| is NULL-checked; presumably this relies on
  // |BN_CTX_get| failing for all subsequent calls once one call fails — verify
  // against the BN_CTX contract.
  if (qadd == NULL) {
    goto err;
  }
  if (!BN_rshift1(qadd, padd)) {
    goto err;
  }
  if (!BN_rand(q, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) {
    goto err;
  }
  // we need ((rnd-rem) % add) == 0
  if (!BN_mod(t1, q, qadd, ctx)) {
    goto err;
  }
  if (!BN_sub(q, q, t1)) {
    goto err;
  }
  if (rem == NULL) {
    if (!BN_add_word(q, 1)) {
      goto err;
    }
  } else {
    // The constraint on q is the halved constraint on p.
    if (!BN_rshift1(t1, rem)) {
      goto err;
    }
    if (!BN_add(q, q, t1)) {
      goto err;
    }
  }
  // we now have a random number 'rand' to test.
  // p = 2q + 1.
  if (!BN_lshift1(p, q)) {
    goto err;
  }
  if (!BN_add_word(p, 1)) {
    goto err;
  }
  const size_t num_primes = num_trial_division_primes(p);
loop:
  for (size_t i = 1; i < num_primes; i++) {
    // check that p and q are prime
    // check that for p and q
    // gcd(p-1,primes) == 1 (except for 2)
    if (bn_mod_u16_consttime(p, kPrimes[i]) == 0 ||
        bn_mod_u16_consttime(q, kPrimes[i]) == 0) {
      // Step both values together so p = 2q+1 is preserved.
      if (!BN_add(p, p, padd)) {
        goto err;
      }
      if (!BN_add(q, q, qadd)) {
        goto err;
      }
      goto loop;
    }
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}

View File

@@ -0,0 +1,267 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
// Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <limits.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/rand.h>
#include <openssl/type_check.h>
#include "internal.h"
#include "../../internal.h"
// BN_rand sets |rnd| to a random |bits|-bit number. |top| selects the treatment
// of the most significant bits: BN_RAND_TOP_ANY leaves them random,
// BN_RAND_TOP_ONE forces the top bit set, and BN_RAND_TOP_TWO forces the top
// two bits set. |bottom| may force the result odd (BN_RAND_BOTTOM_ODD).
// Returns one on success and zero on error or invalid arguments.
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) {
  if (rnd == NULL) {
    return 0;
  }
  // Validate the |top| and |bottom| selectors before touching |rnd|.
  if (top != BN_RAND_TOP_ANY && top != BN_RAND_TOP_ONE &&
      top != BN_RAND_TOP_TWO) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  if (bottom != BN_RAND_BOTTOM_ANY && bottom != BN_RAND_BOTTOM_ODD) {
    OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
    return 0;
  }
  if (bits == 0) {
    // Zero bits requests the value zero.
    BN_zero(rnd);
    return 1;
  }
  // Reject |bits| so large that rounding up to a word count would overflow.
  if (bits > INT_MAX - (BN_BITS2 - 1)) {
    OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG);
    return 0;
  }
  int words = (bits + BN_BITS2 - 1) / BN_BITS2;  // Words needed for |bits|.
  int bit = (bits - 1) % BN_BITS2;  // Index of the top bit in the top word.
  const BN_ULONG kOne = 1;
  const BN_ULONG kThree = 3;
  // Mask covering only the bits of the top word that belong to the result.
  BN_ULONG mask = bit < BN_BITS2 - 1 ? (kOne << (bit + 1)) - 1 : BN_MASK2;
  if (!bn_wexpand(rnd, words)) {
    return 0;
  }
  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  AWSLC_ABORT_IF_NOT_ONE(RAND_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)));
  FIPS_service_indicator_unlock_state();
  // Discard the excess random bits above |bits| in the top word.
  rnd->d[words - 1] &= mask;
  if (top != BN_RAND_TOP_ANY) {
    if (top == BN_RAND_TOP_TWO && bits > 1) {
      if (bit == 0) {
        // The top two bits straddle a word boundary: the highest bit is the
        // sole bit of the top word; the second-highest tops the word below.
        rnd->d[words - 1] |= 1;
        rnd->d[words - 2] |= kOne << (BN_BITS2 - 1);
      } else {
        rnd->d[words - 1] |= kThree << (bit - 1);
      }
    } else {
      // BN_RAND_TOP_ONE (or TOP_TWO with bits == 1): set only the top bit.
      rnd->d[words - 1] |= kOne << bit;
    }
  }
  if (bottom == BN_RAND_BOTTOM_ODD) {
    rnd->d[0] |= 1;
  }
  rnd->neg = 0;
  rnd->width = words;
  return 1;
}
// BN_pseudo_rand is an alias for |BN_rand|; the two entry points share one
// implementation here.
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) {
  return BN_rand(rnd, bits, top, bottom);
}
// bn_less_than_word_mask returns a mask of all ones if the number represented
// by |len| words at |a| is less than |b| and zero otherwise. It performs this
// computation in time independent of the value of |a|. |b| is assumed public.
static crypto_word_t bn_less_than_word_mask(const BN_ULONG *a, size_t len,
                                            BN_ULONG b) {
  if (b == 0) {
    // No non-negative value is less than zero.
    return CONSTTIME_FALSE_W;
  }
  if (len == 0) {
    // An empty |a| represents zero, which is less than any non-zero |b|.
    return CONSTTIME_TRUE_W;
  }
  // |a| < |b| iff a[1..len-1] are all zero and a[0] < b.
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  crypto_word_t mask = 0;
  // OR all high words together rather than branching on each, so the scan
  // does not leak which word (if any) is non-zero.
  for (size_t i = 1; i < len; i++) {
    mask |= a[i];
  }
  // |mask| is now zero iff a[1..len-1] are all zero.
  mask = constant_time_is_zero_w(mask);
  mask &= constant_time_lt_w(a[0], b);
  return mask;
}
// bn_in_range_words returns one if min_inclusive <= |a| < |max_exclusive|,
// where |a| and |max_exclusive| are |len| words long. The comparison is
// constant-time in the value of |a|; |min_inclusive| is assumed public.
int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive,
                      const BN_ULONG *max_exclusive, size_t len) {
  // In range iff |a| is not below the minimum and is below the maximum.
  const crypto_word_t not_below_min =
      ~bn_less_than_word_mask(a, len, min_inclusive);
  const crypto_word_t below_max = bn_less_than_words(a, max_exclusive, len);
  return not_below_min & below_max;
}
// bn_range_to_mask normalizes the range [min_inclusive, max_exclusive) into
// the number of significant |*out_words| of |max_exclusive| and a bitmask
// |*out_mask| covering its most significant word. Returns one on success and
// zero (with BN_R_INVALID_RANGE) if the range is empty.
static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask,
                            size_t min_inclusive, const BN_ULONG *max_exclusive,
                            size_t len) {
  // The magnitude of |max_exclusive| is assumed public, so it may be
  // normalized in variable time.
  size_t words = len;
  while (words > 0 && max_exclusive[words - 1] == 0) {
    words--;
  }

  // The range is empty if |max_exclusive| is zero or does not exceed
  // |min_inclusive|.
  if (words == 0 || (words == 1 && max_exclusive[0] <= min_inclusive)) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE);
    return 0;
  }

  // Smear the most significant set bit of the top word downwards, yielding a
  // mask with every bit at or below it set. Doubling the shift each round
  // covers all BN_BITS2 bits.
  BN_ULONG mask = max_exclusive[words - 1];
  for (unsigned shift = 1; shift < BN_BITS2; shift <<= 1) {
    mask |= mask >> shift;
  }

  *out_words = words;
  *out_mask = mask;
  return 1;
}
// bn_rand_range_words sets the |len| words at |out| to a uniformly random
// value in [min_inclusive, max_exclusive) by rejection sampling, mixing
// |additional_data| into the DRBG. It returns one on success and zero on
// error.
int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive,
                        const BN_ULONG *max_exclusive, size_t len,
                        const uint8_t additional_data[RAND_PRED_RESISTANCE_LEN]) {
  // This function implements the equivalent of steps 4 through 7 of FIPS 186-4
  // appendices B.4.2 and B.5.2. When called in those contexts, |max_exclusive|
  // is n and |min_inclusive| is one.

  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  int ret = 0;

  // Compute the bit length of |max_exclusive| (step 1), in terms of a number of
  // |words| worth of entropy to fill and a mask of bits to clear in the top
  // word.
  size_t words;
  BN_ULONG mask;
  if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive, len)) {
    goto end;
  }

  // Fill any unused words with zero.
  OPENSSL_memset(out + words, 0, (len - words) * sizeof(BN_ULONG));

  // Bound the number of sampling attempts so a broken DRBG cannot loop
  // forever; hitting this limit is treated as an error.
  unsigned count = 100;
  do {
    if (!--count) {
      OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS);
      goto end;
    }

    // Steps 4 and 5. Use |words| and |mask| together to obtain a string of N
    // bits, where N is the bit length of |max_exclusive|.
    RAND_bytes_with_user_prediction_resistance((uint8_t *)out, words * sizeof(BN_ULONG),
                                               additional_data);
    out[words - 1] &= mask;

    // If out >= max_exclusive or out < min_inclusive, retry. This implements
    // the equivalent of steps 6 and 7 without leaking the value of |out|. The
    // result of this comparison may be treated as public. It only reveals how
    // many attempts were needed before we found a value in range. This is
    // independent of the final secret output, and has a distribution that
    // depends only on |min_inclusive| and |max_exclusive|, both of which are
    // public.
  } while (!constant_time_declassify_int(
      bn_in_range_words(out, min_inclusive, max_exclusive, words)));

  ret = 1;

end:
  FIPS_service_indicator_unlock_state();
  return ret;
}
// BN_rand_range_ex sets |r| to a uniformly random value in
// [min_inclusive, max_exclusive). It returns one on success and zero on
// error.
int BN_rand_range_ex(BIGNUM *r, BN_ULONG min_inclusive,
                     const BIGNUM *max_exclusive) {
  static const uint8_t kDefaultAdditionalData[RAND_PRED_RESISTANCE_LEN] = {0};
  const int width = max_exclusive->width;

  // Make room for the result, then sample it word-by-word.
  if (!bn_wexpand(r, width)) {
    return 0;
  }
  if (!bn_rand_range_words(r->d, min_inclusive, max_exclusive->d, width,
                           kDefaultAdditionalData)) {
    return 0;
  }

  r->neg = 0;
  r->width = width;
  return 1;
}
// bn_rand_secret_range sets |r| to a random value in
// [min_inclusive, max_exclusive) in a single pass, without revealing (via
// timing) whether the uniform sample had to be adjusted. |*out_is_uniform| is
// set to one if the sample was used as-is and zero if it was forced into
// range (and is therefore slightly biased). Returns one on success and zero
// on error.
int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive,
                         const BIGNUM *max_exclusive) {
  // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock
  // functions to avoid updating the service indicator with the DRBG functions.
  FIPS_service_indicator_lock_state();
  int ret = 0;
  size_t words;
  BN_ULONG mask;
  if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive->d,
                        max_exclusive->width) ||
      !bn_wexpand(r, words)) {
    goto end;
  }

  assert(words > 0);
  assert(mask != 0);
  // The range must be large enough for bit tricks to fix invalid values.
  if (words == 1 && min_inclusive > mask >> 1) {
    OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE);
    goto end;
  }

  // Select a uniform random number with num_bits(max_exclusive) bits.
  AWSLC_ABORT_IF_NOT_ONE(RAND_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)));
  r->d[words - 1] &= mask;

  // Check, in constant-time, if the value is in range.
  *out_is_uniform =
      bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words);
  crypto_word_t in_range = *out_is_uniform;
  in_range = 0 - in_range;  // all-ones if in range, zero otherwise

  // If the value is not in range, force it to be in range: OR-ing
  // |min_inclusive| into the low word guarantees r >= min_inclusive (the
  // |words == 1| check above ensures the AND below cannot clear those bits),
  // and clearing the top bit of the top word (|mask| >> 1) brings r below
  // |max_exclusive|, whose top word has that bit set by construction of
  // |mask|. Both operations are selected in constant time.
  r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive);
  r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1);
  declassify_assert(
      bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words));

  r->neg = 0;
  r->width = (int)words;
  ret = 1;

end:
  FIPS_service_indicator_unlock_state();
  return ret;
}
// BN_rand_range sets |r| to a uniformly random value in [0, range). It is
// |BN_rand_range_ex| with a minimum of zero.
int BN_rand_range(BIGNUM *r, const BIGNUM *range) {
  return BN_rand_range_ex(r, /*min_inclusive=*/0, range);
}
// BN_pseudo_rand_range is identical to |BN_rand_range|; it is retained only
// for API compatibility.
int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) {
  return BN_rand_range(r, range);
}

View File

@@ -0,0 +1,130 @@
// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2012, Intel Corporation. All Rights Reserved.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// SPDX-License-Identifier: Apache-2.0
#include "rsaz_exp.h"
#if defined(RSAZ_ENABLED)
#include <openssl/mem.h>
#include "internal.h"
#include "../../internal.h"
// rsaz_one is 1 in RSAZ's representation: one 29-bit limb per 64-bit word,
// 36 limbs padded up to 40.
alignas(64) static const BN_ULONG rsaz_one[40] = {
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// rsaz_two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is
// 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22): limb index 2 carries bit 22 of
// that limb, i.e. bit 2*29 + 22 = 80 of the full value.
alignas(64) static const BN_ULONG rsaz_two80[40] = {
    0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// RSAZ_1024_mod_exp_avx2 computes |base_norm|^|exponent| mod |m_norm| using a
// fixed 5-bit-window exponentiation built on the AVX2 RSAZ assembly
// primitives, writing the fully-reduced result to |result_norm|. See
// rsaz_exp.h for the full input contract.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
                            const BN_ULONG base_norm[16],
                            const BN_ULONG exponent[16],
                            const BN_ULONG m_norm[16], const BN_ULONG RR[16],
                            BN_ULONG k0,
                            BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) {
  OPENSSL_STATIC_ASSERT(MOD_EXP_CTIME_ALIGN % 64 == 0,
                        MOD_EXP_CTIME_ALIGN_is_too_small)
  assert((uintptr_t)storage % 64 == 0);

  BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s;
  // Note |R2| aliases |table_s|.
  //
  // Choose the layout of the three 40-word (320-byte) scratch values so that
  // |m| does not straddle a 4096-byte page boundary. NOTE(review): this
  // requirement is inferred from the "should not cross page" comments below —
  // confirm against crypto/bn/asm/rsaz-avx2.pl.
  if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) {
    result = storage;
    a_inv = storage + 40;
    m = storage + 40 * 2;  // should not cross page
  } else {
    m = storage;  // should not cross page
    result = storage + 40;
    a_inv = storage + 40 * 2;
  }

  // Convert the modulus, base, and RR into RSAZ's base-2^29 representation.
  rsaz_1024_norm2red_avx2(m, m_norm);
  rsaz_1024_norm2red_avx2(a_inv, base_norm);
  rsaz_1024_norm2red_avx2(R2, RR);

  // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix,
  // giving R = 2^(36*29) = 2^1044.
  rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
  // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052
  rsaz_1024_mul_avx2(R2, R2, rsaz_two80, m, k0);
  // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2

  // table[0] = 1
  // table[1] = a_inv^1
  rsaz_1024_mul_avx2(result, R2, rsaz_one, m, k0);
  rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
  rsaz_1024_scatter5_avx2(table_s, result, 0);
  rsaz_1024_scatter5_avx2(table_s, a_inv, 1);

  // Seed the power-of-two entries by repeated squaring.
  // table[2] = a_inv^2
  rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 2);
  // table[4] = a_inv^4
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 4);
  // table[8] = a_inv^8
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 8);
  // table[16] = a_inv^16
  rsaz_1024_sqr_avx2(result, result, m, k0, 1);
  rsaz_1024_scatter5_avx2(table_s, result, 16);

  // Fill in the remaining 31 entries: each odd index multiplies the previous
  // entry by a_inv, and doubling an index squares its entry.
  for (int i = 3; i < 32; i += 2) {
    // table[i] = table[i-1] * a_inv = a_inv^i
    rsaz_1024_gather5_avx2(result, table_s, i - 1);
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
    rsaz_1024_scatter5_avx2(table_s, result, i);
    for (int j = 2 * i; j < 32; j *= 2) {
      // table[j] = table[j/2]^2 = a_inv^j
      rsaz_1024_sqr_avx2(result, result, m, k0, 1);
      rsaz_1024_scatter5_avx2(table_s, result, j);
    }
  }

  // Load the first window: the top five exponent bits (1019..1023), read from
  // the most significant byte of the little-endian exponent.
  const uint8_t *p_str = (const uint8_t *)exponent;
  int wvalue = p_str[127] >> 3;
  rsaz_1024_gather5_avx2(result, table_s, wvalue);

  int index = 1014;  // least significant bit of the next 5-bit window
  while (index > -1) {  // Loop for the remaining 203 windows.
    rsaz_1024_sqr_avx2(result, result, m, k0, 5);

    // Extract the window via an unaligned 16-bit load; |memcpy| avoids
    // strict-aliasing and alignment issues.
    uint16_t wvalue_16;
    memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16));
    wvalue = wvalue_16;
    wvalue = (wvalue >> (index % 8)) & 31;
    index -= 5;

    rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
    rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
  }

  // Square four times and fold in the final 4-bit window (exponent bits 0..3).
  rsaz_1024_sqr_avx2(result, result, m, k0, 4);
  wvalue = p_str[0] & 15;
  rsaz_1024_gather5_avx2(a_inv, table_s, wvalue);  // Borrow |a_inv|.
  rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

  // Convert from Montgomery.
  rsaz_1024_mul_avx2(result, result, rsaz_one, m, k0);

  rsaz_1024_red2norm_avx2(result_norm, result);
  // |rsaz_1024_red2norm_avx2| may return a value up to and including the
  // modulus (see rsaz_exp.h), so reduce once more.
  BN_ULONG scratch[16];
  bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, scratch, 16);

  // Wipe the scratch buffer; it held secret intermediate values.
  OPENSSL_cleanse(storage, MOD_EXP_CTIME_STORAGE_LEN * sizeof(BN_ULONG));
}
#endif // RSAZ_ENABLED

View File

@@ -0,0 +1,324 @@
// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2012, Intel Corporation. All Rights Reserved.
//
// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
// (1) Intel Corporation, Israel Development Center, Haifa, Israel
// (2) University of Haifa, Israel
//
// SPDX-License-Identifier: Apache-2.0
#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H
#define OPENSSL_HEADER_BN_RSAZ_EXP_H
#include <openssl/bn.h>
#include "internal.h"
#include "../../internal.h"
#include "../cpucap/internal.h"
#if defined(__cplusplus)
extern "C" {
#endif
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
!defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
#define RSAZ_ENABLED
// RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent|
// modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have
// the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|,
// respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a
// temporary buffer that must be aligned to |MOD_EXP_CTIME_ALIGN| bytes.
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16],
const BN_ULONG exponent[16],
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
BN_ULONG k0,
BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
// rsaz_avx2_capable returns one if the CPU supports the AVX2 instructions
// required by the RSAZ code path, and zero otherwise.
OPENSSL_INLINE int rsaz_avx2_capable(void) {
  return CRYPTO_is_AVX2_capable() ? 1 : 0;
}
// rsaz_avx2_preferred returns one if the RSAZ-AVX2 path is expected to be the
// fastest available modular-exponentiation implementation on this CPU.
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
  // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
  // .Lmulx4x_enter and .Lpowerx5_enter branches.
  const int mulx_preferred = CRYPTO_is_BMI1_capable() &&
                             CRYPTO_is_BMI2_capable() &&
                             CRYPTO_is_ADX_capable();
  return !mulx_preferred && CRYPTO_is_AVX2_capable();
}
// Assembly functions.
// RSAZ represents 1024-bit integers using unsaturated 29-bit limbs stored in
// 64-bit integers. This requires 36 limbs but padded up to 40.
//
// See crypto/bn/asm/rsaz-avx2.pl for further details.
// rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation
// and writes the result to |red|.
void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]);
// rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|.
// Inputs and outputs are in Montgomery form, using RSAZ's representation. |k|
// is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|.
void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k);
// rsaz_1024_sqr_avx2 computes |a|^(2^|count|) mod |n| (i.e. |count| successive
// squarings) and writes the result to |ret|. Inputs and outputs are in
// Montgomery form, using RSAZ's representation. |k| is -|n|^-1 mod 2^64 or
// |n0| from |BN_MONT_CTX|.
void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40],
const BN_ULONG n[40], BN_ULONG k, int count);
// rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
// non-negative and at most 31. It is treated as public. Note the table only
// uses 18 |BN_ULONG|s per entry instead of 40. It packs two 29-bit limbs into
// each |BN_ULONG| and only stores 36 limbs rather than the padded 40.
void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
int i);
// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|. |i|
// must be non-negative and at most 31. It is treated as secret. |tbl| must be
// aligned to 32 bytes.
void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
int i);
// rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation
// and writes the result to |norm|. The result will be <= the modulus.
//
// WARNING: The result of this operation may not be fully reduced. |norm| may be
// the modulus instead of zero. This function should be followed by a call to
// |bn_reduce_once|.
void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]);
#if !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)
#define RSAZ_512_ENABLED
// Dual Montgomery modular exponentiation using prime moduli of the
// same bit size, optimized with AVX512 ISA.
//
// Computes res|i| = base|i| ^ exp|i| mod m|i|.
//
// Input and output parameters for each exponentiation are independent and
// denoted here by index |i|, i = 1..2.
//
// Input and output are all in regular 2^64 radix.
//
// Each moduli shall be |modlen| bit size.
//
// Supported cases:
// - 2x1024
// - 2x1536
// - 2x2048
//
// [out] res|i| - result of modular exponentiation: array of qword values
// in regular (2^64) radix. Size of array shall be enough
// to hold |modlen| bits.
// [in] base|i| - base
// [in] exp|i| - exponent
// [in] m|i| - moduli
// [in] rr|i| - Montgomery parameter RR = R^2 mod m|i|
// [in] k0_|i| - Montgomery parameter k0 = -1/m|i| mod 2^64
// [in] modlen - moduli bit size
//
// \return 0 in case of failure,
// 1 in case of success.
//
// NB: This function does not do any checks on its arguments, its
// caller, `BN_mod_exp_mont_consttime_x2`, checks args. It should be
// the function used directly.
int RSAZ_mod_exp_avx512_x2(uint64_t *res1,
const uint64_t *base1,
const uint64_t *exponent1,
const uint64_t *m1,
const uint64_t *RR1,
uint64_t k0_1,
uint64_t *res2,
const uint64_t *base2,
const uint64_t *exponent2,
const uint64_t *m2,
const uint64_t *RR2,
uint64_t k0_2,
int modlen);
// Naming convention for the following functions:
//
// * amm: Almost Montgomery Multiplication
// * ams: Almost Montgomery Squaring
// * 52xZZ: data represented as array of ZZ digits in 52-bit radix
// * _x1_/_x2_: 1 or 2 independent inputs/outputs
// * ifma256: uses 256-bit wide IFMA ISA (AVX512_IFMA256)
//
//
// Almost Montgomery Multiplication (AMM) for 20-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high
// bits zeroed. |k0| is a Montgomery coefficient, which is here k0 =
// -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 1040 > 1024 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x20_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 20-digit number in radix
// 2^52
//
// See description of rsaz_amm52x20_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
void rsaz_amm52x20_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent
// base values and two independent moduli. The precomputed powers of
// the base values are stored contiguously in the table.
//
// Extracted value (output) is 2 20 digit numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x20_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
// Almost Montgomery Multiplication (AMM) for 30-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high
// bits zeroed
//
// NOTE: the function uses zero-padded data - 2 high QWs is a padding.
//
// |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 1560 > 1536 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x30_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 30-digit number in radix
// 2^52
//
// See description of rsaz_amm52x30_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
//
// NOTE: the function uses zero-padded data - 2 high QWs is a padding.
void rsaz_amm52x30_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent
// base values. |red_table_idx1| and |red_table_idx2| are
// corresponding power indexes.
//
// Extracted value (output) is 2 (30 + 2) digits numbers in 2^52
// radix. (2 high QW is zero padding)
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x30_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
// Almost Montgomery Multiplication (AMM) for 40-digit number in radix
// 2^52.
//
// AMM is defined as presented in the paper [1].
//
// The input and output are presented in 2^52 radix domain, i.e.
// |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high
// bits zeroed. |k0| is a Montgomery coefficient, which is here k0 =
// -1/m mod 2^64
//
// NB: the AMM implementation does not perform "conditional"
// subtraction step specified in the original algorithm as according
// to the Lemma 1 from the paper [2], the result will be always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition is true, provided the correct parameter |s| (notion
// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which
// matches our case: 2080 > 2048 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x40_x1_ifma256(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
uint64_t k0);
// Dual Almost Montgomery Multiplication for 40-digit number in radix
// 2^52
//
// See description of rsaz_amm52x40_x1_ifma256() above for
// details about Almost Montgomery Multiplication algorithm and
// function input parameters description.
//
// This function does two AMMs for two independent inputs, hence dual.
void rsaz_amm52x40_x2_ifma256(uint64_t *out, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
// Constant time extraction from the precomputed table of powers base^i, where
// i = 0..2^EXP_WIN_SIZE-1
//
// The input |red_table| contains precomputations for two independent base values.
// |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
//
// Extracted value (output) is 2 40 digits numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x40_win5(uint64_t *red_Y,
const uint64_t *red_table,
int red_table_idx1, int red_table_idx2);
#endif // !MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
#endif // !OPENSSL_NO_ASM && OPENSSL_X86_64
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H

View File

@@ -0,0 +1,616 @@
// Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
// Copyright (c) 2020-2021, Intel Corporation. All Rights Reserved.
//
// Originally written by Sergey Kirillov and Andrey Matyukov. Special
// thanks to Ilya Albrekht for his valuable hints.
//
// Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
#ifdef RSAZ_512_ENABLED
#include <openssl/crypto.h>
#include <assert.h>
#include "../../internal.h"
#include "rsaz_exp.h"
// Internal radix
# define DIGIT_SIZE (52)
// 52-bit mask
# define DIGIT_MASK ((uint64_t)0xFFFFFFFFFFFFF)
# define BITS2WORD8_SIZE(x) (((x) + 7) >> 3)
# define BITS2WORD64_SIZE(x) (((x) + 63) >> 6)
// Number of registers required to hold |digits_num| amount of qword
// digits
# define NUMBER_OF_REGISTERS(digits_num, register_size) \
(((digits_num) * 64 + (register_size) - 1) / (register_size))
OPENSSL_INLINE uint64_t get_digit(const uint8_t *in, int in_len);
OPENSSL_INLINE void put_digit(uint8_t *out, int out_len, uint64_t digit);
static void to_words52(uint64_t *out, int out_len, const uint64_t *in,
int in_bitsize);
static void from_words52(uint64_t *bn_out, int out_bitsize, const uint64_t *in);
OPENSSL_INLINE void set_bit(uint64_t *a, int idx);
// number_of_digits returns ceil(bitsize / digit_size): the number of
// |digit_size|-bit digits needed to hold a |bitsize|-bit value.
OPENSSL_INLINE int number_of_digits(int bitsize, int digit_size)
{
    const int rounded_up = bitsize + digit_size - 1;
    return rounded_up / digit_size;
}
// Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of
// the same bit size using Almost Montgomery Multiplication, optimized with
// AVX512_IFMA256 ISA.
//
// The parameter w (window size) = 5.
//
// [out] res - result of modular exponentiation: 2x{20,30,40} qword
// values in 2^52 radix.
// [in] base - base (2x{20,30,40} qword values in 2^52 radix)
// [in] exp - array of 2 pointers to {16,24,32} qword values in 2^64 radix.
// Exponent is not converted to redundant representation.
// [in] m - moduli (2x{20,30,40} qword values in 2^52 radix)
// [in] rr - Montgomery parameter for 2 moduli:
// RR(1024) = 2^2080 mod m.
// RR(1536) = 2^3120 mod m.
// RR(2048) = 2^4160 mod m.
// (2x{20,30,40} qword values in 2^52 radix)
// [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64
//
// \return (void).
static int rsaz_mod_exp_x2_ifma256(uint64_t *res, const uint64_t *base,
const uint64_t *exp[2], const uint64_t *m,
const uint64_t *rr, const uint64_t k0[2],
int modlen);
// NB: This function does not do any checks on its arguments, its
// caller `BN_mod_exp_mont_consttime_x2`, checks args. It should be
// the function used directly.
// RSAZ_mod_exp_avx512_x2 performs two independent modular exponentiations,
// res|i| = base|i|^exp|i| mod m|i| for i = 1, 2, using the AVX512_IFMA256
// Almost Montgomery Multiplication routines. All values are in regular 2^64
// radix; see rsaz_exp.h for the full contract. Returns one on success and
// zero on failure.
int RSAZ_mod_exp_avx512_x2(uint64_t *res1,
                           const uint64_t *base1,
                           const uint64_t *exp1,
                           const uint64_t *m1,
                           const uint64_t *rr1,
                           uint64_t k0_1,
                           uint64_t *res2,
                           const uint64_t *base2,
                           const uint64_t *exp2,
                           const uint64_t *m2,
                           const uint64_t *rr2,
                           uint64_t k0_2,
                           int modlen)
{
#ifdef BORINGSSL_DISPATCH_TEST
    BORINGSSL_function_hit[8] = 1;
#endif
    typedef void (*AMM)(uint64_t *res, const uint64_t *a,
                        const uint64_t *b, const uint64_t *m, uint64_t k0);
    int ret = 0;

    // Number of word-size (uint64_t) digits to store values in
    // redundant representation.
    int red_digits = number_of_digits(modlen + 2, DIGIT_SIZE);

    // n = modlen, d = DIGIT_SIZE, s = d * ceil((n+2)/d) > n
    // k = 4 * (s - n) = bitlen_diff
    //
    // Given the Montgomery domain conversion value RR = R^2 mod m[i]
    // = 2^2n mod m[i] and that for the larger representation in s
    // bits, RR' = R'^2 mod m[i] = 2^2s mod m[i], bitlen_diff is
    // needed to convert from RR to RR' as explained below in its
    // calculation.
    int bitlen_diff = 4 * (DIGIT_SIZE * red_digits - modlen);

    // Number of YMM registers required to store a value
    int num_ymm_regs = NUMBER_OF_REGISTERS(red_digits, 256);
    // Capacity of the register set (in qwords = 64-bits) to store a
    // value
    int regs_capacity = num_ymm_regs * 4;

    // The following 7 values are in redundant representation and are
    // to be stored contiguously in storage_aligned as needed by the
    // function rsaz_mod_exp_x2_ifma256.
    uint64_t *base1_red, *m1_red, *rr1_red;
    uint64_t *base2_red, *m2_red, *rr2_red;
    uint64_t *coeff_red;
    uint64_t *storage = NULL;
    uint64_t *storage_aligned = NULL;
    int storage_len_bytes = 7 * regs_capacity * sizeof(uint64_t)
                           + 64; // alignment

    const uint64_t *exp[2] = {0};
    uint64_t k0[2] = {0};
    // AMM = Almost Montgomery Multiplication
    AMM amm = NULL;

    // Select the single-input AMM routine for the modulus width; only
    // 1024-, 1536-, and 2048-bit moduli are supported.
    switch (modlen) {
    case 1024:
        amm = rsaz_amm52x20_x1_ifma256;
        break;
    case 1536:
        amm = rsaz_amm52x30_x1_ifma256;
        break;
    case 2048:
        amm = rsaz_amm52x40_x1_ifma256;
        break;
    default:
        goto err;
    }

    storage = (uint64_t *)OPENSSL_malloc(storage_len_bytes);
    if (storage == NULL)
        goto err;
    storage_aligned = (uint64_t *)align_pointer(storage, 64);

    // Memory layout for red(undant) representations
    base1_red = storage_aligned;
    base2_red = storage_aligned + 1 * regs_capacity;
    m1_red = storage_aligned + 2 * regs_capacity;
    m2_red = storage_aligned + 3 * regs_capacity;
    rr1_red = storage_aligned + 4 * regs_capacity;
    rr2_red = storage_aligned + 5 * regs_capacity;
    coeff_red = storage_aligned + 6 * regs_capacity;

    // Convert base_i, m_i, rr_i, from regular to 52-bit radix
    to_words52(base1_red, regs_capacity, base1, modlen);
    to_words52(base2_red, regs_capacity, base2, modlen);
    to_words52(m1_red, regs_capacity, m1, modlen);
    to_words52(m2_red, regs_capacity, m2, modlen);
    to_words52(rr1_red, regs_capacity, rr1, modlen);
    to_words52(rr2_red, regs_capacity, rr2, modlen);

    // Based on the definition of n and s above, we have
    // R = 2^n mod m; RR = R^2 mod m
    // R' = 2^s mod m; RR' = R'^2 mod m
    // To obtain R'^2 from R^2:
    // - Let t = AMM(RR, RR) = R^4 / R' mod m -- (1)
    // - Note that R'^4 = R^4 * 2^{4*(s-n)} mod m
    // - Let k = 4 * (s - n)
    // - We have AMM(t, 2^k) = R^4 * 2^{4*(s-n)} / R'^2 mod m -- (2)
    //                       = R'^4 / R'^2 mod m
    //                       = R'^2 mod m
    // For example, for n = 1024, s = 1040, k = 64,
    // RR = 2^2048 mod m, RR' = 2^2080 mod m
    OPENSSL_memset(coeff_red, 0, red_digits * sizeof(uint64_t));
    // coeff_red = 2^k = 1 << bitlen_diff taking into account the
    // redundant representation in digits of DIGIT_SIZE bits: digit index
    // bitlen_diff / DIGIT_SIZE occupies a full 64-bit word.
    set_bit(coeff_red, 64 * (int)(bitlen_diff / DIGIT_SIZE) + bitlen_diff % DIGIT_SIZE);

    amm(rr1_red, rr1_red, rr1_red, m1_red, k0_1);   // (1) for m1
    amm(rr1_red, rr1_red, coeff_red, m1_red, k0_1); // (2) for m1

    amm(rr2_red, rr2_red, rr2_red, m2_red, k0_2);   // (1) for m2
    amm(rr2_red, rr2_red, coeff_red, m2_red, k0_2); // (2) for m2

    exp[0] = exp1;
    exp[1] = exp2;

    k0[0] = k0_1;
    k0[1] = k0_2;

    // Compute res|i| = base|i| ^ exp|i| mod m|i| in parallel in
    // their contiguous form. The dual result is written contiguously
    // starting at |rr1_red|, so it covers |rr2_red| as well.
    ret = rsaz_mod_exp_x2_ifma256(rr1_red, base1_red, exp, m1_red, rr1_red,
                                  k0, modlen);
    if (!ret)
        goto err;

    // Convert rr_i back to regular radix
    from_words52(res1, modlen, rr1_red);
    from_words52(res2, modlen, rr2_red);

    // bn_reduce_once_in_place expects number of uint64_t, not bit
    // size
    modlen /= sizeof(uint64_t) * 8;
    bn_reduce_once_in_place(res1, 0, m1, storage, modlen);
    bn_reduce_once_in_place(res2, 0, m2, storage, modlen);

err:
    // |storage| held secret intermediate values; wipe before freeing.
    if (storage != NULL) {
        OPENSSL_cleanse(storage, storage_len_bytes);
        OPENSSL_free(storage);
    }
    return ret;
}
int rsaz_mod_exp_x2_ifma256(uint64_t *out,
const uint64_t *base,
const uint64_t *exp[2],
const uint64_t *m,
const uint64_t *rr,
const uint64_t k0[2],
int modlen)
{
typedef void (*DAMM)(uint64_t *res, const uint64_t *a,
const uint64_t *b, const uint64_t *m,
const uint64_t k0[2]);
typedef void (*DEXTRACT)(uint64_t *res, const uint64_t *red_table,
int red_table_idx, int tbl_idx);
int ret = 0;
int idx;
// Exponent window size
int exp_win_size = 5;
int two_to_exp_win_size = 1U << exp_win_size;
int exp_win_mask = two_to_exp_win_size - 1;
// Number of digits (64-bit words) in redundant representation to
// handle modulus bits
int red_digits = 0;
// Number of digits (64-bit words) to store the two exponents,
// found in `exp`.
int exp_digits = 0;
uint64_t *storage = NULL;
uint64_t *storage_aligned = NULL;
int storage_len_bytes = 0;
// Red(undant) result Y and multiplier X
uint64_t *red_Y = NULL; // [2][red_digits]
uint64_t *red_X = NULL; // [2][red_digits]
/* Pre-computed table of base powers */
uint64_t *red_table = NULL; // [two_to_exp_win_size][2][red_digits]
// Expanded exponent
uint64_t *expz = NULL; // [2][exp_digits + 1]
// Dual AMM
DAMM damm = NULL;
// Extractor from red_table
DEXTRACT extract = NULL;
// Squaring is done using multiplication now. That can be a subject of
// optimization in future.
# define DAMS(r,a,m,k0) damm((r),(a),(a),(m),(k0))
switch (modlen) {
case 1024:
red_digits = 20;
exp_digits = 16;
damm = rsaz_amm52x20_x2_ifma256;
extract = extract_multiplier_2x20_win5;
break;
case 1536:
// Extended with 2 digits padding to avoid mask ops in high YMM register
red_digits = 30 + 2;
exp_digits = 24;
damm = rsaz_amm52x30_x2_ifma256;
extract = extract_multiplier_2x30_win5;
break;
case 2048:
red_digits = 40;
exp_digits = 32;
damm = rsaz_amm52x40_x2_ifma256;
extract = extract_multiplier_2x40_win5;
break;
default:
goto err;
}
// allocate space for 2x num digits, aligned because the data in
// the vectors need to be 64-bit aligned.
storage_len_bytes = (2 * red_digits // red_Y
+ 2 * red_digits // red_X
+ 2 * red_digits * two_to_exp_win_size // red_table
+ 2 * (exp_digits + 1)) // expz
* sizeof(uint64_t)
+ 64; // alignment
storage = (uint64_t *)OPENSSL_malloc(storage_len_bytes);
if (storage == NULL)
goto err;
OPENSSL_cleanse(storage, storage_len_bytes);
storage_aligned = (uint64_t *)align_pointer(storage, 64);
red_Y = storage_aligned;
red_X = red_Y + 2 * red_digits;
red_table = red_X + 2 * red_digits;
expz = red_table + 2 * red_digits * two_to_exp_win_size;
// Compute table of powers base^i mod m,
// i = 0, ..., (2^EXP_WIN_SIZE) - 1
// using the dual multiplication. Each table entry contains
// base1^i mod m1, then base2^i mod m2.
red_X[0 * red_digits] = 1;
red_X[1 * red_digits] = 1;
damm(&red_table[0 * 2 * red_digits], (const uint64_t*)red_X, rr, m, k0);
damm(&red_table[1 * 2 * red_digits], base, rr, m, k0);
for (idx = 1; idx < (int)(two_to_exp_win_size / 2); idx++) {
DAMS(&red_table[(2 * idx + 0) * 2 * red_digits],
&red_table[(1 * idx) * 2 * red_digits], m, k0);
damm(&red_table[(2 * idx + 1) * 2 * red_digits],
&red_table[(2 * idx) * 2 * red_digits],
&red_table[1 * 2 * red_digits], m, k0);
}
// Copy and expand exponents
memcpy(&expz[0 * (exp_digits + 1)], exp[0], exp_digits * sizeof(uint64_t));
expz[1 * (exp_digits + 1) - 1] = 0;
memcpy(&expz[1 * (exp_digits + 1)], exp[1], exp_digits * sizeof(uint64_t));
expz[2 * (exp_digits + 1) - 1] = 0;
// Exponentiation
//
// This is Algorithm 3 in iacr 2011-239 which is cited below as
// well.
//
// Rather than compute base^{exp} in one shot, the powers of
// base^i for i = [0..2^{exp_win_size}) are precomputed and stored
// in `red_table`. Each window of the exponent is then used as an
// index to look up the power in the table, and then that result
// goes through a "series of squaring", which repositions it with
// respect to where it appears in the complete exponent. That
// result is then multiplied by the previous result.
//
// The `extract` routine does the lookup, `DAMS` wraps the `damm`
// routine to set up squaring, while `damm` is the AMM
// routine. That is what you find happening in each iteration of
// this loop—the stepping through the exponent one
// `win_exp_size`-bit window at a time.
{
const int rem = modlen % exp_win_size;
const uint64_t table_idx_mask = exp_win_mask;
int exp_bit_no = modlen - rem;
int exp_chunk_no = exp_bit_no / 64;
int exp_chunk_shift = exp_bit_no % 64;
uint64_t red_table_idx_1, red_table_idx_2;
// `rem` is { 1024, 1536, 2048 } % 5 which is { 4, 1, 3 }
// respectively.
//
// If this assertion ever fails then we should set this easy
// fix exp_bit_no = modlen - exp_win_size
assert(rem == 4 || rem == 1 || rem == 3);
// Find the location of the 5-bit window in the exponent which
// is stored in 64-bit digits. Left pad it with 0s to form a
// 64-bit digit to become an index in the precomputed table.
// The window location in the exponent is identified by its
// least significant bit `exp_bit_no`.
#define EXP_CHUNK(i) (exp_chunk_no) + ((i) * (exp_digits + 1))
#define EXP_CHUNK1(i) (exp_chunk_no) + 1 + ((i) * (exp_digits + 1))
// Process 1-st exp window - just init result
red_table_idx_1 = expz[EXP_CHUNK(0)];
red_table_idx_2 = expz[EXP_CHUNK(1)];
// The function operates with fixed moduli sizes divisible by
// 64, thus table index here is always in supported range [0,
// EXP_WIN_SIZE).
red_table_idx_1 >>= exp_chunk_shift;
red_table_idx_2 >>= exp_chunk_shift;
extract(&red_Y[0 * red_digits], (const uint64_t*)red_table,
(int)red_table_idx_1, (int)red_table_idx_2);
// Process other exp windows
for (exp_bit_no -= exp_win_size; exp_bit_no >= 0; exp_bit_no -= exp_win_size) {
// Extract pre-computed multiplier from the table
{
uint64_t T;
exp_chunk_no = exp_bit_no / 64;
exp_chunk_shift = exp_bit_no % 64;
{
red_table_idx_1 = expz[EXP_CHUNK(0)];
T = expz[EXP_CHUNK1(0)];
red_table_idx_1 >>= exp_chunk_shift;
// Get additional bits from then next quadword
// when 64-bit boundaries are crossed.
if (exp_chunk_shift > 64 - exp_win_size) {
T <<= (64 - exp_chunk_shift);
red_table_idx_1 ^= T;
}
red_table_idx_1 &= table_idx_mask;
}
{
red_table_idx_2 = expz[EXP_CHUNK(1)];
T = expz[EXP_CHUNK1(1)];
red_table_idx_2 >>= exp_chunk_shift;
// Get additional bits from then next quadword
// when 64-bit boundaries are crossed.
if (exp_chunk_shift > 64 - exp_win_size) {
T <<= (64 - exp_chunk_shift);
red_table_idx_2 ^= T;
}
red_table_idx_2 &= table_idx_mask;
}
extract(&red_X[0 * red_digits], (const uint64_t*)red_table,
(int)red_table_idx_1, (int)red_table_idx_2);
}
// The number of squarings is equal to the window size.
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
DAMS((uint64_t*)red_Y, (const uint64_t*)red_Y, m, k0);
damm((uint64_t*)red_Y, (const uint64_t*)red_Y, (const uint64_t*)red_X, m, k0);
}
}
// NB: After the last AMM of exponentiation in Montgomery domain, the result
// may be (modlen + 1), but the conversion out of Montgomery domain
// performs an AMM(x,1) which guarantees that the final result is less than
// |m|, so no conditional subtraction is needed here. See [1] for details.
//
// [1] Gueron, S. Efficient software implementations of modular exponentiation.
// DOI: 10.1007/s13389-012-0031-5
// Convert exponentiation result out of Montgomery form but still
// in the redundant DIGIT_SIZE-bit representation.
memset(red_X, 0, 2 * red_digits * sizeof(uint64_t));
red_X[0 * red_digits] = 1;
red_X[1 * red_digits] = 1;
damm(out, (const uint64_t*)red_Y, (const uint64_t*)red_X, m, k0);
ret = 1;
err:
if (storage != NULL) {
// Clear whole storage
OPENSSL_cleanse(storage, storage_len_bytes);
OPENSSL_free(storage);
}
#undef DAMS
return ret;
}
// Assemble a 64-bit value from the |in_len| little-endian bytes at |in|.
// |in_len| must be at most 8.
OPENSSL_INLINE uint64_t get_digit(const uint8_t *in, int in_len)
{
    uint64_t digit = 0;
    int i;

    assert(in != NULL);
    assert(in_len <= 8);

    for (i = 0; i < in_len; i++) {
        digit |= ((uint64_t)in[i]) << (8 * i);
    }
    return digit;
}
// Convert an array of 64-bit words |in|, spanning |in_bitsize| bits, into the
// redundant base-2^52 representation: each output word carries DIGIT_SIZE
// (52) payload bits so the IFMA multiply/add instructions have headroom for
// carries. Exactly |out_len| digits are written to |out|; digits beyond the
// converted input are zeroed.
//
// NOTE(review): the byte-wise repacking (8-byte memcpy, then advancing the
// byte cursor by 6 or 7) assumes a little-endian representation of |in|.
static void to_words52(uint64_t *out, int out_len,
                       const uint64_t *in, int in_bitsize)
{
    // View the input as bytes without casting away const; the input is never
    // written through |in_str|.
    const uint8_t *in_str = NULL;
    assert(out != NULL);
    assert(in != NULL);
    // Check destination buffer capacity
    assert(out_len >= number_of_digits(in_bitsize, DIGIT_SIZE));
    in_str = (const uint8_t *)in;
    // Main loop: consume 104 bits (13 bytes) per iteration, producing two
    // 52-bit digits.
    for (; in_bitsize >= (2 * DIGIT_SIZE); in_bitsize -= (2 * DIGIT_SIZE), out += 2) {
        uint64_t digit;
        memcpy(&digit, in_str, sizeof(digit));
        out[0] = digit & DIGIT_MASK;
        in_str += 6;
        memcpy(&digit, in_str, sizeof(digit));
        // The second digit starts 4 bits into this 8-byte load.
        out[1] = (digit >> 4) & DIGIT_MASK;
        in_str += 7;
        out_len -= 2;
    }
    // Tail: fewer than 104 bits remain. |get_digit| is used so we never read
    // past the end of the input.
    if (in_bitsize > DIGIT_SIZE) {
        uint64_t digit = get_digit(in_str, 7);
        out[0] = digit & DIGIT_MASK;
        in_str += 6;
        in_bitsize -= DIGIT_SIZE;
        digit = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
        out[1] = digit >> 4;
        out += 2;
        out_len -= 2;
    } else if (in_bitsize > 0) {
        out[0] = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
        out++;
        out_len--;
    }
    // Zero-fill any remaining output digits.
    while (out_len > 0) {
        *out = 0;
        out_len--;
        out++;
    }
}
// Store the low |out_len| bytes of |digit| into |out| in little-endian
// order. |out_len| must be at most 8.
OPENSSL_INLINE void put_digit(uint8_t *out, int out_len, uint64_t digit)
{
    int i;

    assert(out != NULL);
    assert(out_len <= 8);

    for (i = 0; i < out_len; i++) {
        out[i] = (uint8_t)(digit >> (8 * i));
    }
}
// Convert array of words in redundant (base=2^52) representation to
// array of words in regular (base=2^64) one. This is because the
// multiply/add instruction uses 52-bit representations to leave room
// for carries.
//
// |out| receives BITS2WORD64_SIZE(out_bitsize) words; |in| supplies 52-bit
// digits. NOTE(review): the overlapping 8-byte memcpy stores below rely on a
// little-endian byte layout of |out| — confirm if porting.
static void from_words52(uint64_t *out, int out_bitsize, const uint64_t *in)
{
    int i;
    int out_len = BITS2WORD64_SIZE(out_bitsize);
    assert(out != NULL);
    assert(in != NULL);
    // Clear the destination first; the byte-wise stores below only touch the
    // bytes actually covered by the input digits.
    for (i = 0; i < out_len; i++)
        out[i] = 0;
    {
        uint8_t *out_str = (uint8_t *)out;
        // Main loop: emit 104 bits (13 bytes) per iteration from two 52-bit
        // digits. Each 8-byte store is partially overwritten by the next one
        // (the cursor advances only 6 then 7 bytes).
        for (; out_bitsize >= (2 * DIGIT_SIZE);
             out_bitsize -= (2 * DIGIT_SIZE), in += 2) {
            uint64_t digit;
            digit = in[0];
            memcpy(out_str, &digit, sizeof(digit));
            out_str += 6;
            // Bits 48..51 of in[0] join the low bits of in[1], shifted into
            // position 4.
            digit = digit >> 48 | in[1] << 4;
            memcpy(out_str, &digit, sizeof(digit));
            out_str += 7;
        }
        // Tail: fewer than 104 bits remain. |put_digit| bounds the stores so
        // we never write past the end of |out|.
        if (out_bitsize > DIGIT_SIZE) {
            put_digit(out_str, 7, in[0]);
            out_str += 6;
            out_bitsize -= DIGIT_SIZE;
            put_digit(out_str, BITS2WORD8_SIZE(out_bitsize),
                      (in[1] << 4 | in[0] >> 48));
        } else if (out_bitsize) {
            put_digit(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]);
        }
    }
}
// Set the bit at index |idx| (counting from the least-significant bit of
// a[0]) in the word array |a|. No bounds checking is performed; the caller
// must ensure |idx| is valid.
OPENSSL_INLINE void set_bit(uint64_t *a, int idx)
{
    assert(a != NULL);
    a[idx / BN_BITS2] |= ((uint64_t)1) << (idx % BN_BITS2);
}
#endif

View File

@@ -0,0 +1,311 @@
// Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <string.h>
#include <openssl/err.h>
#include <openssl/type_check.h>
#include "internal.h"
// BN_lshift sets |r| to |a| shifted left by |n| bits. |r| and |a| may alias.
// Returns one on success and zero on allocation failure or if |n| is
// negative.
int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) {
  int i, nw, lb, rb;
  BN_ULONG *t, *f;
  BN_ULONG l;
  if (n < 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  r->neg = a->neg;
  // Split the shift into |nw| whole words and |lb| remaining bits.
  nw = n / BN_BITS2;
  if (!bn_wexpand(r, a->width + nw + 1)) {
    return 0;
  }
  lb = n % BN_BITS2;
  // |rb| is only used in the |lb| != 0 branch, so the shift by |rb| below is
  // never a (undefined) full-width shift.
  rb = BN_BITS2 - lb;
  f = a->d;
  t = r->d;
  t[a->width + nw] = 0;
  if (lb == 0) {
    // Word-aligned shift. Copy from the top down so the loop is safe when
    // |r| aliases |a|.
    for (i = a->width - 1; i >= 0; i--) {
      t[nw + i] = f[i];
    }
  } else {
    // General shift, also top-down for in-place safety. Each source word
    // contributes its high |lb| bits to the word above (merged with |=,
    // since that word already holds bits from the previous iteration) and
    // its remaining bits to the current word.
    for (i = a->width - 1; i >= 0; i--) {
      l = f[i];
      t[nw + i + 1] |= l >> rb;
      t[nw + i] = l << lb;
    }
  }
  // Zero the |nw| low words vacated by the shift.
  OPENSSL_memset(t, 0, nw * sizeof(t[0]));
  r->width = a->width + nw + 1;
  bn_set_minimal_width(r);
  return 1;
}
// BN_lshift1 sets |r| to |a| shifted left by one bit (doubled). |r| and |a|
// may alias. Returns one on success and zero on allocation failure.
int BN_lshift1(BIGNUM *r, const BIGNUM *a) {
  BN_ULONG *ap, *rp, t, c;
  int i;
  if (r != a) {
    r->neg = a->neg;
    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
    r->width = a->width;
  } else {
    // In-place: |r| already has |a|'s width and sign; just reserve room for
    // a possible carry word.
    if (!bn_wexpand(r, a->width + 1)) {
      return 0;
    }
  }
  ap = a->d;
  rp = r->d;
  c = 0;  // Carry out of the previous word.
  for (i = 0; i < a->width; i++) {
    t = *(ap++);
    *(rp++) = (t << 1) | c;
    c = t >> (BN_BITS2 - 1);
  }
  // A final carry grows the result by one word.
  if (c) {
    *rp = 1;
    r->width++;
  }
  return 1;
}
// bn_rshift_words writes |a| >> |shift| into |r|, where both are arrays of
// |num| words. |r| and |a| may alias: the loop runs from low to high indices,
// so each source word is consumed before it could be overwritten.
void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift,
                     size_t num) {
  const size_t word_shift = shift / BN_BITS2;
  const unsigned bit_shift = shift % BN_BITS2;

  // Shifting out the entire array yields zero.
  if (word_shift >= num) {
    OPENSSL_memset(r, 0, num * sizeof(BN_ULONG));
    return;
  }

  const size_t kept = num - word_shift;
  if (bit_shift == 0) {
    // Word-aligned shift is a plain move.
    OPENSSL_memmove(r, a + word_shift, kept * sizeof(BN_ULONG));
  } else {
    size_t i;
    for (i = 0; i + 1 < kept; i++) {
      r[i] = (a[i + word_shift] >> bit_shift) |
             (a[i + word_shift + 1] << (BN_BITS2 - bit_shift));
    }
    r[kept - 1] = a[num - 1] >> bit_shift;
  }

  // Zero the vacated high words.
  OPENSSL_memset(r + kept, 0, word_shift * sizeof(BN_ULONG));
}
// BN_rshift sets |r| to |a| shifted right by |n| bits. |r| and |a| may
// alias. Returns one on success and zero on allocation failure or if |n|
// is negative.
int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
  if (n < 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (!bn_wexpand(r, a->width)) {
    return 0;
  }
  bn_rshift_words(r->d, a->d, n, a->width);
  r->neg = a->neg;
  r->width = a->width;
  bn_set_minimal_width(r);
  return 1;
}
// bn_rshift_secret_shift behaves like |BN_rshift| but treats the shift
// amount |n| as secret: its memory access pattern and running time depend
// only on |a->width| (which is public), not on |n| or the value of |a|.
// Returns one on success and zero on error.
int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a, unsigned n,
                           BN_CTX *ctx) {
  int ret = 0;
  BN_CTX_start(ctx);
  BIGNUM *tmp = BN_CTX_get(ctx);
  if (tmp == NULL ||
      !BN_copy(r, a) ||
      !bn_wexpand(tmp, r->width)) {
    goto err;
  }
  // Shift conditionally by powers of two: for each bit of |n|, compute the
  // shifted value unconditionally into |tmp| and use a constant-time select
  // to either keep or discard it, so no branch depends on |n|.
  unsigned max_bits = BN_BITS2 * r->width;
  for (unsigned i = 0; (max_bits >> i) != 0; i++) {
    // |mask| is all-ones when bit |i| of |n| is set, all-zeros otherwise.
    BN_ULONG mask = (n >> i) & 1;
    mask = 0 - mask;
    bn_rshift_words(tmp->d, r->d, 1u << i, r->width);
    bn_select_words(r->d, mask, tmp->d /* apply shift */,
                    r->d /* ignore shift */, r->width);
  }
  ret = 1;
err:
  BN_CTX_end(ctx);
  return ret;
}
// bn_rshift1_words writes |a| >> 1 into |r|; both are arrays of |num|
// words. |r| and |a| may alias, since each source word is read before the
// destination slot below it is written.
void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num) {
  size_t i;
  if (num == 0) {
    return;
  }
  for (i = 1; i < num; i++) {
    r[i - 1] = (a[i - 1] >> 1) | (a[i] << (BN_BITS2 - 1));
  }
  r[num - 1] = a[num - 1] >> 1;
}
// BN_rshift1 sets |r| to |a| shifted right by one bit (halved, rounding
// toward zero in magnitude). |r| and |a| may alias. Returns one on success
// and zero on allocation failure.
int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
  if (!bn_wexpand(r, a->width)) {
    return 0;
  }
  bn_rshift1_words(r->d, a->d, a->width);
  r->neg = a->neg;
  r->width = a->width;
  bn_set_minimal_width(r);
  return 1;
}
// BN_set_bit sets bit |n| of |a| to one, growing |a| as needed. Returns one
// on success and zero on allocation failure or if |n| is negative.
int BN_set_bit(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word >= a->width) {
    // Grow |a| and zero-fill the newly exposed words.
    if (!bn_wexpand(a, word + 1)) {
      return 0;
    }
    OPENSSL_memset(a->d + a->width, 0,
                   (word + 1 - a->width) * sizeof(BN_ULONG));
    a->width = word + 1;
  }

  a->d[word] |= ((BN_ULONG)1) << bit;
  return 1;
}
// BN_clear_bit sets bit |n| of |a| to zero. Returns one on success and zero
// if |n| is negative or beyond the width of |a| (in which case the bit was
// already zero).
int BN_clear_bit(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word >= a->width) {
    return 0;
  }

  a->d[word] &= ~(((BN_ULONG)1) << bit);
  bn_set_minimal_width(a);
  return 1;
}
// bn_is_bit_set_words returns one if bit |bit| of the |num|-word array |a|
// is set and zero otherwise (including when |bit| is out of range).
int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit) {
  const size_t word = bit / BN_BITS2;
  if (word >= num) {
    return 0;
  }
  return (int)((a[word] >> (bit % BN_BITS2)) & 1);
}
// BN_is_bit_set returns one if bit |n| of |a| is set and zero otherwise
// (including when |n| is negative or out of range).
int BN_is_bit_set(const BIGNUM *a, int n) {
  return n < 0 ? 0 : bn_is_bit_set_words(a->d, a->width, (size_t)n);
}
// BN_mask_bits truncates |a| to its lowest |n| bits (|a| mod 2^n, keeping
// the sign flag untouched). Returns one on success and zero if |n| is
// negative. If |a| already fits in |n| bits it is left unchanged.
int BN_mask_bits(BIGNUM *a, int n) {
  if (n < 0) {
    return 0;
  }

  const int word = n / BN_BITS2;
  const int bit = n % BN_BITS2;
  if (word < a->width) {
    if (bit == 0) {
      // Cut on a word boundary: simply drop the high words.
      a->width = word;
    } else {
      // Keep the low |bit| bits of the boundary word.
      a->d[word] &= ~(BN_MASK2 << bit);
      a->width = word + 1;
    }
    bn_set_minimal_width(a);
  }
  return 1;
}
// bn_count_low_zero_bits_word returns the number of low (trailing) zero
// bits in |l|, computed in constant time by a binary search over halves of
// the word. Note that for |l| == 0 the result is BN_BITS2 - 1, not
// BN_BITS2; the sole caller, |BN_count_low_zero_bits|, masks out the
// zero-word case so this value is never used.
static int bn_count_low_zero_bits_word(BN_ULONG l) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(sizeof(int) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(BN_BITS2 == sizeof(BN_ULONG) * 8,
                        BN_ULONG_has_padding_bits)
  // C has very bizarre rules for types smaller than an int.
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) >= sizeof(int),
                        BN_ULONG_gets_promoted_to_int)
  crypto_word_t mask;
  int bits = 0;
#if BN_BITS2 > 32
  // Check if the lower half of |l| is all zero. Shifting left by
  // BN_BITS2 - 32 discards the upper half, so the result is zero exactly
  // when the low 32 bits are zero.
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 32));
  // If the lower half is all zeros, it is included in the bit count and we
  // count the upper half. Otherwise, we count the lower half.
  bits += 32 & mask;
  l = constant_time_select_w(mask, l >> 32, l);
#endif
  // The remaining blocks are analogous iterations at lower powers of two.
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 16));
  bits += 16 & mask;
  l = constant_time_select_w(mask, l >> 16, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 8));
  bits += 8 & mask;
  l = constant_time_select_w(mask, l >> 8, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 4));
  bits += 4 & mask;
  l = constant_time_select_w(mask, l >> 4, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 2));
  bits += 2 & mask;
  l = constant_time_select_w(mask, l >> 2, l);
  mask = constant_time_is_zero_w(l << (BN_BITS2 - 1));
  bits += 1 & mask;
  return bits;
}
// BN_count_low_zero_bits returns the number of low (trailing) zero bits in
// |bn|, or zero if |bn| is zero. It runs in time dependent only on
// |bn->width| (public), not on the value of |bn|.
int BN_count_low_zero_bits(const BIGNUM *bn) {
  OPENSSL_STATIC_ASSERT(sizeof(BN_ULONG) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  OPENSSL_STATIC_ASSERT(sizeof(int) <= sizeof(crypto_word_t),
                        crypto_word_t_is_too_small)
  int ret = 0;
  crypto_word_t saw_nonzero = 0;
  // Scan every word unconditionally; |first_nonzero| is all-ones only for
  // the lowest non-zero word, so exactly one iteration contributes to |ret|.
  for (int i = 0; i < bn->width; i++) {
    crypto_word_t nonzero = ~constant_time_is_zero_w(bn->d[i]);
    crypto_word_t first_nonzero = ~saw_nonzero & nonzero;
    saw_nonzero |= nonzero;
    int bits = bn_count_low_zero_bits_word(bn->d[i]);
    ret |= first_nonzero & (i * BN_BITS2 + bits);
  }
  // If got to the end of |bn| and saw no non-zero words, |bn| is zero. |ret|
  // will then remain zero.
  return ret;
}

View File

@@ -0,0 +1,450 @@
// Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de> and Bodo Moeller for the OpenSSL project.
// Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include <openssl/bn.h>
#include <openssl/err.h>
#include "internal.h"
BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
  // Compute a square root of |a| mod |p| using the Tonelli/Shanks algorithm
  // (cf. Henri Cohen, "A Course in Algebraic Computational Number Theory",
  // algorithm 1.5.1). |p| must be prime, otherwise an error or
  // an incorrect "result" will be returned.
  //
  // The answer is written to |in| if non-NULL, otherwise a freshly allocated
  // BIGNUM, and returned. NULL is returned on error, including when |a| is
  // not a square mod |p|.
  BIGNUM *ret = in;
  int err = 1;
  int r;
  BIGNUM *A, *b, *q, *t, *x, *y;
  int e, i, j;
  if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
    if (BN_abs_is_word(p, 2)) {
      // p = 2: every residue is its own square root; it equals the low bit
      // of |a|.
      if (ret == NULL) {
        ret = BN_new();
      }
      if (ret == NULL ||
          !BN_set_word(ret, BN_is_bit_set(a, 0))) {
        if (ret != in) {
          BN_free(ret);
        }
        return NULL;
      }
      return ret;
    }
    OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
    return NULL;
  }
  if (BN_is_zero(a) || BN_is_one(a)) {
    // 0 and 1 are their own square roots.
    if (ret == NULL) {
      ret = BN_new();
    }
    if (ret == NULL ||
        !BN_set_word(ret, BN_is_one(a))) {
      if (ret != in) {
        BN_free(ret);
      }
      return NULL;
    }
    return ret;
  }
  BN_CTX_start(ctx);
  A = BN_CTX_get(ctx);
  b = BN_CTX_get(ctx);
  q = BN_CTX_get(ctx);
  t = BN_CTX_get(ctx);
  x = BN_CTX_get(ctx);
  y = BN_CTX_get(ctx);
  if (y == NULL) {
    goto end;
  }
  if (ret == NULL) {
    ret = BN_new();
  }
  if (ret == NULL) {
    goto end;
  }
  // A = a mod p
  if (!BN_nnmod(A, a, p, ctx)) {
    goto end;
  }
  // now write |p| - 1 as 2^e*q where q is odd
  e = 1;
  while (!BN_is_bit_set(p, e)) {
    e++;
  }
  // we'll set q later (if needed)
  if (e == 1) {
    // The easy case: (|p|-1)/2 is odd, so 2 has an inverse
    // modulo (|p|-1)/2, and square roots can be computed
    // directly by modular exponentiation.
    // We have
    //     2 * (|p|+1)/4 == 1   (mod (|p|-1)/2),
    // so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
    if (!BN_rshift(q, p, 2)) {
      goto end;
    }
    q->neg = 0;
    if (!BN_add_word(q, 1) ||
        !BN_mod_exp_mont(ret, A, q, p, ctx, NULL)) {
      goto end;
    }
    err = 0;
    goto vrfy;
  }
  if (e == 2) {
    // |p| == 5  (mod 8)
    //
    // In this case 2 is always a non-square since
    // Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
    // So if a really is a square, then 2*a is a non-square.
    // Thus for
    //      b := (2*a)^((|p|-5)/8),
    //      i := (2*a)*b^2
    // we have
    //     i^2 = (2*a)^((1 + (|p|-5)/4)*2)
    //         = (2*a)^((p-1)/2)
    //         = -1;
    // so if we set
    //      x := a*b*(i-1),
    // then
    //     x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
    //         = a^2 * b^2 * (-2*i)
    //         = a*(-i)*(2*a*b^2)
    //         = a*(-i)*i
    //         = a.
    //
    // (This is due to A.O.L. Atkin,
    // <URL:http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
    // November 1992.)
    // t := 2*a
    if (!bn_mod_lshift1_consttime(t, A, p, ctx)) {
      goto end;
    }
    // b := (2*a)^((|p|-5)/8)
    if (!BN_rshift(q, p, 3)) {
      goto end;
    }
    q->neg = 0;
    if (!BN_mod_exp_mont(b, t, q, p, ctx, NULL)) {
      goto end;
    }
    // y := b^2
    if (!BN_mod_sqr(y, b, p, ctx)) {
      goto end;
    }
    // t := (2*a)*b^2 - 1 (this is |i - 1| in the derivation above)
    if (!BN_mod_mul(t, t, y, p, ctx) ||
        !BN_sub_word(t, 1)) {
      goto end;
    }
    // x = a*b*t
    if (!BN_mod_mul(x, A, b, p, ctx) ||
        !BN_mod_mul(x, x, t, p, ctx)) {
      goto end;
    }
    if (!BN_copy(ret, x)) {
      goto end;
    }
    err = 0;
    goto vrfy;
  }
  // e > 2, so we really have to use the Tonelli/Shanks algorithm.
  // First, find some y that is not a square.
  if (!BN_copy(q, p)) {
    goto end;  // use 'q' as temp
  }
  q->neg = 0;
  i = 2;
  do {
    // For efficiency, try small numbers first;
    // if this fails, try random numbers.
    if (i < 22) {
      if (!BN_set_word(y, i)) {
        goto end;
      }
    } else {
      if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) {
        goto end;
      }
      if (BN_ucmp(y, p) >= 0) {
        if (!(p->neg ? BN_add : BN_sub)(y, y, p)) {
          goto end;
        }
      }
      // now 0 <= y < |p|
      if (BN_is_zero(y)) {
        if (!BN_set_word(y, i)) {
          goto end;
        }
      }
    }
    r = bn_jacobi(y, q, ctx);  // here 'q' is |p|
    if (r < -1) {
      // bn_jacobi signals an error with a value below -1.
      goto end;
    }
    if (r == 0) {
      // Jacobi symbol 0 means gcd(y, p) > 1, so |p| cannot be prime.
      OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
      goto end;
    }
  } while (r == 1 && ++i < 82);
  if (r != -1) {
    // Many rounds and still no non-square -- this is more likely
    // a bug than just bad luck.
    // Even if p is not prime, we should have found some y
    // such that r == -1.
    OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS);
    goto end;
  }
  // Here's our actual 'q':
  if (!BN_rshift(q, q, e)) {
    goto end;
  }
  // Now that we have some non-square, we can find an element
  // of order 2^e by computing its q'th power.
  if (!BN_mod_exp_mont(y, y, q, p, ctx, NULL)) {
    goto end;
  }
  if (BN_is_one(y)) {
    OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME);
    goto end;
  }
  // Now we know that (if p is indeed prime) there is an integer
  // k, 0 <= k < 2^e, such that
  //
  //      a^q * y^k == 1   (mod p).
  //
  // As a^q is a square and y is not, k must be even.
  // q+1 is even, too, so there is an element
  //
  //     X := a^((q+1)/2) * y^(k/2),
  //
  // and it satisfies
  //
  //     X^2 = a^q * a     * y^k
  //         = a,
  //
  // so it is the square root that we are looking for.
  // t := (q-1)/2  (note that q is odd)
  if (!BN_rshift1(t, q)) {
    goto end;
  }
  // x := a^((q-1)/2)
  if (BN_is_zero(t)) {  // special case: p = 2^e + 1
    if (!BN_nnmod(t, A, p, ctx)) {
      goto end;
    }
    if (BN_is_zero(t)) {
      // special case: a == 0  (mod p)
      BN_zero(ret);
      err = 0;
      goto end;
    } else if (!BN_one(x)) {
      goto end;
    }
  } else {
    if (!BN_mod_exp_mont(x, A, t, p, ctx, NULL)) {
      goto end;
    }
    if (BN_is_zero(x)) {
      // special case: a == 0  (mod p)
      BN_zero(ret);
      err = 0;
      goto end;
    }
  }
  // b := a*x^2  (= a^q)
  if (!BN_mod_sqr(b, x, p, ctx) ||
      !BN_mod_mul(b, b, A, p, ctx)) {
    goto end;
  }
  // x := a*x  (= a^((q+1)/2))
  if (!BN_mod_mul(x, x, A, p, ctx)) {
    goto end;
  }
  while (1) {
    // Now b is a^q * y^k for some even k (0 <= k < 2^E
    // where E refers to the original value of e, which we
    // don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
    //
    // We have  a*b = x^2,
    //    y^2^(e-1) = -1,
    //    b^2^(e-1) = 1.
    if (BN_is_one(b)) {
      // k has been reduced to 0, so x is the root.
      if (!BN_copy(ret, x)) {
        goto end;
      }
      err = 0;
      goto vrfy;
    }
    // Find the smallest i, 0 < i < e, such that b^(2^i) = 1
    for (i = 1; i < e; i++) {
      if (i == 1) {
        if (!BN_mod_sqr(t, b, p, ctx)) {
          goto end;
        }
      } else {
        if (!BN_mod_mul(t, t, t, p, ctx)) {
          goto end;
        }
      }
      if (BN_is_one(t)) {
        break;
      }
    }
    // If not found, a is not a square or p is not a prime.
    if (i >= e) {
      OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
      goto end;
    }
    // t := y^2^(e - i - 1)
    if (!BN_copy(t, y)) {
      goto end;
    }
    for (j = e - i - 1; j > 0; j--) {
      if (!BN_mod_sqr(t, t, p, ctx)) {
        goto end;
      }
    }
    // Update the invariant: y := t^2, x := x*t, b := b*y.
    if (!BN_mod_mul(y, t, t, p, ctx) ||
        !BN_mod_mul(x, x, t, p, ctx) ||
        !BN_mod_mul(b, b, y, p, ctx)) {
      goto end;
    }
    // e decreases each iteration, so this loop will terminate.
    assert(i < e);
    e = i;
  }
vrfy:
  if (!err) {
    // Verify the result. The input might have been not a square.
    if (!BN_mod_sqr(x, ret, p, ctx)) {
      err = 1;
    }
    if (!err && 0 != BN_cmp(x, A)) {
      OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
      err = 1;
    }
  }
end:
  if (err) {
    if (ret != in) {
      BN_clear_free(ret);
    }
    ret = NULL;
  }
  BN_CTX_end(ctx);
  return ret;
}
// BN_sqrt sets |out_sqrt| to the integer square root of |in| and returns
// one, but only if |in| is a perfect square. Otherwise it fails with
// |BN_R_NOT_A_SQUARE| (or |BN_R_NEGATIVE_NUMBER| for negative input).
// |out_sqrt| may alias |in|.
int BN_sqrt(BIGNUM *out_sqrt, const BIGNUM *in, BN_CTX *ctx) {
  BIGNUM *estimate, *tmp, *delta, *last_delta, *tmp2;
  int ok = 0, last_delta_valid = 0;
  if (in->neg) {
    OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER);
    return 0;
  }
  if (BN_is_zero(in)) {
    BN_zero(out_sqrt);
    return 1;
  }
  BN_CTX_start(ctx);
  if (out_sqrt == in) {
    // Aliased output: iterate in a temporary so |in| stays intact, and copy
    // out at the end.
    estimate = BN_CTX_get(ctx);
  } else {
    estimate = out_sqrt;
  }
  tmp = BN_CTX_get(ctx);
  last_delta = BN_CTX_get(ctx);
  delta = BN_CTX_get(ctx);
  if (estimate == NULL || tmp == NULL || last_delta == NULL || delta == NULL) {
    goto err;
  }
  // We estimate that the square root of an n-bit number is 2^{n/2}.
  if (!BN_lshift(estimate, BN_value_one(), BN_num_bits(in)/2)) {
    goto err;
  }
  // This is Newton's method for finding a root of the equation |estimate|^2 -
  // |in| = 0.
  for (;;) {
    // |estimate| = 1/2 * (|estimate| + |in|/|estimate|)
    if (!BN_div(tmp, NULL, in, estimate, ctx) ||
        !BN_add(tmp, tmp, estimate) ||
        !BN_rshift1(estimate, tmp) ||
        // |tmp| = |estimate|^2
        !BN_sqr(tmp, estimate, ctx) ||
        // |delta| = |in| - |tmp|
        !BN_sub(delta, in, tmp)) {
      OPENSSL_PUT_ERROR(BN, ERR_R_BN_LIB);
      goto err;
    }
    delta->neg = 0;
    // The difference between |in| and |estimate| squared is required to always
    // decrease. This ensures that the loop always terminates, but I don't have
    // a proof that it always finds the square root for a given square.
    if (last_delta_valid && BN_cmp(delta, last_delta) >= 0) {
      break;
    }
    last_delta_valid = 1;
    // Swap |last_delta| and |delta| by pointer rather than copying.
    tmp2 = last_delta;
    last_delta = delta;
    delta = tmp2;
  }
  // |tmp| still holds |estimate|^2 from the final iteration; equality with
  // |in| decides whether |in| was a perfect square.
  if (BN_cmp(tmp, in) != 0) {
    OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE);
    goto err;
  }
  ok = 1;
err:
  if (ok && out_sqrt == in && !BN_copy(out_sqrt, estimate)) {
    ok = 0;
  }
  BN_CTX_end(ctx);
  return ok;
}