chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

235
vendor/sha2/src/sha512/aarch64.rs vendored Normal file
View File

@@ -0,0 +1,235 @@
// Implementation adapted from mbedtls.
use core::arch::{aarch64::*, asm};
use crate::consts::K64;
// Declares the `sha3_hwcap` detector for the "sha3" target feature; `compress`
// calls `sha3_hwcap::get()` to choose between the hardware and software paths.
cpufeatures::new!(sha3_hwcap, "sha3");
/// Folds every 128-byte block in `blocks` into the SHA-512 `state`.
///
/// Uses the hardware implementation when `sha3_hwcap::get()` reports the
/// feature, and the portable `super::soft` implementation otherwise.
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
    // after stabilization
    if !sha3_hwcap::get() {
        super::soft::compress(state, blocks);
        return;
    }
    // SAFETY: the runtime check above reported the "sha3" feature that
    // `sha512_compress` is compiled for.
    unsafe { sha512_compress(state, blocks) }
}
/// SHA-512 compression built on the `SHA512H`/`SHA512H2`/`SHA512SU0`/`SHA512SU1`
/// instructions (via the polyfills defined later in this file).
///
/// The eight state words are held as four two-lane vectors (`ab`, `cd`, `ef`,
/// `gh`). Each `vsha512hq_u64` + `vsha512h2q_u64` pair advances two rounds, and
/// the roles of the four vectors rotate from pair to pair instead of moving data.
///
/// # Safety
/// Compiled with `#[target_feature(enable = "sha3")]`; the caller must only call
/// this when the running CPU supports that feature (`compress` checks
/// `sha3_hwcap::get()` first).
#[target_feature(enable = "sha3")]
unsafe fn sha512_compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
// SAFETY: Requires the sha3 feature.
// Load state into vectors.
let mut ab = vld1q_u64(state[0..2].as_ptr());
let mut cd = vld1q_u64(state[2..4].as_ptr());
let mut ef = vld1q_u64(state[4..6].as_ptr());
let mut gh = vld1q_u64(state[6..8].as_ptr());
// Iterate through the message blocks.
for block in blocks {
// Keep original state values.
let ab_orig = ab;
let cd_orig = cd;
let ef_orig = ef;
let gh_orig = gh;
// Load the message block into vectors, assuming little endianness.
// `vrev64q_u8` reverses the bytes inside each 64-bit lane, turning the
// big-endian message words into native lane values.
let mut s0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[0..16].as_ptr())));
let mut s1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[16..32].as_ptr())));
let mut s2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[32..48].as_ptr())));
let mut s3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[48..64].as_ptr())));
let mut s4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[64..80].as_ptr())));
let mut s5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[80..96].as_ptr())));
let mut s6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[96..112].as_ptr())));
let mut s7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[112..128].as_ptr())));
// NOTE(review): every `vld1q_u64(&K64[i])` below loads two consecutive
// constants (K64[i] and K64[i + 1]) through a pointer derived from a
// reference to the single element K64[i] — confirm this is acceptable
// under the project's unsafe-code guidelines.
// Rounds 0 and 1
let mut initial_sum = vaddq_u64(s0, vld1q_u64(&K64[0]));
let mut sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
let mut intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds 2 and 3
initial_sum = vaddq_u64(s1, vld1q_u64(&K64[2]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds 4 and 5
initial_sum = vaddq_u64(s2, vld1q_u64(&K64[4]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds 6 and 7
initial_sum = vaddq_u64(s3, vld1q_u64(&K64[6]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
// Rounds 8 and 9
initial_sum = vaddq_u64(s4, vld1q_u64(&K64[8]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds 10 and 11
initial_sum = vaddq_u64(s5, vld1q_u64(&K64[10]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds 12 and 13
initial_sum = vaddq_u64(s6, vld1q_u64(&K64[12]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds 14 and 15
initial_sum = vaddq_u64(s7, vld1q_u64(&K64[14]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
// Rounds 16..80: four passes of 16 rounds. Before each round pair the
// matching schedule vector is extended in place with SHA512SU0/SHA512SU1,
// so `s0..s7` always hold the sixteen most recent schedule words.
for t in (16..80).step_by(16) {
// Rounds t and t + 1
s0 = vsha512su1q_u64(vsha512su0q_u64(s0, s1), s7, vextq_u64(s4, s5, 1));
initial_sum = vaddq_u64(s0, vld1q_u64(&K64[t]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds t + 2 and t + 3
s1 = vsha512su1q_u64(vsha512su0q_u64(s1, s2), s0, vextq_u64(s5, s6, 1));
initial_sum = vaddq_u64(s1, vld1q_u64(&K64[t + 2]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds t + 4 and t + 5
s2 = vsha512su1q_u64(vsha512su0q_u64(s2, s3), s1, vextq_u64(s6, s7, 1));
initial_sum = vaddq_u64(s2, vld1q_u64(&K64[t + 4]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds t + 6 and t + 7
s3 = vsha512su1q_u64(vsha512su0q_u64(s3, s4), s2, vextq_u64(s7, s0, 1));
initial_sum = vaddq_u64(s3, vld1q_u64(&K64[t + 6]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
// Rounds t + 8 and t + 9
s4 = vsha512su1q_u64(vsha512su0q_u64(s4, s5), s3, vextq_u64(s0, s1, 1));
initial_sum = vaddq_u64(s4, vld1q_u64(&K64[t + 8]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
gh = vsha512h2q_u64(intermed, cd, ab);
cd = vaddq_u64(cd, intermed);
// Rounds t + 10 and t + 11
s5 = vsha512su1q_u64(vsha512su0q_u64(s5, s6), s4, vextq_u64(s1, s2, 1));
initial_sum = vaddq_u64(s5, vld1q_u64(&K64[t + 10]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
ef = vsha512h2q_u64(intermed, ab, gh);
ab = vaddq_u64(ab, intermed);
// Rounds t + 12 and t + 13
s6 = vsha512su1q_u64(vsha512su0q_u64(s6, s7), s5, vextq_u64(s2, s3, 1));
initial_sum = vaddq_u64(s6, vld1q_u64(&K64[t + 12]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
cd = vsha512h2q_u64(intermed, gh, ef);
gh = vaddq_u64(gh, intermed);
// Rounds t + 14 and t + 15
s7 = vsha512su1q_u64(vsha512su0q_u64(s7, s0), s6, vextq_u64(s3, s4, 1));
initial_sum = vaddq_u64(s7, vld1q_u64(&K64[t + 14]));
sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
ab = vsha512h2q_u64(intermed, ef, cd);
ef = vaddq_u64(ef, intermed);
}
// Add the block-specific state to the original state.
ab = vaddq_u64(ab, ab_orig);
cd = vaddq_u64(cd, cd_orig);
ef = vaddq_u64(ef, ef_orig);
gh = vaddq_u64(gh, gh_orig);
}
// Store vectors into state.
vst1q_u64(state[0..2].as_mut_ptr(), ab);
vst1q_u64(state[2..4].as_mut_ptr(), cd);
vst1q_u64(state[4..6].as_mut_ptr(), ef);
vst1q_u64(state[6..8].as_mut_ptr(), gh);
}
// TODO remove these polyfills once SHA3 intrinsics land
/// Polyfill that issues a single `SHA512H` instruction.
///
/// `hash_ed` is bound as `inout`, so it is both the first source operand and the
/// destination; its updated value is returned. `hash_gf` and `kwh_kwh2` are
/// read-only inputs.
/// NOTE(review): the parameter names suggest which state/schedule words each
/// operand carries — confirm against the Arm instruction description.
///
/// # Safety
/// The instruction is emitted unconditionally.
/// NOTE(review): presumably the caller must guarantee CPU support for it, as
/// `sha512_compress` does via its `sha3` target feature — confirm.
#[inline(always)]
unsafe fn vsha512hq_u64(
mut hash_ed: uint64x2_t,
hash_gf: uint64x2_t,
kwh_kwh2: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512H {:q}, {:q}, {:v}.2D",
inout(vreg) hash_ed, in(vreg) hash_gf, in(vreg) kwh_kwh2,
// Declared as a pure register-only computation: no memory access, no
// stack use, condition flags untouched.
options(pure, nomem, nostack, preserves_flags)
);
hash_ed
}
/// Polyfill that issues a single `SHA512H2` instruction.
///
/// `sum_ab` is bound as `inout`, so it is both the first source operand and the
/// destination; its updated value is returned. `hash_c_` and `hash_ab` are
/// read-only inputs.
/// NOTE(review): the parameter names suggest which state words each operand
/// carries — confirm against the Arm instruction description.
///
/// # Safety
/// The instruction is emitted unconditionally.
/// NOTE(review): presumably the caller must guarantee CPU support for it, as
/// `sha512_compress` does via its `sha3` target feature — confirm.
#[inline(always)]
unsafe fn vsha512h2q_u64(
mut sum_ab: uint64x2_t,
hash_c_: uint64x2_t,
hash_ab: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512H2 {:q}, {:q}, {:v}.2D",
inout(vreg) sum_ab, in(vreg) hash_c_, in(vreg) hash_ab,
// Declared as a pure register-only computation: no memory access, no
// stack use, condition flags untouched.
options(pure, nomem, nostack, preserves_flags)
);
sum_ab
}
/// Polyfill that issues a single `SHA512SU0` instruction (first half of the
/// message-schedule update used by `sha512_compress`).
///
/// `w0_1` is bound as `inout`: it is a source operand and receives the result,
/// which is returned. `w2_` is a read-only input.
///
/// # Safety
/// The instruction is emitted unconditionally.
/// NOTE(review): presumably the caller must guarantee CPU support for it, as
/// `sha512_compress` does via its `sha3` target feature — confirm.
#[inline(always)]
unsafe fn vsha512su0q_u64(mut w0_1: uint64x2_t, w2_: uint64x2_t) -> uint64x2_t {
asm!(
"SHA512SU0 {:v}.2D, {:v}.2D",
inout(vreg) w0_1, in(vreg) w2_,
// Declared as a pure register-only computation: no memory access, no
// stack use, condition flags untouched.
options(pure, nomem, nostack, preserves_flags)
);
w0_1
}
/// Polyfill that issues a single `SHA512SU1` instruction (second half of the
/// message-schedule update used by `sha512_compress`).
///
/// `s01_s02` is bound as `inout`: it is a source operand and receives the
/// result, which is returned. `w14_15` and `w9_10` are read-only inputs.
///
/// # Safety
/// The instruction is emitted unconditionally.
/// NOTE(review): presumably the caller must guarantee CPU support for it, as
/// `sha512_compress` does via its `sha3` target feature — confirm.
#[inline(always)]
unsafe fn vsha512su1q_u64(
mut s01_s02: uint64x2_t,
w14_15: uint64x2_t,
w9_10: uint64x2_t,
) -> uint64x2_t {
asm!(
"SHA512SU1 {:v}.2D, {:v}.2D, {:v}.2D",
inout(vreg) s01_s02, in(vreg) w14_15, in(vreg) w9_10,
// Declared as a pure register-only computation: no memory access, no
// stack use, condition flags untouched.
options(pure, nomem, nostack, preserves_flags)
);
s01_s02
}

View File

@@ -0,0 +1,242 @@
//! LoongArch64 assembly backend
/// Joins a whitespace-separated run of literal pieces into a single string
/// literal via `concat!`, so assembly templates can be assembled from fragments.
macro_rules! c {
    ($($piece:expr)*) => {
        concat!($($piece),*)
    };
}
/// Assembly text for one of the first sixteen rounds (`$i` in 0..16).
///
/// The message word comes straight from the input block: the 64-bit value at
/// byte offset `$i * 8` from `$a1` is loaded into `$a5` and byte-reversed with
/// `revb.d`, then the shared round body from `roundtail!` is appended.
/// `$a`..`$h` are the register names currently playing the roles of the eight
/// working variables.
macro_rules! rounda {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
"ld.d $a5, $a1, (" $i " * 8);"
"revb.d $a5, $a5;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
/// Assembly text for one of the later rounds (`$i` in 16..80).
///
/// The message word is derived from earlier schedule words, which
/// `roundtail!` keeps in a 16-slot ring buffer at `$sp` (slot `index & 0xF`).
/// The new word is accumulated in `$a5`, then the shared round body from
/// `roundtail!` is appended. `$a`..`$h` are the register names currently
/// playing the roles of the eight working variables.
macro_rules! roundb {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
// $a4 = W[i-15], $a5 = W[i-16] + W[i-7]
"ld.d $a4, $sp, (((" $i " - 15) & 0xF) * 8);"
"ld.d $a5, $sp, (((" $i " - 16) & 0xF) * 8);"
"ld.d $a6, $sp, (((" $i " - 7) & 0xF) * 8);"
"add.d $a5, $a5, $a6;"
// $a5 += rotr(W[i-15], 1) ^ rotr(W[i-15], 8) ^ (W[i-15] >> 7)
"rotri.d $a6, $a4, 8;"
"srli.d $a7, $a4, 7;"
"rotri.d $a4, $a4, 1;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d $a5, $a5, $a4;"
// $a5 += rotr(W[i-2], 19) ^ rotr(W[i-2], 61) ^ (W[i-2] >> 6)
"ld.d $a4, $sp, (((" $i " - 2) & 0xF) * 8);"
"rotri.d $a6, $a4, 61;"
"srli.d $a7, $a4, 6;"
"rotri.d $a4, $a4, 19;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d $a5, $a5, $a4;"
roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
)
};
}
/// Shared body of every round, appended by `rounda!` and `roundb!`.
///
/// Expects the round's message word in `$a5` and the round-constant table
/// pointer in `$a3`. On exit the register named `$h` holds the new first
/// working variable and `$d` the new fifth one; callers rotate the register
/// names passed as `$a`..`$h` from round to round instead of moving values.
/// The message word is also written to ring-buffer slot `$i & 0xF` at `$sp`
/// for later use by `roundb!`.
macro_rules! roundtail {
($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
c!(
// Part 0: $h += W + (rotr(e,14) ^ rotr(e,18) ^ rotr(e,41))
//               + (g ^ (e & (f ^ g))) + K[$i]
"rotri.d $a6, " $e ", 18;"
"rotri.d $a7, " $e ", 41;"
"rotri.d $a4, " $e ", 14;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"xor $a6, " $g ", " $f ";"
"ld.d $a7, $a3, " $i " * 8;"
"and $a6, $a6, " $e ";"
"xor $a6, $a6, " $g ";"
"add.d $a4, $a4, $a6;"
"add.d $a4, $a4, $a7;"
"add.d " $h ", " $h ", $a5;"
"add.d " $h ", " $h ", $a4;"
// Part 1: $d += $h
"add.d " $d ", " $d ", " $h ";"
// Part 2: $h += (rotr(a,28) ^ rotr(a,34) ^ rotr(a,39))
//               + (((c | b) & a) | (c & b))
"rotri.d $a6, " $a ", 39;"
"rotri.d $a7, " $a ", 34;"
"rotri.d $a4, " $a ", 28;"
"xor $a6, $a6, $a7;"
"xor $a4, $a4, $a6;"
"add.d " $h ", " $h ", $a4;"
"or $a4, " $c ", " $b ";"
"and $a6, " $c ", " $b ";"
"and $a4, $a4, " $a ";"
"or $a4, $a4, $a6;"
"add.d " $h ", " $h ", $a4;"
// Remember this round's message word for the schedule of later rounds.
"st.d $a5, $sp, ((" $i " & 0xF) * 8);"
)
};
}
/// Folds every 128-byte block in `blocks` into the SHA-512 `state` using a
/// single inline-assembly loop.
///
/// Register roles inside the assembly: `$a0` = `state`, `$a1` = current block
/// pointer, `$a2` = remaining block count, `$a3` = `K64` table pointer,
/// `$a4`..`$a7` = scratch, `$t0`..`$t7` = the eight working variables. A
/// 128-byte area is reserved on the stack for the 16-word message-schedule
/// ring buffer and released before the assembly ends.
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
// The assembly loop tests its counter only after processing a block, so it
// must never be entered with zero blocks.
if blocks.is_empty() {
return;
}
unsafe {
core::arch::asm!(
// Allocate scratch stack space
"addi.d $sp, $sp, -128;",
// Load state
"ld.d $t0, $a0, 0",
"ld.d $t1, $a0, 8",
"ld.d $t2, $a0, 16",
"ld.d $t3, $a0, 24",
"ld.d $t4, $a0, 32",
"ld.d $t5, $a0, 40",
"ld.d $t6, $a0, 48",
"ld.d $t7, $a0, 56",
"42:",
// Do 80 rounds of hashing (16 `rounda!` followed by 64 `roundb!`)
rounda!( 0, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 1, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!( 2, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!( 3, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!( 7, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
rounda!( 8, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
rounda!( 9, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
rounda!(10, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
rounda!(11, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
rounda!(15, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(16, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(17, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(18, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(19, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(23, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(24, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(25, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(26, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(27, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(64, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(65, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(66, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(67, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(68, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(69, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(70, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(71, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
roundb!(72, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
roundb!(73, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"),
roundb!(74, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"),
roundb!(75, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"),
roundb!(76, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"),
roundb!(77, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"),
roundb!(78, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"),
roundb!(79, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
// Update state registers: add the state as it was before this block,
// which is still in memory at $a0.
"ld.d $a4, $a0, 0", // a
"ld.d $a5, $a0, 8", // b
"ld.d $a6, $a0, 16", // c
"ld.d $a7, $a0, 24", // d
"add.d $t0, $t0, $a4",
"add.d $t1, $t1, $a5",
"add.d $t2, $t2, $a6",
"add.d $t3, $t3, $a7",
"ld.d $a4, $a0, 32", // e
"ld.d $a5, $a0, 40", // f
"ld.d $a6, $a0, 48", // g
"ld.d $a7, $a0, 56", // h
"add.d $t4, $t4, $a4",
"add.d $t5, $t5, $a5",
"add.d $t6, $t6, $a6",
"add.d $t7, $t7, $a7",
// Save updated state
"st.d $t0, $a0, 0",
"st.d $t1, $a0, 8",
"st.d $t2, $a0, 16",
"st.d $t3, $a0, 24",
"st.d $t4, $a0, 32",
"st.d $t5, $a0, 40",
"st.d $t6, $a0, 48",
"st.d $t7, $a0, 56",
// Looping over blocks: advance the block pointer by one block and
// repeat while the remaining count is non-zero.
"addi.d $a1, $a1, 128",
"addi.d $a2, $a2, -1",
"bnez $a2, 42b",
// Restore stack register
"addi.d $sp, $sp, 128",
in("$a0") state,
inout("$a1") blocks.as_ptr() => _,
inout("$a2") blocks.len() => _,
in("$a3") crate::consts::K64.as_ptr(),
// Clobbers
out("$a4") _,
out("$a5") _,
out("$a6") _,
out("$a7") _,
out("$t0") _,
out("$t1") _,
out("$t2") _,
out("$t3") _,
out("$t4") _,
out("$t5") _,
out("$t6") _,
out("$t7") _,
options(preserves_flags),
);
}
}

215
vendor/sha2/src/sha512/soft.rs vendored Normal file
View File

@@ -0,0 +1,215 @@
#![allow(clippy::many_single_char_names)]
use crate::consts::{BLOCK_LEN, K64X2};
use core::convert::TryInto;
/// Element-wise wrapping addition of two `[u64; 2]` pairs.
fn add(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
    let [a_first, a_second] = a;
    let [b_first, b_second] = b;
    [a_first.wrapping_add(b_first), a_second.wrapping_add(b_second)]
}
/// Not an intrinsic, but behaves like an unaligned load straddling two pairs:
/// returns the second element of `v1` followed by the first element of `v0`.
fn sha512load(v0: [u64; 2], v1: [u64; 2]) -> [u64; 2] {
    let [v0_first, _] = v0;
    let [_, v1_second] = v1;
    [v1_second, v0_first]
}
/// Computes the next two SHA-512 message-schedule words.
///
/// Every pair is laid out as `[higher-index word, lower-index word]`:
/// `v0 = [w1, w0]`, `v1 = [_, w2]`, `v4to5 = [w10, w9]`, `v7 = [w15, w14]`.
/// The result is `[w17, w16]`.
pub fn sha512_schedule_x2(v0: [u64; 2], v1: [u64; 2], v4to5: [u64; 2], v7: [u64; 2]) -> [u64; 2] {
    // Small sigma functions of the message schedule.
    let small_sigma0 = |x: u64| x.rotate_right(1) ^ x.rotate_right(8) ^ (x >> 7);
    let small_sigma1 = |x: u64| x.rotate_right(19) ^ x.rotate_right(61) ^ (x >> 6);

    let [w1, w0] = v0;
    let w2 = v1[1];
    let [w10, w9] = v4to5;
    let [w15, w14] = v7;

    let w16 = w0
        .wrapping_add(small_sigma0(w1))
        .wrapping_add(w9)
        .wrapping_add(small_sigma1(w14));
    let w17 = w1
        .wrapping_add(small_sigma0(w2))
        .wrapping_add(w10)
        .wrapping_add(small_sigma1(w15));

    [w17, w16]
}
/// Performs one round of the SHA-512 message block digest.
///
/// The working variables arrive paired as `ae = [a, e]`, `bf = [b, f]`,
/// `cg = [c, g]`, `dh = [d, h]`; `wk0` is the value added alongside `h`
/// (callers pass a message word already summed with its round constant).
/// Returns the new `[a, e]` pair.
pub fn sha512_digest_round(
    ae: [u64; 2],
    bf: [u64; 2],
    cg: [u64; 2],
    dh: [u64; 2],
    wk0: u64,
) -> [u64; 2] {
    let [a, e] = ae;
    let [b, f] = bf;
    let [c, g] = cg;
    let [d, h] = dh;

    // T1 = Sigma1(e) + Ch(e, f, g) + wk0 + h
    let big_sigma1 = e.rotate_right(14) ^ e.rotate_right(18) ^ e.rotate_right(41);
    let choose = g ^ (e & (f ^ g));
    let t1 = big_sigma1
        .wrapping_add(choose)
        .wrapping_add(wk0)
        .wrapping_add(h);

    // T2 = Sigma0(a) + Maj(a, b, c)
    let big_sigma0 = a.rotate_right(28) ^ a.rotate_right(34) ^ a.rotate_right(39);
    let majority = (a & b) ^ (a & c) ^ (b & c);
    let t2 = big_sigma0.wrapping_add(majority);

    [t1.wrapping_add(t2), t1.wrapping_add(d)]
}
/// Process a block with the SHA-512 algorithm.
///
/// `state` holds the eight chaining words in order `a..h`; `block` holds the
/// sixteen message words of one block. The 80 rounds are fully unrolled in
/// groups of four (`rounds4!`), consuming round constants from `K64X2` two at
/// a time, and the result is added into `state` with wrapping arithmetic.
pub fn sha512_digest_block_u64(state: &mut [u64; 8], block: &[u64; 16]) {
let k = &K64X2;
// Produces the next pair of schedule words; `sha512load` stitches together
// the pair that straddles `$v4` and `$v5`.
macro_rules! schedule {
($v0:expr, $v1:expr, $v4:expr, $v5:expr, $v7:expr) => {
sha512_schedule_x2($v0, $v1, sha512load($v4, $v5), $v7)
};
}
// Runs four rounds. Each pair is stored `[higher-index, lower-index]`, so the
// second element (`t`, then `v`) is consumed before the first (`u`, then `w`).
// The four state pairs swap roles after every round instead of being moved.
macro_rules! rounds4 {
($ae:ident, $bf:ident, $cg:ident, $dh:ident, $wk0:expr, $wk1:expr) => {{
let [u, t] = $wk0;
let [w, v] = $wk1;
$dh = sha512_digest_round($ae, $bf, $cg, $dh, t);
$cg = sha512_digest_round($dh, $ae, $bf, $cg, u);
$bf = sha512_digest_round($cg, $dh, $ae, $bf, v);
$ae = sha512_digest_round($bf, $cg, $dh, $ae, w);
}};
}
// Pair up the working variables the way `sha512_digest_round` expects.
let mut ae = [state[0], state[4]];
let mut bf = [state[1], state[5]];
let mut cg = [state[2], state[6]];
let mut dh = [state[3], state[7]];
// Rounds 0..20
// The first 16 rounds take their words directly from `block`.
let (mut w1, mut w0) = ([block[3], block[2]], [block[1], block[0]]);
rounds4!(ae, bf, cg, dh, add(k[0], w0), add(k[1], w1));
let (mut w3, mut w2) = ([block[7], block[6]], [block[5], block[4]]);
rounds4!(ae, bf, cg, dh, add(k[2], w2), add(k[3], w3));
let (mut w5, mut w4) = ([block[11], block[10]], [block[9], block[8]]);
rounds4!(ae, bf, cg, dh, add(k[4], w4), add(k[5], w5));
let (mut w7, mut w6) = ([block[15], block[14]], [block[13], block[12]]);
rounds4!(ae, bf, cg, dh, add(k[6], w6), add(k[7], w7));
// From here on `w0..w9` are reused as a rotating window of schedule pairs.
let mut w8 = schedule!(w0, w1, w4, w5, w7);
let mut w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[8], w8), add(k[9], w9));
// Rounds 20..40
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[10], w0), add(k[11], w1));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[12], w2), add(k[13], w3));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[14], w4), add(k[15], w5));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[16], w6), add(k[17], w7));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[18], w8), add(k[19], w9));
// Rounds 40..60
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[20], w0), add(k[21], w1));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[22], w2), add(k[23], w3));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[24], w4), add(k[25], w5));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[26], w6), add(k[27], w7));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[28], w8), add(k[29], w9));
// Rounds 60..80
w0 = schedule!(w2, w3, w6, w7, w9);
w1 = schedule!(w3, w4, w7, w8, w0);
rounds4!(ae, bf, cg, dh, add(k[30], w0), add(k[31], w1));
w2 = schedule!(w4, w5, w8, w9, w1);
w3 = schedule!(w5, w6, w9, w0, w2);
rounds4!(ae, bf, cg, dh, add(k[32], w2), add(k[33], w3));
w4 = schedule!(w6, w7, w0, w1, w3);
w5 = schedule!(w7, w8, w1, w2, w4);
rounds4!(ae, bf, cg, dh, add(k[34], w4), add(k[35], w5));
w6 = schedule!(w8, w9, w2, w3, w5);
w7 = schedule!(w9, w0, w3, w4, w6);
rounds4!(ae, bf, cg, dh, add(k[36], w6), add(k[37], w7));
w8 = schedule!(w0, w1, w4, w5, w7);
w9 = schedule!(w1, w2, w5, w6, w8);
rounds4!(ae, bf, cg, dh, add(k[38], w8), add(k[39], w9));
// Unpair the working variables and fold them into the chaining state.
let [a, e] = ae;
let [b, f] = bf;
let [c, g] = cg;
let [d, h] = dh;
state[0] = state[0].wrapping_add(a);
state[1] = state[1].wrapping_add(b);
state[2] = state[2].wrapping_add(c);
state[3] = state[3].wrapping_add(d);
state[4] = state[4].wrapping_add(e);
state[5] = state[5].wrapping_add(f);
state[6] = state[6].wrapping_add(g);
state[7] = state[7].wrapping_add(h);
}
/// Portable SHA-512 compression: folds every 128-byte block in `blocks` into
/// `state`.
///
/// Each block is decoded into big-endian 64-bit words and handed to
/// `sha512_digest_block_u64`.
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    // since LLVM can't properly use aliasing yet it will make
    // unnecessary state stores without this copy
    let mut working_state = *state;
    let mut block_words = [0u64; BLOCK_LEN];

    for block in blocks {
        for (word, bytes) in block_words.iter_mut().zip(block.chunks_exact(8)) {
            // `chunks_exact(8)` only yields 8-byte slices, so the copy cannot panic.
            let mut raw = [0u8; 8];
            raw.copy_from_slice(bytes);
            *word = u64::from_be_bytes(raw);
        }
        sha512_digest_block_u64(&mut working_state, &block_words);
    }

    *state = working_state;
}

66
vendor/sha2/src/sha512/soft_compact.rs vendored Normal file
View File

@@ -0,0 +1,66 @@
use crate::consts::K64;
/// Decodes a 128-byte block into sixteen big-endian 64-bit words.
fn to_u64s(block: &[u8; 128]) -> [u64; 16] {
    let mut words = [0u64; 16];
    for (word, bytes) in words.iter_mut().zip(block.chunks_exact(8)) {
        // `chunks_exact(8)` only yields 8-byte slices, so the copy cannot panic.
        let mut raw = [0u8; 8];
        raw.copy_from_slice(bytes);
        *word = u64::from_be_bytes(raw);
    }
    words
}
/// Compact SHA-512 compression of a single block.
///
/// Expands the sixteen message words in `block` into an 80-word schedule, runs
/// the 80 rounds with the constants from `K64`, and adds the resulting working
/// variables into `state` with wrapping arithmetic.
fn compress_u64(state: &mut [u64; 8], block: [u64; 16]) {
    // Message schedule: the first 16 words are the block itself.
    let mut schedule = [0u64; 80];
    schedule[..16].copy_from_slice(&block);
    for t in 16..80 {
        let prev15 = schedule[t - 15];
        let prev2 = schedule[t - 2];
        let small_sigma0 = prev15.rotate_right(1) ^ prev15.rotate_right(8) ^ (prev15 >> 7);
        let small_sigma1 = prev2.rotate_right(19) ^ prev2.rotate_right(61) ^ (prev2 >> 6);
        schedule[t] = schedule[t - 16]
            .wrapping_add(small_sigma0)
            .wrapping_add(schedule[t - 7])
            .wrapping_add(small_sigma1);
    }

    // Working variables a..h, kept together so each round is one reassignment.
    let mut working = *state;
    for t in 0..80 {
        let [a, b, c, d, e, f, g, h] = working;

        let big_sigma1 = e.rotate_right(14) ^ e.rotate_right(18) ^ e.rotate_right(41);
        let choose = (e & f) ^ (!e & g);
        let t1 = big_sigma1
            .wrapping_add(choose)
            .wrapping_add(K64[t])
            .wrapping_add(schedule[t])
            .wrapping_add(h);

        let big_sigma0 = a.rotate_right(28) ^ a.rotate_right(34) ^ a.rotate_right(39);
        let majority = (a & b) ^ (a & c) ^ (b & c);
        let t2 = big_sigma0.wrapping_add(majority);

        working = [t1.wrapping_add(t2), a, b, c, d.wrapping_add(t1), e, f, g];
    }

    for (slot, &value) in state.iter_mut().zip(working.iter()) {
        *slot = slot.wrapping_add(value);
    }
}
/// Folds every 128-byte block in `blocks` into `state`, one block at a time,
/// using the compact implementation.
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    for block in blocks {
        let words = to_u64s(block);
        compress_u64(state, words);
    }
}

357
vendor/sha2/src/sha512/x86.rs vendored Normal file
View File

@@ -0,0 +1,357 @@
//! SHA-512 `x86`/`x86_64` backend
#![allow(clippy::many_single_char_names)]
use core::mem::size_of;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::consts::K64;
// Declares the `avx2_cpuid` detector for the "avx2" target feature; `compress`
// calls `avx2_cpuid::get()` to choose between the AVX2 and software paths.
cpufeatures::new!(avx2_cpuid, "avx2");
/// Folds every 128-byte block in `blocks` into the SHA-512 `state`.
///
/// Uses the AVX2 implementation when `avx2_cpuid::get()` reports the feature,
/// and the portable `super::soft` implementation otherwise.
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
    // after stabilization
    if !avx2_cpuid::get() {
        super::soft::compress(state, blocks);
        return;
    }
    // SAFETY: the runtime check above reported the "avx2" feature that
    // `sha512_compress_x86_64_avx2` is compiled for.
    unsafe {
        sha512_compress_x86_64_avx2(state, blocks);
    }
}
/// AVX2 SHA-512 compression that processes blocks two at a time.
///
/// If the number of blocks is odd, the first block is handled alone by
/// `sha512_compress_x86_64_avx`. For each remaining pair, the message schedule
/// of both blocks is computed together in 256-bit vectors: the low 128-bit half
/// (first block) goes to `ms` and is consumed immediately, while the high half
/// (second block) is stored in `t2` and replayed afterwards by
/// `process_second_block`.
///
/// # Safety
/// Compiled with `#[target_feature(enable = "avx2")]`; the caller must only
/// call this when the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
let mut start_block = 0;
// Peel off one block so that an even number remains for the paired loop.
if blocks.len() & 0b1 != 0 {
sha512_compress_x86_64_avx(state, &blocks[0]);
start_block += 1;
}
let mut ms: MsgSchedule = [_mm_setzero_si128(); 8];
let mut t2: RoundStates = [_mm_setzero_si128(); 40];
let mut x = [_mm256_setzero_si256(); 8];
for i in (start_block..blocks.len()).step_by(2) {
// Reads blocks `i` and `i + 1`; both exist because the remaining count is even.
load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
// First block
let mut current_state = *state;
rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
rounds_64_79(&mut current_state, &ms);
accumulate_state(state, &current_state);
// Second block
current_state = *state;
process_second_block(&mut current_state, &t2);
accumulate_state(state, &current_state);
}
}
/// Compresses a single 128-byte block into `state` using 128-bit vectors.
///
/// # Safety
/// Uses SIMD intrinsics without a `#[target_feature]` attribute of its own and
/// is force-inlined into its caller.
/// NOTE(review): presumably it must only be called from a context where the
/// required vector extensions are enabled, as in
/// `sha512_compress_x86_64_avx2` — confirm.
#[inline(always)]
unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
    let mut working = *state;
    let mut window = [_mm_setzero_si128(); 8];
    let mut schedule = [_mm_setzero_si128(); 8];

    // Reduced to single iteration
    load_data_avx(&mut window, &mut schedule, block.as_ptr() as *const _);
    rounds_0_63_avx(&mut working, &mut window, &mut schedule);
    rounds_64_79(&mut working, &schedule);

    accumulate_state(state, &working);
}
/// Loads one 128-byte block (eight unaligned 128-bit loads from `data`).
///
/// Each loaded vector is byte-swapped per 64-bit word and stored in `x[i]`; the
/// same vector plus the round constants `K64[2 * i]`, `K64[2 * i + 1]` is stored
/// in `ms[i]`, ready to be consumed by `sha_round`.
///
/// # Safety
/// `data` must be valid for reading eight consecutive `__m128i` values
/// (128 bytes); no alignment is required because unaligned loads are used.
#[inline(always)]
unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
// Shuffle control that reverses the byte order inside each 64-bit half.
#[allow(non_snake_case)]
let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
macro_rules! unrolled_iterations {
($($i:literal),*) => {$(
x[$i] = _mm_loadu_si128(data.add($i) as *const _);
x[$i] = _mm_shuffle_epi8(x[$i], MASK);
let y = _mm_add_epi64(
x[$i],
_mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
);
ms[$i] = y;
)*};
}
unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}
/// Loads two consecutive 128-byte blocks starting at `data`.
///
/// `x[i]` receives word pair `i` of the first block in its low 128-bit lane and
/// word pair `i` of the second block (`data.add(8 + i)`) in its high lane, both
/// byte-swapped per 64-bit word. After adding `K64[2 * i]`, `K64[2 * i + 1]` to
/// both lanes, the low lane is stored in `ms[i]` and the high lane in `t2[i]`.
///
/// # Safety
/// `data` must be valid for reading sixteen consecutive `__m128i` values
/// (256 bytes); no alignment is required because unaligned loads are used.
#[inline(always)]
unsafe fn load_data_avx2(
x: &mut [__m256i; 8],
ms: &mut MsgSchedule,
t2: &mut RoundStates,
data: *const __m128i,
) {
// Shuffle control that reverses the byte order inside each 64-bit word of
// both 128-bit lanes.
#[allow(non_snake_case)]
let MASK = _mm256_set_epi64x(
0x0809_0A0B_0C0D_0E0F_i64,
0x0001_0203_0405_0607_i64,
0x0809_0A0B_0C0D_0E0F_i64,
0x0001_0203_0405_0607_i64,
);
macro_rules! unrolled_iterations {
($($i:literal),*) => {$(
x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add(8 + $i) as *const _), 1);
x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 0);
x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
// The same pair of round constants is broadcast to both lanes.
let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
ms[$i] = _mm256_extracti128_si256(y, 0);
t2[$i] = _mm256_extracti128_si256(y, 1);
)*};
}
unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}
/// Runs rounds 0..64 for a single block while extending the message schedule.
///
/// On entry `ms` holds the sixteen `W + K` values for rounds 0..16. Each of the
/// 4 x 8 iterations computes the next schedule pair (plus its round constants),
/// executes two rounds from the current `ms` slot, and then overwrites that slot
/// with the new pair. On exit `ms` holds the values for rounds 64..80.
///
/// # Safety
/// Uses SIMD intrinsics and raw loads from `K64`.
/// NOTE(review): presumably requires the caller's target features to cover the
/// intrinsics used by `sha512_update_x_avx` — confirm.
#[inline(always)]
unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
// The first 16 constants were already added by `load_data_avx`.
let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;
for _ in 0..4 {
for j in 0..8 {
let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
let y = sha512_update_x_avx(x, k64);
{
// View the schedule as scalar words; the borrow ends before `ms[j]` is overwritten.
let ms = cast_ms(ms);
sha_round(current_state, ms[2 * j]);
sha_round(current_state, ms[2 * j + 1]);
}
ms[j] = y;
k64_idx += 2;
}
}
}
/// Runs rounds 0..64 for the first block of a pair while extending the message
/// schedule of both blocks.
///
/// Works like `rounds_0_63_avx`, but on 256-bit vectors: the low lane of each
/// new schedule value replaces `ms[j]` (first block), and the high lane is
/// stored in `t2[8 * i + j]` for the second block. `i` runs over 1..5 so that
/// the stores land in `t2[8..40]`, after the eight entries written by
/// `load_data_avx2`.
///
/// # Safety
/// Uses AVX2 intrinsics and raw loads from `K64`; the caller must ensure AVX2
/// is available.
#[inline(always)]
unsafe fn rounds_0_63_avx2(
current_state: &mut State,
x: &mut [__m256i; 8],
ms: &mut MsgSchedule,
t2: &mut RoundStates,
) {
// The first 16 constants were already added by `load_data_avx2`.
let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;
for i in 1..5 {
for j in 0..8 {
// The same pair of round constants is broadcast to both lanes.
let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));
{
// View the schedule as scalar words; the borrow ends before `ms[j]` is overwritten.
let ms = cast_ms(ms);
sha_round(current_state, ms[2 * j]);
sha_round(current_state, ms[2 * j + 1]);
}
ms[j] = _mm256_extracti128_si256(y, 0);
t2[8 * i + j] = _mm256_extracti128_si256(y, 1);
k64x4_idx += 2;
}
}
}
/// Runs the final sixteen rounds (64..80), consuming the sixteen scalar words
/// currently stored in `ms` in ascending order.
#[inline(always)]
fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
    // For rounds 64..80 the slot index `round & 0xf` is simply 0..16 in order.
    for &word_plus_k in cast_ms(ms).iter() {
        sha_round(current_state, word_plus_k);
    }
}
/// Runs every round of the second block of a pair, feeding `sha_round` the
/// scalar words stored in `t2` in order.
#[inline(always)]
fn process_second_block(current_state: &mut State, t2: &RoundStates) {
    cast_rs(t2)
        .iter()
        .copied()
        .for_each(|word_plus_k| sha_round(current_state, word_plus_k));
}
/// Applies one SHA-512 round to the working variables `s` (ordered `a..h`).
///
/// `x` is the value added alongside `h`; callers pass a message-schedule word
/// that already has its round constant added.
#[inline(always)]
fn sha_round(s: &mut State, x: u64) {
    let [a, b, c, d, e, f, g, h] = *s;

    // T1 = x + h + Sigma1(e) + Ch(e, f, g)
    let big_sigma1 = e.rotate_right(14) ^ e.rotate_right(18) ^ e.rotate_right(41);
    let choose = g ^ (e & (f ^ g));
    let t1 = x
        .wrapping_add(h)
        .wrapping_add(big_sigma1)
        .wrapping_add(choose);

    // New `a` = T1 + Sigma0(a) + Maj(a, b, c)
    let big_sigma0 = a.rotate_right(28) ^ a.rotate_right(34) ^ a.rotate_right(39);
    let majority = (a & b) ^ (a & c) ^ (b & c);
    let next_a = t1.wrapping_add(big_sigma0).wrapping_add(majority);

    // Shift every variable down one position; `e` becomes `d + T1`.
    *s = [next_a, a, b, c, d.wrapping_add(t1), e, f, g];
}
#[inline(always)]
fn accumulate_state(dst: &mut State, src: &State) {
for i in 0..SHA512_HASH_WORDS_NUM {
dst[i] = dst[i].wrapping_add(src[i]);
}
}
/// Generates a function that advances the SIMD message-schedule window by one
/// vector (two schedule words per 128-bit lane).
///
/// The generated function treats `x[0..8]` as the sixteen most recent schedule
/// words `q[0]..q[15]` (two per 128-bit lane), computes
/// `q[1:0] + q[10:9] + sigma0(q[2:1]) + sigma1(q[15:14])`, rotates the window
/// so the new vector ends up in `x[7]`, and returns that vector plus `k64`.
/// The rotations inside sigma0/sigma1 are built from shift pairs because the
/// macro is only handed shift and xor intrinsics.
macro_rules! fn_sha512_update_x {
($name:ident, $ty:ident, {
ADD64 = $ADD64:ident,
ALIGNR8 = $ALIGNR8:ident,
SRL64 = $SRL64:ident,
SLL64 = $SLL64:ident,
XOR = $XOR:ident,
}) => {
unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
// q[2:1]
let mut t0 = $ALIGNR8(x[1], x[0], 8);
// q[10:9]
let mut t3 = $ALIGNR8(x[5], x[4], 8);
// q[2:1] >> s0[0]
let mut t2 = $SRL64(t0, 1);
// q[1:0] + q[10:9]
x[0] = $ADD64(x[0], t3);
// q[2:1] >> s0[2]
t3 = $SRL64(t0, 7);
// q[2:1] << (64 - s0[1])
let mut t1 = $SLL64(t0, 64 - 8);
// (q[2:1] >> s0[2]) ^
// (q[2:1] >> s0[0])
t0 = $XOR(t3, t2);
// q[2:1] >> s0[1]
t2 = $SRL64(t2, 8 - 1);
// (q[2:1] >> s0[2]) ^
// (q[2:1] >> s0[0]) ^
// q[2:1] << (64 - s0[1])
t0 = $XOR(t0, t1);
// q[2:1] << (64 - s0[0])
t1 = $SLL64(t1, 8 - 1);
// sigma0(q[2:1])
t0 = $XOR(t0, t2);
t0 = $XOR(t0, t1);
// q[15:14] >> s1[2]
t3 = $SRL64(x[7], 6);
// q[15:14] << (64 - s1[1])
t2 = $SLL64(x[7], 64 - 61);
// q[1:0] + q[10:9] + sigma0(q[2:1])
x[0] = $ADD64(x[0], t0);
// q[15:14] >> s1[0]
t1 = $SRL64(x[7], 19);
// q[15:14] >> s1[2] ^
// q[15:14] << (64 - s1[1])
t3 = $XOR(t3, t2);
// q[15:14] << (64 - s1[0])
t2 = $SLL64(t2, 61 - 19);
// q[15:14] >> s1[2] ^
// q[15:14] << (64 - s1[1]) ^
// q[15:14] >> s1[0]
t3 = $XOR(t3, t1);
// q[15:14] >> s1[1]
t1 = $SRL64(t1, 61 - 19);
// sigma1(q[15:14])
t3 = $XOR(t3, t2);
t3 = $XOR(t3, t1);
// q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1])
x[0] = $ADD64(x[0], t3);
// rotate: the freshly computed vector moves from x[0] to x[7]
let temp = x[0];
x[0] = x[1];
x[1] = x[2];
x[2] = x[3];
x[3] = x[4];
x[4] = x[5];
x[5] = x[6];
x[6] = x[7];
x[7] = temp;
// Return the new schedule words with their round constants added.
$ADD64(x[7], k64)
}
};
}
// 128-bit instantiation: one block's schedule, two words per vector.
fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
ADD64 = _mm_add_epi64,
ALIGNR8 = _mm_alignr_epi8,
SRL64 = _mm_srli_epi64,
SLL64 = _mm_slli_epi64,
XOR = _mm_xor_si128,
});
// 256-bit instantiation: the same computation on both 128-bit lanes at once,
// used to schedule two blocks together.
fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
ADD64 = _mm256_add_epi64,
ALIGNR8 = _mm256_alignr_epi8,
SRL64 = _mm256_srli_epi64,
SLL64 = _mm256_slli_epi64,
XOR = _mm256_xor_si256,
});
/// Reinterprets the vector message schedule as its scalar `u64` words.
#[inline(always)]
fn cast_ms(ms: &MsgSchedule) -> &[u64; SHA512_BLOCK_WORDS_NUM] {
    let words = (ms as *const MsgSchedule).cast::<[u64; SHA512_BLOCK_WORDS_NUM]>();
    // SAFETY: `MsgSchedule` is `SHA512_BLOCK_WORDS_NUM / 2` 128-bit vectors, i.e.
    // exactly `SHA512_BLOCK_WORDS_NUM` 64-bit words, and the pointer comes from
    // a live shared reference whose lifetime the result inherits.
    unsafe { &*words }
}
/// Reinterprets the stored second-block vectors as their scalar `u64` words.
#[inline(always)]
fn cast_rs(rs: &RoundStates) -> &[u64; SHA512_ROUNDS_NUM] {
    let words = (rs as *const RoundStates).cast::<[u64; SHA512_ROUNDS_NUM]>();
    // SAFETY: `RoundStates` is `SHA512_ROUNDS_NUM / 2` 128-bit vectors, i.e.
    // exactly `SHA512_ROUNDS_NUM` 64-bit words, and the pointer comes from a
    // live shared reference whose lifetime the result inherits.
    unsafe { &*words }
}
// The eight 64-bit working/chaining words, ordered a..h.
type State = [u64; SHA512_HASH_WORDS_NUM];
// Sixteen schedule words (with round constants added), two per 128-bit vector.
type MsgSchedule = [__m128i; SHA512_BLOCK_WORDS_NUM / 2];
// One value per round for the second block of a pair, two per 128-bit vector.
type RoundStates = [__m128i; SHA512_ROUNDS_NUM / 2];
// Size of one input block in bytes.
const SHA512_BLOCK_BYTE_LEN: usize = 128;
// Number of rounds per block.
const SHA512_ROUNDS_NUM: usize = 80;
// Size of the hash state in bytes.
const SHA512_HASH_BYTE_LEN: usize = 64;
// State size in 64-bit words (8).
const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
// Block size in 64-bit words (16).
const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();