chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,105 @@
//! Autodetection support for AVX2 CPU intrinsics on x86 CPUs, with fallback
//! to the "soft" backend when it's unavailable.
use universal_hash::{consts::U16, crypto_common::BlockSizeUser, UniversalHash};
use crate::{backend, Block, Key, Tag};
use core::mem::ManuallyDrop;
// Generates the `avx2_cpuid` module: `init_get()` probes for AVX2 once (cached),
// and the returned `InitToken` answers `get()` cheaply thereafter.
cpufeatures::new!(avx2_cpuid, "avx2");
/// Poly1305 state that dispatches at runtime between the AVX2 backend and the
/// portable "soft" backend, based on CPU feature detection.
pub struct State {
    // Backend storage; which union field is live is determined by `token`.
    inner: Inner,
    // Cached AVX2 detection result. `token.get()` must agree with the union
    // field that was initialized in `new` — every access re-checks it.
    token: avx2_cpuid::InitToken,
}
/// Backend storage. Exactly one field is live at any time, chosen at
/// construction from the CPUID probe. `ManuallyDrop` is required because
/// union fields may not have drop glue; neither backend is individually
/// dropped here (with the `zeroize` feature, the outer `Drop` wipes the
/// whole struct's bytes instead).
union Inner {
    avx2: ManuallyDrop<backend::avx2::State>,
    soft: ManuallyDrop<backend::soft::State>,
}
impl BlockSizeUser for State {
    /// Poly1305 operates on 16-byte blocks.
    type BlockSize = U16;
}
impl State {
    /// Initialize Poly1305 [`State`] with the given key
    ///
    /// Probes for AVX2 (result cached by `cpufeatures`) and constructs the
    /// matching backend inside the union. The returned `token` records which
    /// union field was initialized; all later accesses dispatch on it.
    #[inline]
    pub(crate) fn new(key: &Key) -> State {
        let (token, avx2_present) = avx2_cpuid::init_get();

        let inner = if avx2_present {
            Inner {
                avx2: ManuallyDrop::new(backend::avx2::State::new(key)),
            }
        } else {
            Inner {
                soft: ManuallyDrop::new(backend::soft::State::new(key)),
            }
        };

        Self { inner, token }
    }

    /// Compute a Poly1305 block
    ///
    /// `partial` marks the trailing shorter-than-16-byte block; the backends
    /// treat it specially (the soft backend omits the 2^128 marker bit, the
    /// AVX2 backend defers it to finalization).
    #[inline]
    pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) {
        if self.token.get() {
            // SAFETY: the token reports AVX2, so `new` initialized the `avx2`
            // field (it is the live union member) and the CPU supports the
            // `target_feature(enable = "avx2")` contract of the callee.
            unsafe { (*self.inner.avx2).compute_block(block, partial) }
        } else {
            // SAFETY: AVX2 was not detected, so `new` initialized `soft`.
            unsafe { (*self.inner.soft).compute_block(block, partial) }
        }
    }
}
impl UniversalHash for State {
    /// Hand the live backend to the closure-based update API.
    fn update_with_backend(
        &mut self,
        f: impl universal_hash::UhfClosure<BlockSize = Self::BlockSize>,
    ) {
        if self.token.get() {
            // SAFETY: token reports AVX2, so `avx2` is the live union field
            // and the backend's AVX2 intrinsics may execute on this CPU.
            unsafe { f.call(&mut *self.inner.avx2) }
        } else {
            // SAFETY: `soft` is the live field when AVX2 was not detected.
            unsafe { f.call(&mut *self.inner.soft) }
        }
    }

    /// Finalize output producing a [`Tag`]
    #[inline]
    fn finalize(mut self) -> Tag {
        if self.token.get() {
            // SAFETY: as above. The AVX2 backend finalizes through `&mut self`.
            unsafe { (*self.inner.avx2).finalize() }
        } else {
            // SAFETY: `soft` is the live field otherwise.
            unsafe { (*self.inner.soft).finalize_mut() }
        }
    }
}
impl Clone for State {
    /// Clone only the live union field, selected by the (copyable) token.
    fn clone(&self) -> Self {
        let inner = if self.token.get() {
            Inner {
                // SAFETY: `avx2` is the initialized field when the token is set.
                avx2: ManuallyDrop::new(unsafe { (*self.inner.avx2).clone() }),
            }
        } else {
            Inner {
                // SAFETY: `soft` is the initialized field otherwise.
                soft: ManuallyDrop::new(unsafe { (*self.inner.soft).clone() }),
            }
        };

        Self {
            inner,
            token: self.token,
        }
    }
}
#[cfg(feature = "zeroize")]
impl Drop for State {
    /// Best-effort wipe of key material on drop: the whole `State` is viewed
    /// as a byte array and zeroized, sidestepping the union (the live field
    /// cannot be named generically here).
    fn drop(&mut self) {
        use zeroize::Zeroize;

        const SIZE: usize = core::mem::size_of::<State>();

        // SAFETY: we reinterpret this struct's own storage as `[u8; SIZE]`
        // purely to overwrite it; the reference does not escape this scope.
        // NOTE(review): both union fields are `ManuallyDrop`, so no backend
        // destructor is bypassed by wiping raw bytes — confirm the backends
        // own no heap allocations that would leak.
        let state = unsafe { &mut *(self as *mut State as *mut [u8; SIZE]) };
        state.zeroize();
    }
}

209
vendor/poly1305/src/backend/avx2.rs vendored Normal file
View File

@@ -0,0 +1,209 @@
//! AVX2 implementation of the Poly1305 state machine.
// The State struct and its logic were originally derived from Goll and Gueron's AVX2 C
// code:
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463)
//
// which was sourced from Bhattacharyya and Sarkar's modified variant:
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842)
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305
//
// The logic has been extensively rewritten and documented, and several bugs in the
// original C code were fixed.
//
// Note that State only implements the original Goll-Gueron algorithm, not the
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message
// length to be known, which is incompatible with the streaming API of UniversalHash.
use universal_hash::{
consts::{U16, U4},
crypto_common::{BlockSizeUser, ParBlocksSizeUser},
generic_array::GenericArray,
UhfBackend,
};
use crate::{Block, Key, Tag};
mod helpers;
use self::helpers::*;
/// Four Poly1305 blocks (64 bytes): the unit the AVX2 backend consumes per
/// parallel step (see `ParBlocksSizeUser` below).
type ParBlocks = universal_hash::ParBlocks<State>;
/// Lazily-created accumulator state; constructed the first time a full
/// 4-block chunk is processed (see `State::process_blocks`).
#[derive(Copy, Clone)]
struct Initialized {
    // Running polynomial accumulator (4 x 130-bit lanes).
    p: Aligned4x130,
    // Multiplier set used to merge the four lanes at finalization.
    m: SpacedMultiplier4x130,
    // R^4: advances the accumulator by four blocks per step.
    r4: PrecomputedMultiplier,
}
#[derive(Clone)]
pub(crate) struct State {
    // Addition key (the second half of the Poly1305 key), added to the
    // polynomial result mod 2^128 when computing the tag.
    k: AdditionKey,
    // The polynomial key R.
    r1: PrecomputedMultiplier,
    // R^2, precomputed in `new`.
    r2: PrecomputedMultiplier,
    // `None` until four full blocks have arrived (see `process_blocks`).
    initialized: Option<Initialized>,
    // Buffer of up to four 16-byte blocks awaiting a vectorized step.
    cached_blocks: [Block; 4],
    // Valid entries in `cached_blocks`; stays in 0..=3 between calls.
    num_cached_blocks: usize,
    // A trailing partial block, if supplied; folded in at finalization.
    partial_block: Option<Block>,
}
impl State {
    /// Initialize Poly1305 [`State`] with the given key
    pub(crate) fn new(key: &Key) -> Self {
        // Prepare addition key and polynomial key.
        // NOTE(review): `prepare_keys` is an unsafe (intrinsics) call inside a
        // safe fn; this relies on the backend only being constructed after
        // AVX2 detection — confirm against the autodetect wrapper.
        let (k, r1) = unsafe { prepare_keys(key) };

        // Precompute R^2.
        let r2 = (r1 * r1).reduce();

        State {
            k,
            r1,
            r2: r2.into(),
            initialized: None,
            cached_blocks: [Block::default(); 4],
            num_cached_blocks: 0,
            partial_block: None,
        }
    }

    /// Process four Poly1305 blocks at once.
    ///
    /// Fast path: only valid when no partial or cached blocks are pending
    /// (both preconditions are asserted).
    #[target_feature(enable = "avx2")]
    pub(crate) unsafe fn compute_par_blocks(&mut self, blocks: &ParBlocks) {
        assert!(self.partial_block.is_none());
        assert_eq!(self.num_cached_blocks, 0);

        self.process_blocks(Aligned4x130::from_par_blocks(blocks));
    }

    /// Compute a Poly1305 block
    ///
    /// Full blocks are buffered in `cached_blocks` and only processed once
    /// four have accumulated; a single trailing partial block is stashed and
    /// handled during finalization.
    #[target_feature(enable = "avx2")]
    pub(crate) unsafe fn compute_block(&mut self, block: &Block, partial: bool) {
        // We can cache a single partial block.
        if partial {
            assert!(self.partial_block.is_none());
            self.partial_block = Some(*block);
            return;
        }

        self.cached_blocks[self.num_cached_blocks].copy_from_slice(block);
        if self.num_cached_blocks < 3 {
            // Not enough blocks for a 4-wide step yet; keep buffering.
            self.num_cached_blocks += 1;
            return;
        } else {
            // This was the fourth block: process the batch and reset the count.
            self.num_cached_blocks = 0;
        }

        self.process_blocks(Aligned4x130::from_blocks(&self.cached_blocks));
    }

    /// Absorb four blocks into the polynomial accumulator.
    #[target_feature(enable = "avx2")]
    unsafe fn process_blocks(&mut self, blocks: Aligned4x130) {
        if let Some(inner) = &mut self.initialized {
            // P <-- R^4 * P + blocks
            inner.p = (&inner.p * inner.r4).reduce() + blocks;
        } else {
            // Initialize the polynomial.
            let p = blocks;

            // Initialize the multiplier (used to merge down the polynomial during
            // finalization).
            let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2);

            self.initialized = Some(Initialized { p, m, r4 })
        }
    }

    /// Finalize output producing a [`Tag`]
    ///
    /// Merges the 4-wide accumulator (if any), drains the 0..=3 cached full
    /// blocks (a pair first, then a single), folds in the optional partial
    /// block, and finally adds the addition key `k` mod 2^128.
    #[target_feature(enable = "avx2")]
    pub(crate) unsafe fn finalize(&mut self) -> Tag {
        assert!(self.num_cached_blocks < 4);
        let mut data = &self.cached_blocks[..];

        // T ← R◦T
        // P = T_0 + T_1 + T_2 + T_3
        let mut p = self
            .initialized
            .take()
            .map(|inner| (inner.p * inner.m).sum().reduce());

        if self.num_cached_blocks >= 2 {
            // Compute 32 byte block (remaining data < 64 bytes)
            let mut c = Aligned2x130::from_blocks(data[..2].try_into().unwrap());
            if let Some(p) = p {
                c = c + p;
            }
            p = Some(c.mul_and_sum(self.r1, self.r2).reduce());
            data = &data[2..];
            self.num_cached_blocks -= 2;
        }

        if self.num_cached_blocks == 1 {
            // Compute 16 byte block (remaining data < 32 bytes)
            let mut c = Aligned130::from_block(&data[0]);
            if let Some(p) = p {
                c = c + p;
            }
            p = Some((c * self.r1).reduce());
            self.num_cached_blocks -= 1;
        }

        if let Some(block) = &self.partial_block {
            // Compute last block (remaining data < 16 bytes)
            let mut c = Aligned130::from_partial_block(block);
            if let Some(p) = p {
                c = c + p;
            }
            p = Some((c * self.r1).reduce());
        }

        // Compute tag: p + k mod 2^128
        let mut tag = GenericArray::<u8, _>::default();
        let tag_int = if let Some(p) = p {
            self.k + p
        } else {
            // Empty message: the tag is just the addition key.
            self.k.into()
        };
        tag_int.write(tag.as_mut_slice());

        tag
    }
}
impl BlockSizeUser for State {
    /// Poly1305 operates on 16-byte blocks.
    type BlockSize = U16;
}
impl ParBlocksSizeUser for State {
    /// The AVX2 path consumes four blocks per parallel step.
    type ParBlocksSize = U4;
}
impl UhfBackend for State {
    fn proc_block(&mut self, block: &Block) {
        // SAFETY: requires AVX2 (`target_feature` contract of the callee).
        // NOTE(review): this backend is presumably only constructed after
        // feature detection in the autodetect wrapper — confirm no other
        // construction path exists.
        unsafe { self.compute_block(block, false) };
    }

    fn proc_par_blocks(&mut self, blocks: &ParBlocks) {
        if self.num_cached_blocks == 0 {
            // Fast path.
            // SAFETY: same AVX2 requirement as above; the fast-path
            // preconditions (no cached blocks) are satisfied by this branch.
            unsafe { self.compute_par_blocks(blocks) };
        } else {
            // We are unaligned; use the slow fallback.
            for block in blocks {
                self.proc_block(block);
            }
        }
    }

    fn blocks_needed_to_align(&self) -> usize {
        if self.num_cached_blocks == 0 {
            // There are no cached blocks; fast path is available.
            0
        } else {
            // There are cached blocks; report how many more we need.
            self.cached_blocks.len() - self.num_cached_blocks
        }
    }
}

File diff suppressed because it is too large Load Diff

271
vendor/poly1305/src/backend/soft.rs vendored Normal file
View File

@@ -0,0 +1,271 @@
//! Software implementation of the Poly1305 state machine.
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// This code originates from the rust-crypto project:
// <https://github.com/DaGenix/rust-crypto>
//
// ...and was originally a port of Andrew Moon's poly1305-donna
// https://github.com/floodyberry/poly1305-donna
use universal_hash::{
consts::{U1, U16},
crypto_common::{BlockSizeUser, ParBlocksSizeUser},
UhfBackend, UniversalHash,
};
use crate::{Block, Key, Tag};
/// Portable Poly1305 state using five 26-bit limbs (donna-style).
#[derive(Clone, Default)]
pub(crate) struct State {
    // Clamped polynomial key `r`, five 26-bit limbs.
    r: [u32; 5],
    // Accumulator `h`, five limbs (only partially carried between blocks).
    h: [u32; 5],
    // Second half of the key, added to the tag mod 2^128 at finalization.
    pad: [u32; 4],
}
impl State {
    /// Initialize Poly1305 [`State`] with the given key
    ///
    /// The first 16 key bytes become the clamped polynomial key `r` (five
    /// 26-bit limbs); the last 16 bytes become `pad`, added to the final tag.
    pub(crate) fn new(key: &Key) -> State {
        let mut poly = State::default();

        // r &= 0xffffffc0ffffffc0ffffffc0fffffff
        // Each limb extracts 26 bits from an overlapping 4-byte little-endian
        // window; the masks simultaneously select the limb and apply the
        // Poly1305 clamping above.
        poly.r[0] = (u32::from_le_bytes(key[0..4].try_into().unwrap())) & 0x3ff_ffff;
        poly.r[1] = (u32::from_le_bytes(key[3..7].try_into().unwrap()) >> 2) & 0x3ff_ff03;
        poly.r[2] = (u32::from_le_bytes(key[6..10].try_into().unwrap()) >> 4) & 0x3ff_c0ff;
        poly.r[3] = (u32::from_le_bytes(key[9..13].try_into().unwrap()) >> 6) & 0x3f0_3fff;
        poly.r[4] = (u32::from_le_bytes(key[12..16].try_into().unwrap()) >> 8) & 0x00f_ffff;

        poly.pad[0] = u32::from_le_bytes(key[16..20].try_into().unwrap());
        poly.pad[1] = u32::from_le_bytes(key[20..24].try_into().unwrap());
        poly.pad[2] = u32::from_le_bytes(key[24..28].try_into().unwrap());
        poly.pad[3] = u32::from_le_bytes(key[28..32].try_into().unwrap());

        poly
    }

    /// Compute a Poly1305 block
    ///
    /// h = (h + block) * r, partially reduced mod p = 2^130 - 5. When
    /// `partial` is true the 2^128 marker bit (bit 24 of the top limb) is
    /// omitted — the block is assumed to carry its own padding.
    pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) {
        let hibit = if partial { 0 } else { 1 << 24 };

        let r0 = self.r[0];
        let r1 = self.r[1];
        let r2 = self.r[2];
        let r3 = self.r[3];
        let r4 = self.r[4];

        // s_i = 5 * r_i: folds the wraparound back in, since 2^130 ≡ 5 (mod p).
        let s1 = r1 * 5;
        let s2 = r2 * 5;
        let s3 = r3 * 5;
        let s4 = r4 * 5;

        let mut h0 = self.h[0];
        let mut h1 = self.h[1];
        let mut h2 = self.h[2];
        let mut h3 = self.h[3];
        let mut h4 = self.h[4];

        // h += m
        // Same overlapping-window limb extraction as in `new`, plus the
        // high bit on the top limb for full blocks.
        h0 += (u32::from_le_bytes(block[0..4].try_into().unwrap())) & 0x3ff_ffff;
        h1 += (u32::from_le_bytes(block[3..7].try_into().unwrap()) >> 2) & 0x3ff_ffff;
        h2 += (u32::from_le_bytes(block[6..10].try_into().unwrap()) >> 4) & 0x3ff_ffff;
        h3 += (u32::from_le_bytes(block[9..13].try_into().unwrap()) >> 6) & 0x3ff_ffff;
        h4 += (u32::from_le_bytes(block[12..16].try_into().unwrap()) >> 8) | hibit;

        // h *= r
        // Schoolbook 5x5 limb multiplication in u64; terms whose limb product
        // overflows 2^130 use s_i = 5*r_i instead of r_i to pre-reduce.
        let d0 = (u64::from(h0) * u64::from(r0))
            + (u64::from(h1) * u64::from(s4))
            + (u64::from(h2) * u64::from(s3))
            + (u64::from(h3) * u64::from(s2))
            + (u64::from(h4) * u64::from(s1));

        let mut d1 = (u64::from(h0) * u64::from(r1))
            + (u64::from(h1) * u64::from(r0))
            + (u64::from(h2) * u64::from(s4))
            + (u64::from(h3) * u64::from(s3))
            + (u64::from(h4) * u64::from(s2));

        let mut d2 = (u64::from(h0) * u64::from(r2))
            + (u64::from(h1) * u64::from(r1))
            + (u64::from(h2) * u64::from(r0))
            + (u64::from(h3) * u64::from(s4))
            + (u64::from(h4) * u64::from(s3));

        let mut d3 = (u64::from(h0) * u64::from(r3))
            + (u64::from(h1) * u64::from(r2))
            + (u64::from(h2) * u64::from(r1))
            + (u64::from(h3) * u64::from(r0))
            + (u64::from(h4) * u64::from(s4));

        let mut d4 = (u64::from(h0) * u64::from(r4))
            + (u64::from(h1) * u64::from(r3))
            + (u64::from(h2) * u64::from(r2))
            + (u64::from(h3) * u64::from(r1))
            + (u64::from(h4) * u64::from(r0));

        // (partial) h %= p
        // One pass of carry propagation: each limb keeps its low 26 bits and
        // passes the high part up; the top carry re-enters at h0 times 5.
        let mut c: u32;
        c = (d0 >> 26) as u32;
        h0 = d0 as u32 & 0x3ff_ffff;
        d1 += u64::from(c);

        c = (d1 >> 26) as u32;
        h1 = d1 as u32 & 0x3ff_ffff;
        d2 += u64::from(c);

        c = (d2 >> 26) as u32;
        h2 = d2 as u32 & 0x3ff_ffff;
        d3 += u64::from(c);

        c = (d3 >> 26) as u32;
        h3 = d3 as u32 & 0x3ff_ffff;
        d4 += u64::from(c);

        c = (d4 >> 26) as u32;
        h4 = d4 as u32 & 0x3ff_ffff;
        h0 += c * 5;

        c = h0 >> 26;
        h0 &= 0x3ff_ffff;
        h1 += c;

        self.h[0] = h0;
        self.h[1] = h1;
        self.h[2] = h2;
        self.h[3] = h3;
        self.h[4] = h4;
    }

    /// Finalize output producing a [`Tag`]
    ///
    /// Fully reduces `h` mod p, conditionally (and in constant time) selects
    /// h or h - p, then returns (h + pad) mod 2^128 as little-endian bytes.
    pub(crate) fn finalize_mut(&mut self) -> Tag {
        // fully carry h
        let mut h0 = self.h[0];
        let mut h1 = self.h[1];
        let mut h2 = self.h[2];
        let mut h3 = self.h[3];
        let mut h4 = self.h[4];

        let mut c: u32;
        c = h1 >> 26;
        h1 &= 0x3ff_ffff;
        h2 += c;

        c = h2 >> 26;
        h2 &= 0x3ff_ffff;
        h3 += c;

        c = h3 >> 26;
        h3 &= 0x3ff_ffff;
        h4 += c;

        c = h4 >> 26;
        h4 &= 0x3ff_ffff;
        h0 += c * 5;

        c = h0 >> 26;
        h0 &= 0x3ff_ffff;
        h1 += c;

        // compute h + -p
        // g = h + 5 - 2^130; if this doesn't borrow, then h >= p.
        let mut g0 = h0.wrapping_add(5);
        c = g0 >> 26;
        g0 &= 0x3ff_ffff;

        let mut g1 = h1.wrapping_add(c);
        c = g1 >> 26;
        g1 &= 0x3ff_ffff;

        let mut g2 = h2.wrapping_add(c);
        c = g2 >> 26;
        g2 &= 0x3ff_ffff;

        let mut g3 = h3.wrapping_add(c);
        c = g3 >> 26;
        g3 &= 0x3ff_ffff;

        let mut g4 = h4.wrapping_add(c).wrapping_sub(1 << 26);

        // select h if h < p, or h + -p if h >= p
        // mask is all-ones when the subtraction did not borrow (h >= p) and
        // all-zeros otherwise; the selection is branch-free (constant time).
        let mut mask = (g4 >> (32 - 1)).wrapping_sub(1);
        g0 &= mask;
        g1 &= mask;
        g2 &= mask;
        g3 &= mask;
        g4 &= mask;
        mask = !mask;
        h0 = (h0 & mask) | g0;
        h1 = (h1 & mask) | g1;
        h2 = (h2 & mask) | g2;
        h3 = (h3 & mask) | g3;
        h4 = (h4 & mask) | g4;

        // h = h % (2^128)
        // Repack the five 26-bit limbs into four 32-bit words.
        h0 |= h1 << 26;
        h1 = (h1 >> 6) | (h2 << 20);
        h2 = (h2 >> 12) | (h3 << 14);
        h3 = (h3 >> 18) | (h4 << 8);

        // h = mac = (h + pad) % (2^128)
        let mut f: u64;
        f = u64::from(h0) + u64::from(self.pad[0]);
        h0 = f as u32;

        f = u64::from(h1) + u64::from(self.pad[1]) + (f >> 32);
        h1 = f as u32;

        f = u64::from(h2) + u64::from(self.pad[2]) + (f >> 32);
        h2 = f as u32;

        f = u64::from(h3) + u64::from(self.pad[3]) + (f >> 32);
        h3 = f as u32;

        let mut tag = Block::default();
        tag[0..4].copy_from_slice(&h0.to_le_bytes());
        tag[4..8].copy_from_slice(&h1.to_le_bytes());
        tag[8..12].copy_from_slice(&h2.to_le_bytes());
        tag[12..16].copy_from_slice(&h3.to_le_bytes());

        tag
    }
}
#[cfg(feature = "zeroize")]
impl Drop for State {
    /// Wipe all key-dependent material (`r`, `h`, `pad`) when the state is
    /// dropped, so secrets do not linger in freed memory.
    fn drop(&mut self) {
        use zeroize::Zeroize;

        // The three arrays are independent; wipe each in turn.
        self.pad.zeroize();
        self.h.zeroize();
        self.r.zeroize();
    }
}
impl BlockSizeUser for State {
    /// Poly1305 operates on 16-byte blocks.
    type BlockSize = U16;
}
impl ParBlocksSizeUser for State {
    /// The soft backend has no parallel path: one block at a time.
    type ParBlocksSize = U1;
}
impl UhfBackend for State {
    fn proc_block(&mut self, block: &Block) {
        // Full block: `partial = false` includes the 2^128 marker bit.
        self.compute_block(block, false);
    }
}
impl UniversalHash for State {
    fn update_with_backend(
        &mut self,
        f: impl universal_hash::UhfClosure<BlockSize = Self::BlockSize>,
    ) {
        // The soft state is itself the (serial) backend.
        f.call(self);
    }

    /// Finalize output producing a [`Tag`]
    fn finalize(mut self) -> Tag {
        // `mut self` is needed because the shared finalizer takes `&mut self`.
        self.finalize_mut()
    }
}