chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

132
vendor/utf8parse/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,132 @@
//! A table-driven UTF-8 Parser
//!
//! This module implements a table-driven UTF-8 parser which should
//! theoretically contain the minimal number of branches (1). The only branch is
//! on the `Action` returned from unpacking a transition.
#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)]
#![cfg_attr(all(feature = "nightly", test), feature(test))]
#![no_std]
use core::char;
mod types;
use types::{Action, State};
/// Sink for the events produced while a byte stream is being decoded.
///
/// An implementor is handed to `Parser::advance`, which calls back into it
/// each time a codepoint is completed or an invalid sequence is detected.
pub trait Receiver {
    /// Invoked with every codepoint that was decoded successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked when the bytes received cannot form a valid UTF-8 sequence.
    fn invalid_sequence(&mut self);
}
/// An incremental, byte-at-a-time decoder for UTF-8 text.
///
/// Feed bytes to `advance` one by one; each completed codepoint and each
/// invalid sequence is reported to the `Receiver` passed to that call.
#[derive(Clone, Default, PartialEq, Eq, Debug)]
pub struct Parser {
/// Bits of the codepoint gathered so far from the sequence in progress;
/// reset to zero after every emitted codepoint and every invalid sequence.
point: u32,
/// State-machine position that decides how the next byte is interpreted.
state: State,
}
/// Mask that keeps the six payload bits of a continuation byte (`10xx_xxxx`).
const CONTINUATION_MASK: u8 = 0b0011_1111;

impl Parser {
    /// Builds a parser that sits in the ground state with no codepoint pending.
    pub fn new() -> Parser {
        Self { state: State::Ground, point: 0 }
    }

    /// Feeds a single byte to the parser.
    ///
    /// The provided `receiver` is called back when this byte completes a
    /// codepoint or turns out to be invalid in the current state; bytes that
    /// merely extend a pending sequence produce no callback.
    ///
    /// A byte that aborts a pending sequence is consumed together with that
    /// sequence: it is reported as invalid and is *not* re-examined as the
    /// possible start of a new sequence.
    pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
    where
        R: Receiver,
    {
        let (next_state, action) = self.state.advance(byte);
        // The action runs before the new state is stored, so the state is
        // left untouched should the receiver callback unwind.
        self.perform_action(receiver, byte, action);
        self.state = next_state;
    }

    /// Applies `action` for `byte`: either merges the byte's payload bits into
    /// the pending codepoint or notifies `receiver`.
    fn perform_action<R: Receiver>(&mut self, receiver: &mut R, byte: u8, action: Action) {
        // Payload of `byte` when it is interpreted as a continuation byte.
        let tail_bits = u32::from(byte & CONTINUATION_MASK);

        match action {
            Action::EmitByte => receiver.codepoint(char::from(byte)),
            Action::InvalidSequence => {
                // Drop whatever was gathered so the next sequence starts clean.
                self.point = 0;
                receiver.invalid_sequence();
            },
            // Lead bytes: only the bits below the length marker carry payload.
            Action::SetByte4 => self.point |= u32::from(byte & 0b0000_0111) << 18,
            Action::SetByte3Top => self.point |= u32::from(byte & 0b0000_1111) << 12,
            Action::SetByte2Top => self.point |= u32::from(byte & 0b0001_1111) << 6,
            // Continuation bytes that are not the last one of their sequence.
            Action::SetByte3 => self.point |= tail_bits << 12,
            Action::SetByte2 => self.point |= tail_bits << 6,
            // Last continuation byte: the codepoint is complete.
            Action::SetByte1 => {
                let scalar = self.point | tail_bits;
                // SAFETY: `State::advance` yields `SetByte1` only from
                // `Tail1`, and every path into `Tail1` passes through byte
                // ranges that exclude the surrogate block (`ED A0..=BF`) and
                // everything above U+10FFFF (`F4 90..`, lead bytes `F5..`).
                // `point` is zeroed after each emission and each invalid
                // sequence, so `scalar` holds bits of this sequence only and
                // is a valid Unicode scalar value.
                let decoded = unsafe { char::from_u32_unchecked(scalar) };
                self.point = 0;
                receiver.codepoint(decoded);
            },
        }
    }
}
#[cfg(all(feature = "nightly", test))]
mod benches {
    // These benchmarks rely on the unstable `test` crate, hence the `nightly`
    // feature gate on the module.
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    /// Sample document decoded by both benchmarks.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver that throws every event away; `black_box` keeps the optimizer
    // from removing the decoding work.
    impl Receiver for () {
        fn codepoint(&mut self, decoded: char) {
            black_box(decoded);
        }

        fn invalid_sequence(&mut self) {}
    }

    /// Decodes the sample document byte by byte with `Parser`.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();
        b.iter(|| {
            UTF8_DEMO.iter().for_each(|&byte| parser.advance(&mut (), byte));
        })
    }

    /// Baseline: validates and iterates the same document with the standard library.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            text.chars().for_each(|c| {
                black_box(c);
            });
        });
    }
}

100
vendor/utf8parse/src/types.rs vendored Normal file
View File

@@ -0,0 +1,100 @@
//! Types supporting the UTF-8 parser
/// What the parser must do with the byte that triggered a state transition.
///
/// `State::advance` pairs one of these with every next state; `Parser`
/// carries it out against its partially assembled codepoint.
#[derive(Debug, Copy, Clone)]
pub enum Action {
/// The byte is not permitted in the current state: the pending codepoint is
/// discarded and the receiver is told about the invalid sequence.
InvalidSequence = 0,
/// The byte is 7-bit ASCII (`0x00..=0x7f`) and is emitted as a codepoint as is.
EmitByte = 1,
/// Final continuation byte: its low six bits become bits 0-5 of the
/// codepoint, which is then complete and emitted.
SetByte1 = 2,
/// Continuation byte in the second-from-last position: its low six bits
/// become bits 6-11 of the codepoint.
SetByte2 = 3,
/// Lead byte of a two-byte sequence: its low five bits become bits 6-10 of
/// the codepoint.
SetByte2Top = 4,
/// Continuation byte in the third-from-last position: its low six bits
/// become bits 12-17 of the codepoint.
SetByte3 = 5,
/// Lead byte of a three-byte sequence: its low four bits become bits 12-15
/// of the codepoint.
SetByte3Top = 6,
/// Lead byte of a four-byte sequence: its low three bits become bits 18-20
/// of the codepoint.
SetByte4 = 7,
}
/// Position of the parser inside a (possibly partial) UTF-8 sequence.
///
/// Besides the generic "N continuation bytes left" states there is a dedicated
/// state for each lead byte (`E0`, `ED`, `F0`, `F4`) whose *first* continuation
/// byte is restricted to a narrower range than the usual `0x80..=0xbf`.
// The variant names embed the lead byte in lowercase hex (e.g. `U3_2_e0`),
// which is why the camel-case lint is silenced for this enum.
#[allow(non_camel_case_types)]
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub enum State {
/// No sequence in progress; the next byte may be ASCII or a valid lead byte.
/// This is the default state.
#[default]
Ground = 0,
/// Three continuation bytes (`0x80..=0xbf`) are still expected.
Tail3 = 1,
/// Two continuation bytes (`0x80..=0xbf`) are still expected.
Tail2 = 2,
/// One continuation byte (`0x80..=0xbf`) is still expected; it completes
/// the codepoint.
Tail1 = 3,
/// Three-byte sequence led by `E0`: the next byte must be in `0xa0..=0xbf`
/// (lower values would be an overlong encoding); one tail byte follows.
U3_2_e0 = 4,
/// Three-byte sequence led by `ED`: the next byte must be in `0x80..=0x9f`
/// (higher values would encode a UTF-16 surrogate); one tail byte follows.
U3_2_ed = 5,
/// Four-byte sequence led by `F0`: the next byte must be in `0x90..=0xbf`
/// (lower values would be an overlong encoding); two tail bytes follow.
Utf8_4_3_f0 = 6,
/// Four-byte sequence led by `F4`: the next byte must be in `0x80..=0x8f`
/// (higher values would exceed U+10FFFF); two tail bytes follow.
Utf8_4_3_f4 = 7,
}
impl State {
/// Advance the parser state.
///
/// This takes the current state and input byte into consideration, to determine the next state
/// and any action that should be taken.
#[inline]
pub fn advance(self, byte: u8) -> (State, Action) {
match self {
State::Ground => match byte {
0x00..=0x7f => (State::Ground, Action::EmitByte),
0xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
0xe0 => (State::U3_2_e0, Action::SetByte3Top),
0xe1..=0xec => (State::Tail2, Action::SetByte3Top),
0xed => (State::U3_2_ed, Action::SetByte3Top),
0xee..=0xef => (State::Tail2, Action::SetByte3Top),
0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
0xf1..=0xf3 => (State::Tail3, Action::SetByte4),
0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
_ => (State::Ground, Action::InvalidSequence),
},
State::U3_2_e0 => match byte {
0xa0..=0xbf => (State::Tail1, Action::SetByte2),
_ => (State::Ground, Action::InvalidSequence),
},
State::U3_2_ed => match byte {
0x80..=0x9f => (State::Tail1, Action::SetByte2),
_ => (State::Ground, Action::InvalidSequence),
},
State::Utf8_4_3_f0 => match byte {
0x90..=0xbf => (State::Tail2, Action::SetByte3),
_ => (State::Ground, Action::InvalidSequence),
},
State::Utf8_4_3_f4 => match byte {
0x80..=0x8f => (State::Tail2, Action::SetByte3),
_ => (State::Ground, Action::InvalidSequence),
},
State::Tail3 => match byte {
0x80..=0xbf => (State::Tail2, Action::SetByte3),
_ => (State::Ground, Action::InvalidSequence),
},
State::Tail2 => match byte {
0x80..=0xbf => (State::Tail1, Action::SetByte2),
_ => (State::Ground, Action::InvalidSequence),
},
State::Tail1 => match byte {
0x80..=0xbf => (State::Ground, Action::SetByte1),
_ => (State::Ground, Action::InvalidSequence),
},
}
}
}