chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

186
vendor/utf-8/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,186 @@
mod lossy;
mod read;
pub use lossy::LossyDecoder;
pub use read::{BufReadDecoder, BufReadDecoderError};
use std::cmp;
use std::error::Error;
use std::fmt;
use std::str;
/// The replacement character, U+FFFD, as a string slice.
///
/// In lossy decoding, insert it once for every decoding error.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
/// The error returned by `decode()` when its input is not entirely well-formed UTF-8.
///
/// Either variant carries the `valid_prefix` that was successfully decoded before the
/// problem was found, so no valid input is lost.
#[derive(Debug, Copy, Clone)]
pub enum DecodeError<'a> {
/// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
/// then call `decode()` again with `remaining_input`.
Invalid {
/// The part of the input located before the invalid sequence, as a string slice.
valid_prefix: &'a str,
/// The bytes that make up the invalid sequence itself.
invalid_sequence: &'a [u8],
/// Everything after the invalid sequence; not yet examined.
remaining_input: &'a [u8],
},
/// Call the `incomplete_suffix.try_complete` method with more input when available.
/// If no more input is available, this is an invalid byte sequence.
Incomplete {
/// The part of the input located before the incomplete sequence, as a string slice.
valid_prefix: &'a str,
/// An owned copy of the trailing bytes that could not be decoded yet.
incomplete_suffix: Incomplete,
},
}
/// Human-readable rendering of a decoding error; byte sequences are shown in hexadecimal.
impl<'a> fmt::Display for DecodeError<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DecodeError::Invalid {
                valid_prefix: prefix,
                invalid_sequence: bad_bytes,
                remaining_input: rest,
            } => {
                // Only the lengths of the surrounding data are reported, not their contents.
                let prefix_len = prefix.len();
                let rest_len = rest.len();
                write!(
                    f,
                    "found invalid byte sequence {invalid_sequence:02x?} after \
                     {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
                     unprocessed bytes",
                    invalid_sequence = bad_bytes,
                    valid_byte_count = prefix_len,
                    unprocessed_byte_count = rest_len
                )
            }
            DecodeError::Incomplete {
                valid_prefix: prefix,
                incomplete_suffix: pending,
            } => {
                let prefix_len = prefix.len();
                write!(
                    f,
                    "found incomplete byte sequence {incomplete_suffix:02x?} after \
                     {valid_byte_count} bytes",
                    incomplete_suffix = pending,
                    valid_byte_count = prefix_len
                )
            }
        }
    }
}
/// Lets `DecodeError` be used as a `std::error::Error`.
/// The body is empty, so only the trait's default methods apply.
impl<'a> Error for DecodeError<'a> {}
/// A small owned buffer holding bytes that could not be decoded yet because more input
/// is needed to tell whether they form a complete code point.
///
/// `decode()` produces one from the undecodable tail of its input; feed it more bytes
/// with `try_complete`.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
/// Storage for the pending bytes; only the first `buffer_len` bytes are meaningful.
pub buffer: [u8; 4],
/// Number of bytes of `buffer` currently in use (`0` means nothing is pending).
pub buffer_len: u8,
}
/// Decodes `input` as UTF-8.
///
/// Returns the whole input as a string slice when it is entirely valid.
/// Otherwise returns a `DecodeError` holding the valid prefix together with either
/// the invalid sequence and the remaining input (`Invalid`), or a copy of the
/// truncated trailing bytes (`Incomplete`).
pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
    let utf8_error = match str::from_utf8(input) {
        Ok(text) => return Ok(text),
        Err(e) => e,
    };
    // FIXME: separate function from here to guide inlining?
    let (prefix_bytes, tail) = input.split_at(utf8_error.valid_up_to());
    // SAFETY: `valid_up_to()` is the length of the longest prefix of `input`
    // that `str::from_utf8` verified to be valid UTF-8.
    let valid_prefix = unsafe { str::from_utf8_unchecked(prefix_bytes) };
    Err(match utf8_error.error_len() {
        // A definite error: the offending bytes have a known length.
        Some(bad_len) => {
            let (invalid_sequence, remaining_input) = tail.split_at(bad_len);
            DecodeError::Invalid {
                valid_prefix,
                invalid_sequence,
                remaining_input,
            }
        }
        // The input ended in the middle of a code point.
        None => DecodeError::Incomplete {
            valid_prefix,
            incomplete_suffix: Incomplete::new(tail),
        },
    })
}
impl Incomplete {
    /// Creates an `Incomplete` with no pending bytes.
    pub fn empty() -> Self {
        Incomplete {
            buffer: [0, 0, 0, 0],
            buffer_len: 0,
        }
    }

    /// Returns `true` when no bytes are pending.
    pub fn is_empty(&self) -> bool {
        self.buffer_len == 0
    }

    /// Creates an `Incomplete` holding a copy of `bytes`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` is longer than the 4-byte internal buffer.
    pub fn new(bytes: &[u8]) -> Self {
        let mut buffer = [0, 0, 0, 0];
        let len = bytes.len();
        buffer[..len].copy_from_slice(bytes);
        Incomplete {
            buffer,
            // Cannot truncate: the slicing above already required `len <= 4`.
            buffer_len: len as u8,
        }
    }

    /// Tries to finish the pending byte sequence using the start of `input`.
    ///
    /// * `None`: still incomplete, call `try_complete` again with more input.
    ///   If no more input is available, this is an invalid byte sequence.
    /// * `Some((result, remaining_input))`: We're done with this `Incomplete`.
    ///   `result` is either the decoded string slice or the invalid bytes;
    ///   in both cases the internal buffer is left empty.
    ///   To keep decoding, pass `remaining_input` to `decode()`.
    ///
    /// # Panics
    ///
    /// May panic if the buffered bytes are not a prefix of a single, not yet complete
    /// code point (which can only happen when the public fields were filled by hand).
    pub fn try_complete<'input>(&mut self, input: &'input [u8])
        -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (consumed, opt_result) = self.try_complete_offsets(input);
        let result = opt_result?;
        let remaining_input = &input[consumed..];
        let result_bytes = self.take_buffer();
        let result = match result {
            // SAFETY: `try_complete_offsets` reports `Ok(())` only after `str::from_utf8`
            // accepted the first `buffer_len` bytes of the buffer, and `take_buffer`
            // returns exactly those bytes.
            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
            Err(()) => Err(result_bytes),
        };
        Some((result, remaining_input))
    }

    /// Returns the pending bytes and marks the buffer as empty.
    fn take_buffer(&mut self) -> &[u8] {
        let len = self.buffer_len as usize;
        self.buffer_len = 0;
        &self.buffer[..len]
    }

    /// Appends as much of `input` as fits into the buffer and re-validates it.
    ///
    /// Returns `(consumed_from_input, outcome)` where `outcome` is:
    ///
    /// * `None`: not enough input
    /// * `Some(Err(()))`: error bytes in buffer
    /// * `Some(Ok(()))`: UTF-8 string in buffer
    ///
    /// In every case `buffer_len` is updated to cover exactly the bytes that
    /// the outcome refers to.
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let initial_buffer_len = self.buffer_len as usize;
        let copied_from_input;
        {
            let unwritten = &mut self.buffer[initial_buffer_len..];
            copied_from_input = cmp::min(unwritten.len(), input.len());
            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
        }
        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
        match str::from_utf8(spliced) {
            Ok(_) => {
                // Everything that was copied in is valid and is kept.
                self.buffer_len = spliced.len() as u8;
                (copied_from_input, Some(Ok(())))
            }
            Err(error) => {
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    // Keep only the valid part; the bytes copied beyond it are handed
                    // back to the caller by reporting a smaller `consumed`.
                    let consumed = valid_up_to
                        .checked_sub(initial_buffer_len)
                        .expect("valid prefix must cover the previously buffered bytes");
                    self.buffer_len = valid_up_to as u8;
                    (consumed, Some(Ok(())))
                } else {
                    match error.error_len() {
                        Some(invalid_sequence_length) => {
                            let consumed = invalid_sequence_length
                                .checked_sub(initial_buffer_len)
                                .expect("invalid sequence must cover the previously buffered bytes");
                            self.buffer_len = invalid_sequence_length as u8;
                            (consumed, Some(Err(())))
                        }
                        None => {
                            // Still a truncated code point: keep all bytes and wait for more.
                            self.buffer_len = spliced.len() as u8;
                            (copied_from_input, None)
                        }
                    }
                }
            }
        }
    }
}

92
vendor/utf-8/src/lossy.rs vendored Normal file
View File

@@ -0,0 +1,92 @@
use super::*;
/// A push-based, lossy decoder for UTF-8.
/// Errors are replaced with the U+FFFD replacement character.
///
/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback.
///
/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`)
/// can be rewritten as:
///
/// ```rust
/// fn string_from_utf8_lossy(input: &[u8]) -> String {
///     let mut string = String::new();
///     utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input);
///     string
/// }
/// ```
///
/// **Note:** Dropping the decoder signals the end of the input:
/// If the last input chunk ended with an incomplete byte sequence for a code point,
/// this is an error and a replacement character is emitted.
/// Use `std::mem::forget` to inhibit this behavior.
pub struct LossyDecoder<F: FnMut(&str)> {
/// Callback that receives every decoded string slice and every replacement character.
push_str: F,
/// Bytes from the end of the previous chunk that did not yet form a whole code point.
incomplete: Incomplete,
}
impl<F: FnMut(&str)> LossyDecoder<F> {
/// Create a new decoder from a callback.
#[inline]
pub fn new(push_str: F) -> Self {
LossyDecoder {
push_str: push_str,
incomplete: Incomplete {
buffer: [0, 0, 0, 0],
buffer_len: 0,
},
}
}
/// Feed one chunk of input into the decoder.
///
/// The input is decoded lossily
/// and the callback called once or more with `&str` string slices.
///
/// If the UTF-8 byte sequence for one code point was split into this bytes chunk
/// and previous bytes chunks, it will be correctly pieced back together.
pub fn feed(&mut self, mut input: &[u8]) {
if self.incomplete.buffer_len > 0 {
match self.incomplete.try_complete(input) {
Some((Ok(s), remaining)) => {
(self.push_str)(s);
input = remaining
}
Some((Err(_), remaining)) => {
(self.push_str)(REPLACEMENT_CHARACTER);
input = remaining
}
None => {
return
}
}
}
loop {
match decode(input) {
Ok(s) => {
(self.push_str)(s);
return
}
Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
(self.push_str)(valid_prefix);
self.incomplete = incomplete_suffix;
return
}
Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
(self.push_str)(valid_prefix);
(self.push_str)(REPLACEMENT_CHARACTER);
input = remaining_input
}
}
}
}
}
/// Dropping the decoder marks the end of the input.
impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
    /// Emits one replacement character if bytes of an unfinished code point are still pending.
    #[inline]
    fn drop(&mut self) {
        let pending_bytes = self.incomplete.buffer_len;
        if pending_bytes != 0 {
            (self.push_str)(REPLACEMENT_CHARACTER);
        }
    }
}

167
vendor/utf-8/src/read.rs vendored Normal file
View File

@@ -0,0 +1,167 @@
use std::io::{self, BufRead};
use std::error::Error;
use std::fmt;
use std::str;
use super::*;
/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
pub struct BufReadDecoder<B: BufRead> {
/// The underlying buffered byte stream.
buf_read: B,
/// Number of bytes of the stream's buffer that have been handed out or absorbed and
/// are passed to `consume()` at the start of the next decoding step.
bytes_consumed: usize,
/// Bytes of a code point that was cut off at the end of the stream's buffer.
incomplete: Incomplete,
}
/// The error type produced by `BufReadDecoder::next_strict`.
#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
/// Represents one UTF-8 error in the byte stream.
///
/// The payload is the offending bytes.
/// In lossy decoding, each such error should be replaced with U+FFFD.
/// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
InvalidByteSequence(&'a [u8]),
/// An I/O error from the underlying byte stream
Io(io::Error),
}
impl<'a> BufReadDecoderError<'a> {
    /// Replace UTF-8 errors with U+FFFD
    ///
    /// I/O errors are passed through unchanged as `Err`.
    pub fn lossy(self) -> Result<&'static str, io::Error> {
        if let BufReadDecoderError::Io(io_error) = self {
            return Err(io_error);
        }
        // The only other variant is an invalid byte sequence.
        Ok(REPLACEMENT_CHARACTER)
    }
}
/// Human-readable rendering; invalid bytes are shown in hexadecimal.
impl<'a> fmt::Display for BufReadDecoderError<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            BufReadDecoderError::Io(io_error) => {
                write!(f, "underlying bytestream error: {}", io_error)
            }
            BufReadDecoderError::InvalidByteSequence(bad_bytes) => {
                write!(f, "invalid byte sequence: {:02x?}", bad_bytes)
            }
        }
    }
}
impl<'a> Error for BufReadDecoderError<'a> {
    /// Returns the wrapped I/O error, if any; invalid byte sequences have no source.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match self {
            BufReadDecoderError::Io(io_error) => Some(io_error),
            BufReadDecoderError::InvalidByteSequence(_) => None,
        }
    }
}
impl<B: BufRead> BufReadDecoder<B> {
/// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
///
/// Decodes all of `buf_read` lossily into a new `String`.
/// The first I/O error reported by the stream is returned as `Err`.
pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
let mut decoder = Self::new(buf_read);
let mut string = String::new();
while let Some(result) = decoder.next_lossy() {
string.push_str(result?)
}
Ok(string)
}
/// Creates a decoder that reads from `buf_read`, with nothing consumed or pending yet.
pub fn new(buf_read: B) -> Self {
Self {
buf_read,
bytes_consumed: 0,
incomplete: Incomplete::empty(),
}
}
/// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
///
/// I/O errors are still returned as `Err`.
pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
self.next_strict().map(|result| result.or_else(|e| e.lossy()))
}
/// Decode and consume the next chunk of UTF-8 input.
///
/// This method is intended to be called repeatedly until it returns `None`,
/// which represents EOF from the underlying byte stream.
/// This is similar to `Iterator::next`,
/// except that decoded chunks borrow the decoder (~iterator)
/// so they need to be handled or copied before the next chunk can start decoding.
///
/// Each call yields either one run of valid UTF-8, one invalid byte sequence,
/// or an I/O error from the underlying stream.
pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
// Where the bytes of the returned chunk live.
enum BytesSource {
// The first `usize` bytes of the stream's buffer.
BufRead(usize),
// The decoder's own `incomplete` buffer.
Incomplete,
}
// Unwraps an `io::Result`, returning the I/O error from `next_strict` on failure.
macro_rules! try_io {
($io_result: expr) => {
match $io_result {
Ok(value) => value,
Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
}
}
}
let (source, result) = loop {
// Bytes handed out by the previous call (or absorbed by the previous iteration)
// are only released here, because the returned chunk borrowed the stream's buffer.
if self.bytes_consumed > 0 {
self.buf_read.consume(self.bytes_consumed);
self.bytes_consumed = 0;
}
let buf = try_io!(self.buf_read.fill_buf());
// Force loop iteration to go through an explicit `continue`:
// the `if`/`else` below must produce a value of the uninhabited `Unreachable` type,
// so every path has to `break`, `continue` or `return`.
enum Unreachable {}
let _: Unreachable = if self.incomplete.is_empty() {
if buf.is_empty() {
return None // EOF
}
match str::from_utf8(buf) {
Ok(_) => {
// The whole buffer is valid.
break (BytesSource::BufRead(buf.len()), Ok(()))
}
Err(error) => {
let valid_up_to = error.valid_up_to();
if valid_up_to > 0 {
// Yield the valid prefix now; the error is found again on the next call.
break (BytesSource::BufRead(valid_up_to), Ok(()))
}
match error.error_len() {
Some(invalid_sequence_length) => {
break (BytesSource::BufRead(invalid_sequence_length), Err(()))
}
None => {
// The buffer holds only the start of a code point: stash it and
// mark the whole buffer as consumed.
self.bytes_consumed = buf.len();
self.incomplete = Incomplete::new(buf);
// need more input bytes
continue
}
}
}
}
} else {
if buf.is_empty() {
break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point
}
let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
self.bytes_consumed = consumed;
match opt_result {
None => {
// need more input bytes
continue
}
Some(result) => {
break (BytesSource::Incomplete, result)
}
}
};
};
let bytes = match source {
BytesSource::BufRead(byte_count) => {
self.bytes_consumed = byte_count;
// The buffer is fetched a second time rather than reusing `buf` from the loop.
// NOTE(review): presumably this returns the same bytes that were validated above,
// since nothing was consumed in between; confirm against the `BufRead` contract.
let buf = try_io!(self.buf_read.fill_buf());
&buf[..byte_count]
}
BytesSource::Incomplete => {
self.incomplete.take_buffer()
}
};
match result {
// `Ok(())` is only produced above after `str::from_utf8` accepted these bytes
// (directly, or inside `try_complete_offsets`).
Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
}
}
}