chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

120
vendor/utf8_iter/src/indices.rs vendored Normal file
View File

@@ -0,0 +1,120 @@
// The code in this file was adapted from the CharIndices implementation of
// the Rust standard library at revision ab32548539ec38a939c1b58599249f3b54130026
// (https://github.com/rust-lang/rust/blob/ab32548539ec38a939c1b58599249f3b54130026/library/core/src/str/iter.rs).
//
// Excerpt from https://github.com/rust-lang/rust/blob/ab32548539ec38a939c1b58599249f3b54130026/COPYRIGHT ,
// which refers to
// https://github.com/rust-lang/rust/blob/ab32548539ec38a939c1b58599249f3b54130026/LICENSE-APACHE
// and
// https://github.com/rust-lang/rust/blob/ab32548539ec38a939c1b58599249f3b54130026/LICENSE-MIT
// :
//
// For full authorship information, see the version control history or
// https://thanks.rust-lang.org
//
// Except as otherwise noted (below and/or in individual files), Rust is
// licensed under the Apache License, Version 2.0 <LICENSE-APACHE> or
// <http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT> or <http://opensource.org/licenses/MIT>, at your option.
use super::Utf8Chars;
use core::iter::FusedIterator;
/// An iterator over the [`char`]s and their positions.
///
/// Wraps a [`Utf8Chars`] and pairs every yielded `char` with the byte
/// offset at which it starts in the slice the iterator was created from.
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Utf8CharIndices<'a> {
// Byte offset, counted from the start of the original slice, of the next
// item that forward iteration will yield. Starts at 0 and grows by the
// number of bytes consumed on every `next()`.
front_offset: usize,
// The underlying `char` iterator over the bytes not yet consumed.
iter: Utf8Chars<'a>,
}
impl<'a> Iterator for Utf8CharIndices<'a> {
    type Item = (usize, char);

    /// Yields the next `char` together with the byte offset at which it
    /// started, or `None` once the input is exhausted.
    #[inline]
    fn next(&mut self) -> Option<(usize, char)> {
        let remaining_before = self.as_slice().len();
        let ch = self.iter.next()?;
        let remaining_after = self.as_slice().len();
        // The item just yielded began where the front cursor was; move the
        // cursor past however many bytes the inner iterator consumed.
        let start = self.front_offset;
        self.front_offset = start + (remaining_before - remaining_after);
        Some((start, ch))
    }

    /// Counts the remaining items by delegating to the inner iterator;
    /// offsets are irrelevant for counting.
    #[inline]
    fn count(self) -> usize {
        let Self { iter, .. } = self;
        iter.count()
    }

    /// Forwards the inner iterator's size hint unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let inner = &self.iter;
        inner.size_hint()
    }

    /// Returns the final item by taking it from the back instead of
    /// walking the whole input from the front.
    #[inline]
    fn last(mut self) -> Option<(usize, char)> {
        self.next_back()
    }
}
impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
    /// Yields the last remaining `char` together with the byte offset at
    /// which it starts, or `None` once the input is exhausted.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, char)> {
        let ch = self.iter.next_back()?;
        // The inner iterator has already dropped the trailing item, so the
        // bytes that are still remaining end exactly where that item began.
        let start = self.front_offset + self.as_slice().len();
        Some((start, ch))
    }
}
/// Marks the iterator as fused: after `next()` has returned `None` it
/// keeps returning `None`.
impl<'a> FusedIterator for Utf8CharIndices<'a> {}
impl<'a> Utf8CharIndices<'a> {
    /// Creates the iterator from a byte slice.
    ///
    /// Offsets reported by the iterator are counted from the start of
    /// `bytes`.
    #[inline(always)]
    pub fn new(bytes: &'a [u8]) -> Self {
        let iter = Utf8Chars::new(bytes);
        Self {
            front_offset: 0,
            iter,
        }
    }

    /// Returns the not-yet-consumed part of the original slice.
    ///
    /// The returned slice borrows from the original data (lifetime `'a`)
    /// rather than from the iterator, so the iterator can continue to be
    /// used while the slice is held.
    #[inline]
    #[must_use]
    pub fn as_slice(&self) -> &'a [u8] {
        let inner = &self.iter;
        inner.as_slice()
    }

    /// Returns the byte position of the next character, or the length
    /// of the underlying string if there are no more characters.
    ///
    /// # Examples
    ///
    /// ```
    /// use utf8_iter::Utf8CharsEx;
    /// let mut chars = "a楽".as_bytes().char_indices();
    ///
    /// assert_eq!(chars.offset(), 0);
    /// assert_eq!(chars.next(), Some((0, 'a')));
    ///
    /// assert_eq!(chars.offset(), 1);
    /// assert_eq!(chars.next(), Some((1, '楽')));
    ///
    /// assert_eq!(chars.offset(), 4);
    /// assert_eq!(chars.next(), None);
    /// ```
    #[must_use]
    #[inline]
    pub fn offset(&self) -> usize {
        self.front_offset
    }
}

282
vendor/utf8_iter/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,282 @@
// Copyright Mozilla Foundation
//
// Licensed under the Apache License (Version 2.0), or the MIT license,
// (the "Licenses") at your option. You may not use this file except in
// compliance with one of the Licenses. You may obtain copies of the
// Licenses at:
//
// https://www.apache.org/licenses/LICENSE-2.0
// https://opensource.org/licenses/MIT
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licenses is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licenses for the specific language governing permissions and
// limitations under the Licenses.
#![no_std]
//! Provides iteration by `char` over `&[u8]` containing potentially-invalid
//! UTF-8 such that errors are handled according to the [WHATWG Encoding
//! Standard](https://encoding.spec.whatwg.org/#utf-8-decoder) (i.e. the same
//! way as in `String::from_utf8_lossy`).
//!
//! The trait `Utf8CharsEx` provides the convenience method `chars()` on
//! byte slices themselves instead of having to use the more verbose
//! `Utf8Chars::new(slice)`.
//!
//! ```rust
//! use utf8_iter::Utf8CharsEx;
//! let data = b"\xFF\xC2\xE2\xE2\x98\xF0\xF0\x9F\xF0\x9F\x92\xE2\x98\x83";
//! let from_iter: String = data.chars().collect();
//! let from_std = String::from_utf8_lossy(data);
//! assert_eq!(from_iter, from_std);
//! ```
mod indices;
mod report;
pub use crate::indices::Utf8CharIndices;
pub use crate::report::ErrorReportingUtf8Chars;
pub use crate::report::Utf8CharsError;
use core::iter::FusedIterator;
// Wrapper type whose only purpose is to give the lookup table below a
// 64-byte alignment via `#[repr(align(64))]`.
#[repr(align(64))] // Align to cache lines
struct Utf8Data {
// 384 one-byte entries. The decoders in this crate index the first 256
// entries with the second byte of a sequence and the last 128 entries
// with `lead byte + 0x80` (only used for lead bytes >= 0x80).
pub table: [u8; 384],
}
// This is generated code copied and pasted from utf_8.rs of encoding_rs.
// Please don't edit by hand but instead regenerate as instructed in that
// file.
//
// How the table is used by the decoders in this crate: the entry for the
// second byte (`table[second]`) is ANDed with the entry for the lead byte
// (`table[lead + 0x80]`); a non-zero result makes the fast path bail out.
//
// Layout of the values below:
// * `table[0x00..=0xFF]`, by second byte: 252 for 0x00..=0x7F and
//   0xC0..=0xFF (not continuation bytes), 84 for 0x80..=0x8F, 148 for
//   0x90..=0x9F, 164 for 0xA0..=0xBF.
// * `table[0x100..=0x17F]`, by lead byte 0x80..=0xFF: 4 for 0x80..=0xC1 and
//   0xF5..=0xFF, 16 for 0xE0, 32 for 0xED, 64 for 0xF0, 128 for 0xF4, and 8
//   for every other lead byte.
static UTF8_DATA: Utf8Data = Utf8Data {
table: [
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
],
};
// End manually copypasted generated code.
/// Returns `true` when `start <= i <= end`, using a single comparison.
///
/// Subtracting `start` with wrap-around maps every value below `start` to a
/// large number, so one unsigned comparison against the width of the range
/// covers both bounds. Callers must pass `start <= end`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let width = end - start;
    let distance = i.wrapping_sub(start);
    distance <= width
}
/// Iterator by `char` over `&[u8]` that contains
/// potentially-invalid UTF-8. See the crate documentation.
///
/// Ill-formed input is reported as U+FFFD REPLACEMENT CHARACTER items.
#[derive(Debug, Clone)]
pub struct Utf8Chars<'a> {
// The bytes that have not been consumed yet. Forward iteration trims this
// slice from the front, backward iteration trims it from the back.
remaining: &'a [u8],
}
impl<'a> Utf8Chars<'a> {
    /// Creates the iterator from a byte slice.
    #[inline(always)]
    pub fn new(bytes: &'a [u8]) -> Self {
        Self { remaining: bytes }
    }

    /// Returns the bytes that have not been consumed yet, as a subslice of
    /// the original slice.
    #[inline(always)]
    pub fn as_slice(&self) -> &'a [u8] {
        self.remaining
    }

    /// Slow path of `next()`: decodes one item byte by byte.
    ///
    /// Handles inputs shorter than four bytes and every ill-formed
    /// sequence. An ill-formed sequence consumes its longest valid prefix
    /// (at least one byte) and is reported as a single U+FFFD.
    #[inline(never)]
    fn next_fallback(&mut self) -> Option<char> {
        let bytes = self.remaining;
        let lead = *bytes.first()?;
        if lead < 0x80 {
            self.remaining = &bytes[1..];
            return Some(char::from(lead));
        }

        // Range the second byte must fall in; it is narrower than
        // 0x80..=0xBF for the lead bytes that could otherwise encode
        // overlong forms, surrogates or values above U+10FFFF.
        let (second_min, second_max) = match lead {
            0xE0 => (0xA0, 0xBF),
            0xED => (0x80, 0x9F),
            0xF0 => (0x90, 0xBF),
            0xF4 => (0x80, 0x8F),
            _ => (0x80, 0xBF),
        };
        let second = match bytes.get(1) {
            Some(&byte)
                if in_inclusive_range8(lead, 0xC2, 0xF4)
                    && in_inclusive_range8(byte, second_min, second_max) =>
            {
                byte
            }
            // Invalid lead byte, truncated input or unacceptable second
            // byte: only the lead byte is consumed.
            _ => {
                self.remaining = &bytes[1..];
                return Some('\u{FFFD}');
            }
        };
        if lead < 0xE0 {
            self.remaining = &bytes[2..];
            let point = (u32::from(lead & 0x1F) << 6) | u32::from(second & 0x3F);
            // SAFETY: `lead` is in 0xC2..=0xDF and `second` in 0x80..=0xBF,
            // so `point` is in 0x80..=0x7FF, which contains no surrogates.
            return Some(unsafe { char::from_u32_unchecked(point) });
        }

        let third = match bytes.get(2) {
            Some(&byte) if in_inclusive_range8(byte, 0x80, 0xBF) => byte,
            // Truncated input or a non-continuation third byte: the valid
            // two-byte prefix is consumed.
            _ => {
                self.remaining = &bytes[2..];
                return Some('\u{FFFD}');
            }
        };
        self.remaining = &bytes[3..];
        if lead < 0xF0 {
            let point = (u32::from(lead & 0xF) << 12)
                | (u32::from(second & 0x3F) << 6)
                | u32::from(third & 0x3F);
            // SAFETY: `lead` is in 0xE0..=0xEF and the second-byte range
            // chosen above excludes overlong forms (0xE0) and surrogates
            // (0xED), so `point` is a scalar value in 0x800..=0xFFFF.
            return Some(unsafe { char::from_u32_unchecked(point) });
        }
        // At this point, we have a valid 3-byte prefix of a
        // four-byte sequence that has to be incomplete, because
        // otherwise `next()` would have succeeded.
        Some('\u{FFFD}')
    }
}
impl<'a> Iterator for Utf8Chars<'a> {
    type Item = char;

    /// Decodes and returns the next `char`, or `None` when no bytes remain.
    ///
    /// The loop below is the fast path for well-formed input with at least
    /// four bytes remaining; everything else (short tails and ill-formed
    /// sequences) is handed to `next_fallback()`.
    #[inline]
    fn next(&mut self) -> Option<char> {
        // Not delegating directly to `ErrorReportingUtf8Chars` to avoid
        // an extra branch in the common case based on a cursory inspection
        // of generated code in a similar case. Be sure to inspect the
        // generated code as inlined into an actual usage site carefully
        // if attempting to consolidate the source code here.

        // This loop is only broken out of as goto forward
        #[allow(clippy::never_loop)]
        loop {
            if self.remaining.len() < 4 {
                break;
            }
            let first = self.remaining[0];
            if first < 0x80 {
                self.remaining = &self.remaining[1..];
                return Some(char::from(first));
            }
            let second = self.remaining[1];
            if in_inclusive_range8(first, 0xC2, 0xDF) {
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break;
                }
                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
                self.remaining = &self.remaining[2..];
                // SAFETY: `first` is in 0xC2..=0xDF and `second` in
                // 0x80..=0xBF, so `point` is in 0x80..=0x7FF, which
                // contains no surrogates.
                return Some(unsafe { char::from_u32_unchecked(point) });
            }
            // This table-based formulation was benchmark-based in encoding_rs,
            // but it hasn't been re-benchmarked in this iterator context.
            let third = self.remaining[2];
            if first < 0xF0 {
                // The expression equals 2 only when the two table entries
                // have no bit in common (lead/second pair accepted) and
                // `third` is a continuation byte (`third >> 6 == 2`).
                if ((UTF8_DATA.table[usize::from(second)]
                    & UTF8_DATA.table[usize::from(first) + 0x80])
                    | (third >> 6))
                    != 2
                {
                    break;
                }
                let point = ((u32::from(first) & 0xF) << 12)
                    | ((u32::from(second) & 0x3F) << 6)
                    | (u32::from(third) & 0x3F);
                self.remaining = &self.remaining[3..];
                // SAFETY: the table check above only accepts lead bytes
                // 0xE0..=0xEF with a second byte that rules out overlong
                // forms and surrogates, and `third` is a continuation
                // byte, so `point` is a scalar value in 0x800..=0xFFFF.
                return Some(unsafe { char::from_u32_unchecked(point) });
            }
            let fourth = self.remaining[3];
            // Equals 0x202 only when the lead/second pair is accepted by
            // the table and both `third` and `fourth` are continuation
            // bytes.
            if (u16::from(
                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
            ) | u16::from(third >> 6)
                | (u16::from(fourth & 0xC0) << 2))
                != 0x202
            {
                break;
            }
            let point = ((u32::from(first) & 0x7) << 18)
                | ((u32::from(second) & 0x3F) << 12)
                | ((u32::from(third) & 0x3F) << 6)
                | (u32::from(fourth) & 0x3F);
            self.remaining = &self.remaining[4..];
            // SAFETY: the table check above only accepts lead bytes
            // 0xF0..=0xF4 with a second byte that keeps the value within
            // 0x10000..=0x10FFFF, and the trailing bytes are continuation
            // bytes, so `point` is a valid scalar value.
            return Some(unsafe { char::from_u32_unchecked(point) });
        }
        self.next_fallback()
    }

    /// Returns bounds on the number of remaining items.
    ///
    /// Every item, whether a decoded `char` or a U+FFFD for an ill-formed
    /// sequence, consumes at least one and at most four bytes, so `len`
    /// remaining bytes yield between `ceil(len / 4)` and `len` items.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.remaining.len();
        // `len + 3` cannot overflow: slice lengths never exceed `isize::MAX`.
        ((len + 3) / 4, Some(len))
    }
}
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
    /// Decodes and returns the last remaining `char`, or `None` when no
    /// bytes remain.
    ///
    /// Looks for a non-continuation byte among the last four bytes and
    /// forward-decodes the tail that starts there. The result is used only
    /// if that single forward step consumes the whole tail; otherwise the
    /// final byte alone is reported as U+FFFD.
    #[inline]
    fn next_back(&mut self) -> Option<char> {
        let bytes = self.remaining;
        if bytes.is_empty() {
            return None;
        }
        // Zero-based distance from the end of the nearest byte that is not
        // a continuation byte (`0b10xx_xxxx`), searching at most four bytes.
        let lead_from_end = bytes
            .iter()
            .rev()
            .take(4)
            .position(|&byte| byte & 0xC0 != 0x80);
        if let Some(distance) = lead_from_end {
            let (head, tail) = bytes.split_at(bytes.len() - (distance + 1));
            let mut inner = Utf8Chars::new(tail);
            let candidate = inner.next();
            if inner.as_slice().is_empty() {
                self.remaining = head;
                return candidate;
            }
        }
        self.remaining = &bytes[..bytes.len() - 1];
        Some('\u{FFFD}')
    }
}
/// Marks the iterator as fused: once the remaining slice is empty,
/// `next()` keeps returning `None`.
impl<'a> FusedIterator for Utf8Chars<'a> {}
/// Convenience trait that adds `chars()` and `char_indices()` methods
/// similar to the ones on string slices to byte slices.
pub trait Utf8CharsEx {
/// Returns an iterator over the `char`s decoded from `self`.
fn chars(&self) -> Utf8Chars<'_>;
/// Returns an iterator over the `char`s decoded from `self`, each paired
/// with the byte offset at which it starts.
fn char_indices(&self) -> Utf8CharIndices<'_>;
}
// Implements the convenience methods for byte slices by forwarding to the
// `new` constructors of the two iterator types.
impl Utf8CharsEx for [u8] {
/// Convenience method for creating an UTF-8 iterator
/// for the slice.
///
/// Equivalent to `Utf8Chars::new(self)`.
#[inline]
fn chars(&self) -> Utf8Chars<'_> {
Utf8Chars::new(self)
}
/// Convenience method for creating a byte index and
/// UTF-8 iterator for the slice.
///
/// Equivalent to `Utf8CharIndices::new(self)`.
#[inline]
fn char_indices(&self) -> Utf8CharIndices<'_> {
Utf8CharIndices::new(self)
}
}
// No manually-written tests for forward-iteration, because the code passed multiple
// days of fuzzing comparing with known-good behavior.

234
vendor/utf8_iter/src/report.rs vendored Normal file
View File

@@ -0,0 +1,234 @@
// Copyright Mozilla Foundation
//
// Licensed under the Apache License (Version 2.0), or the MIT license,
// (the "Licenses") at your option. You may not use this file except in
// compliance with one of the Licenses. You may obtain copies of the
// Licenses at:
//
// https://www.apache.org/licenses/LICENSE-2.0
// https://opensource.org/licenses/MIT
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licenses is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licenses for the specific language governing permissions and
// limitations under the Licenses.
use crate::in_inclusive_range8;
use crate::UTF8_DATA;
use core::fmt::Formatter;
use core::iter::FusedIterator;
/// Error value signaling that a byte sequence was not well-formed UTF-8.
///
/// Note: `core::error::Error` is not implemented due to implementing it
/// being an [unstable feature][1] at the time of writing.
///
/// [1]: https://github.com/rust-lang/rust/issues/103765
#[derive(Debug, PartialEq)]
#[non_exhaustive]
pub struct Utf8CharsError;

impl core::fmt::Display for Utf8CharsError {
    /// Writes a fixed, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        f.write_str("byte sequence not well-formed UTF-8")
    }
}
/// Iterator by `Result<char,Utf8CharsError>` over `&[u8]` that contains
/// potentially-invalid UTF-8. There is exactly one `Utf8CharsError` per
/// each error as defined by the WHATWG Encoding Standard.
///
/// The example below checks that mapping every error to U+FFFD gives the
/// same sequence as [`Utf8Chars`](crate::Utf8Chars).
///
/// ```
/// let s = b"a\xFFb\xFF\x80c\xF0\x9F\xA4\xA6\xF0\x9F\xA4\xF0\x9F\xF0d";
/// let plain = utf8_iter::Utf8Chars::new(s);
/// let reporting = utf8_iter::ErrorReportingUtf8Chars::new(s);
/// assert!(plain.eq(reporting.map(|r| r.unwrap_or('\u{FFFD}'))));
/// ```
#[derive(Debug, Clone)]
pub struct ErrorReportingUtf8Chars<'a> {
// The bytes that have not been consumed yet. Forward iteration trims this
// slice from the front, backward iteration trims it from the back.
remaining: &'a [u8],
}
impl<'a> ErrorReportingUtf8Chars<'a> {
    /// Creates the iterator from a byte slice.
    #[inline(always)]
    pub fn new(bytes: &'a [u8]) -> Self {
        Self { remaining: bytes }
    }

    /// Returns the bytes that have not been consumed yet, as a subslice of
    /// the original slice.
    #[inline(always)]
    pub fn as_slice(&self) -> &'a [u8] {
        self.remaining
    }

    /// Slow path of `next()`: decodes one item byte by byte.
    ///
    /// Handles inputs shorter than four bytes and every ill-formed
    /// sequence. An ill-formed sequence consumes its longest valid prefix
    /// (at least one byte) and is reported as a single `Utf8CharsError`.
    #[inline(never)]
    fn next_fallback(&mut self) -> Option<Result<char, Utf8CharsError>> {
        let bytes = self.remaining;
        let lead = *bytes.first()?;
        if lead < 0x80 {
            self.remaining = &bytes[1..];
            return Some(Ok(char::from(lead)));
        }

        // Range the second byte must fall in; it is narrower than
        // 0x80..=0xBF for the lead bytes that could otherwise encode
        // overlong forms, surrogates or values above U+10FFFF.
        let (second_min, second_max) = match lead {
            0xE0 => (0xA0, 0xBF),
            0xED => (0x80, 0x9F),
            0xF0 => (0x90, 0xBF),
            0xF4 => (0x80, 0x8F),
            _ => (0x80, 0xBF),
        };
        let second = match bytes.get(1) {
            Some(&byte)
                if in_inclusive_range8(lead, 0xC2, 0xF4)
                    && in_inclusive_range8(byte, second_min, second_max) =>
            {
                byte
            }
            // Invalid lead byte, truncated input or unacceptable second
            // byte: only the lead byte is consumed.
            _ => {
                self.remaining = &bytes[1..];
                return Some(Err(Utf8CharsError));
            }
        };
        if lead < 0xE0 {
            self.remaining = &bytes[2..];
            let point = (u32::from(lead & 0x1F) << 6) | u32::from(second & 0x3F);
            // SAFETY: `lead` is in 0xC2..=0xDF and `second` in 0x80..=0xBF,
            // so `point` is in 0x80..=0x7FF, which contains no surrogates.
            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
        }

        let third = match bytes.get(2) {
            Some(&byte) if in_inclusive_range8(byte, 0x80, 0xBF) => byte,
            // Truncated input or a non-continuation third byte: the valid
            // two-byte prefix is consumed.
            _ => {
                self.remaining = &bytes[2..];
                return Some(Err(Utf8CharsError));
            }
        };
        self.remaining = &bytes[3..];
        if lead < 0xF0 {
            let point = (u32::from(lead & 0xF) << 12)
                | (u32::from(second & 0x3F) << 6)
                | u32::from(third & 0x3F);
            // SAFETY: `lead` is in 0xE0..=0xEF and the second-byte range
            // chosen above excludes overlong forms (0xE0) and surrogates
            // (0xED), so `point` is a scalar value in 0x800..=0xFFFF.
            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
        }
        // At this point, we have a valid 3-byte prefix of a
        // four-byte sequence that has to be incomplete, because
        // otherwise `next()` would have succeeded.
        Some(Err(Utf8CharsError))
    }
}
impl<'a> Iterator for ErrorReportingUtf8Chars<'a> {
    type Item = Result<char, Utf8CharsError>;

    /// Decodes and returns the next item, or `None` when no bytes remain.
    ///
    /// The loop below is the fast path for well-formed input with at least
    /// four bytes remaining; everything else (short tails and ill-formed
    /// sequences) is handed to `next_fallback()`.
    #[inline]
    fn next(&mut self) -> Option<Result<char, Utf8CharsError>> {
        // This loop is only broken out of as goto forward
        #[allow(clippy::never_loop)]
        loop {
            if self.remaining.len() < 4 {
                break;
            }
            let first = self.remaining[0];
            if first < 0x80 {
                self.remaining = &self.remaining[1..];
                return Some(Ok(char::from(first)));
            }
            let second = self.remaining[1];
            if in_inclusive_range8(first, 0xC2, 0xDF) {
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break;
                }
                let point = ((u32::from(first) & 0x1F) << 6) | (u32::from(second) & 0x3F);
                self.remaining = &self.remaining[2..];
                // SAFETY: `first` is in 0xC2..=0xDF and `second` in
                // 0x80..=0xBF, so `point` is in 0x80..=0x7FF, which
                // contains no surrogates.
                return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
            }
            // This table-based formulation was benchmark-based in encoding_rs,
            // but it hasn't been re-benchmarked in this iterator context.
            let third = self.remaining[2];
            if first < 0xF0 {
                // The expression equals 2 only when the two table entries
                // have no bit in common (lead/second pair accepted) and
                // `third` is a continuation byte (`third >> 6 == 2`).
                if ((UTF8_DATA.table[usize::from(second)]
                    & UTF8_DATA.table[usize::from(first) + 0x80])
                    | (third >> 6))
                    != 2
                {
                    break;
                }
                let point = ((u32::from(first) & 0xF) << 12)
                    | ((u32::from(second) & 0x3F) << 6)
                    | (u32::from(third) & 0x3F);
                self.remaining = &self.remaining[3..];
                // SAFETY: the table check above only accepts lead bytes
                // 0xE0..=0xEF with a second byte that rules out overlong
                // forms and surrogates, and `third` is a continuation
                // byte, so `point` is a scalar value in 0x800..=0xFFFF.
                return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
            }
            let fourth = self.remaining[3];
            // Equals 0x202 only when the lead/second pair is accepted by
            // the table and both `third` and `fourth` are continuation
            // bytes.
            if (u16::from(
                UTF8_DATA.table[usize::from(second)] & UTF8_DATA.table[usize::from(first) + 0x80],
            ) | u16::from(third >> 6)
                | (u16::from(fourth & 0xC0) << 2))
                != 0x202
            {
                break;
            }
            let point = ((u32::from(first) & 0x7) << 18)
                | ((u32::from(second) & 0x3F) << 12)
                | ((u32::from(third) & 0x3F) << 6)
                | (u32::from(fourth) & 0x3F);
            self.remaining = &self.remaining[4..];
            // SAFETY: the table check above only accepts lead bytes
            // 0xF0..=0xF4 with a second byte that keeps the value within
            // 0x10000..=0x10FFFF, and the trailing bytes are continuation
            // bytes, so `point` is a valid scalar value.
            return Some(Ok(unsafe { char::from_u32_unchecked(point) }));
        }
        self.next_fallback()
    }

    /// Returns bounds on the number of remaining items.
    ///
    /// Every item, whether `Ok` or `Err`, consumes at least one and at
    /// most four bytes, so `len` remaining bytes yield between
    /// `ceil(len / 4)` and `len` items.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.remaining.len();
        // `len + 3` cannot overflow: slice lengths never exceed `isize::MAX`.
        ((len + 3) / 4, Some(len))
    }
}
impl<'a> DoubleEndedIterator for ErrorReportingUtf8Chars<'a> {
    /// Decodes and returns the last remaining item, or `None` when no
    /// bytes remain.
    ///
    /// Looks for a non-continuation byte among the last four bytes and
    /// forward-decodes the tail that starts there. The result is used only
    /// if that single forward step consumes the whole tail; otherwise the
    /// final byte alone is reported as an error.
    #[inline]
    fn next_back(&mut self) -> Option<Result<char, Utf8CharsError>> {
        let bytes = self.remaining;
        if bytes.is_empty() {
            return None;
        }
        // Zero-based distance from the end of the nearest byte that is not
        // a continuation byte (`0b10xx_xxxx`), searching at most four bytes.
        let lead_from_end = bytes
            .iter()
            .rev()
            .take(4)
            .position(|&byte| byte & 0xC0 != 0x80);
        if let Some(distance) = lead_from_end {
            let (head, tail) = bytes.split_at(bytes.len() - (distance + 1));
            let mut inner = ErrorReportingUtf8Chars::new(tail);
            let candidate = inner.next();
            if inner.as_slice().is_empty() {
                self.remaining = head;
                return candidate;
            }
        }
        self.remaining = &bytes[..bytes.len() - 1];
        Some(Err(Utf8CharsError))
    }
}
/// Marks the iterator as fused: once the remaining slice is empty,
/// `next()` keeps returning `None`.
impl<'a> FusedIterator for ErrorReportingUtf8Chars<'a> {}
#[cfg(test)]
mod tests {
    use crate::ErrorReportingUtf8Chars;

    /// The item type yielded by the error-reporting iterator.
    type Item = <ErrorReportingUtf8Chars<'static> as Iterator>::Item;

    // Should be a static assert, but not taking a dependency for this.
    /// Wrapping the item in `Option` must not cost more space than
    /// `Option<char>` does.
    #[test]
    fn test_size() {
        let reporting = core::mem::size_of::<Option<Item>>();
        let plain = core::mem::size_of::<Option<char>>();
        assert_eq!(reporting, plain);
    }

    /// Two equal `Ok` items compare equal; this also checks that the item
    /// type supports `==` and `Debug`, which `assert_eq!` requires.
    #[test]
    fn test_eq() {
        let a: Item = Ok('a');
        let a_again: Item = Ok('a');
        assert_eq!(a, a_again);
    }
}