chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,490 @@
/* Copyright 2018-2020 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail.
//!
//! To be predictable, all errors consume one element each.
//!
//! The iterator adaptors produce neither offset nor element length to work
//! well with other adaptors,
//! while the slice iterators yield both to make more advanced use cases easy.
use crate::errors::{Utf16FirstUnitError, Utf16PairError, Utf8Error};
use crate::errors::Utf16SliceError::*;
use crate::errors::Utf16PairError::*;
use crate::errors::Utf8ErrorKind::*;
use crate::utf8_char::Utf8Char;
use crate::utf16_char::Utf16Char;
use crate::traits::U16UtfExt;
extern crate core;
use core::borrow::Borrow;
use core::fmt::{self, Debug};
use core::iter::Chain;
use core::option;
/// Iterator adaptor that merges the bytes of another iterator into `Utf8Char`s.
///
/// Every item is either a decoded character or the error describing why
/// the byte at the current position could not start one.
///
/// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf8CharMerger<B, I>
where
    B: Borrow<u8>,
    I: Iterator<Item = B>,
{
    /// The wrapped byte source.
    iter: I,
    /// Number of entries of `after_err_stack` that are currently in use.
    after_err_leftover: u8,
    /// Bytes that had already been pulled out of `iter` when an error was
    /// detected and that must be decoded again.
    /// Stored as a stack (next byte to retry on top) because that makes
    /// popping simple.
    after_err_stack: [u8; 3],
}
impl<B:Borrow<u8>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf8CharMerger<B, I> {
fn from(t: T) -> Self {
Utf8CharMerger {
iter: t.into_iter(),
after_err_leftover: 0,
after_err_stack: [0; 3],
}
}
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Utf8CharMerger<B,I> {
/// Extract the inner iterator.
///
/// If the last item produced by `.next()` was an `Err`,
/// up to three following bytes might be missing.
/// The exact number of missing bytes for each error type should not be relied on.
///
/// # Examples
///
/// Three bytes swallowed:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// let mut inner: std::slice::Iter<u8> = merger.into_inner();
/// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
/// ```
///
/// All bytes present:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xb0FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// assert_eq!(merger.into_inner().next(), Some(&b'F'));
/// ```
///
/// Two bytes missing:
/// ```
/// # use encode_unicode::IterExt;
/// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
/// assert!(merger.next().unwrap().is_err());
/// assert_eq!(merger.into_inner().next(), Some(&b'F'));
/// ```
pub fn into_inner(self) -> I {
self.iter
}
fn save(&mut self, bytes: &[u8;4], len: usize) {
// forget bytes[0] and push the others onto self.after_err_stack (in reverse).
for &after_err in bytes[1..len].iter().rev() {
self.after_err_stack[self.after_err_leftover as usize] = after_err;
self.after_err_leftover += 1;
}
}
/// Reads len-1 bytes into bytes[1..]
fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),Utf8Error> {
// This is the only function that pushes onto after_err_stack,
// and it checks that all bytes are continuation bytes before fetching the next one.
// Therefore only the last byte retrieved can be a non-continuation byte.
// That last byte is also the last to be retrieved from after_err.
//
// Before this function is called, there has been retrieved at least one byte.
// If that byte was a continuation byte, next() produces an error
// and won't call this function.
// Therefore, we know that after_err is empty at this point.
// This means that we can use self.iter directly, and knows where to start pushing
debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
for i in 1..len {
if let Some(extra) = self.iter.next() {
let extra = *extra.borrow();
bytes[i] = extra;
if extra & 0b1100_0000 != 0b1000_0000 {
// not a continuation byte
self.save(bytes, i+1);
return Err(Utf8Error{ kind: InterruptedSequence })
}
} else {
self.save(bytes, i);
return Err(Utf8Error{ kind: TooFewBytes });
}
}
Ok(())
}
}
impl<B: Borrow<u8>, I: Iterator<Item = B>> Iterator for Utf8CharMerger<B, I> {
    type Item = Result<Utf8Char, Utf8Error>;

    /// Decode the next codepoint, or report why the next byte cannot start one.
    ///
    /// Returns `None` once both the after-error buffer and the inner iterator
    /// are exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // Bytes buffered by an earlier error are retried before new input is read.
        let first: u8 = if self.after_err_leftover != 0 {
            self.after_err_leftover -= 1;
            self.after_err_stack[usize::from(self.after_err_leftover)]
        } else {
            match self.iter.next() {
                Some(item) => *item.borrow(),
                None => return None,
            }
        };

        let mut bytes = [first, 0, 0, 0];
        let verdict: Result<(), Utf8Error> = match first {
            // one byte (ASCII): nothing more to read
            0b0000_0000..=0b0111_1111 => Ok(()),
            // a continuation byte cannot start a codepoint
            0b1000_0000..=0b1011_1111 => Err(Utf8Error{ kind: UnexpectedContinuationByte }),
            // lead bytes of overlong two-byte sequences, of four-byte sequences
            // with a too high codepoint, and of sequences longer than four bytes
            0b1100_0000..=0b1100_0001 | 0b1111_0101..=0b1111_1111 => {
                Err(Utf8Error{ kind: NonUtf8Byte })
            }
            // two bytes and not overlong: no extra validation required
            0b1100_0010..=0b1101_1111 => self.extra(&mut bytes, 2),
            // three bytes
            0b1110_0000..=0b1110_1111 => match self.extra(&mut bytes, 3) {
                Err(e) => Err(e),
                Ok(()) => {
                    let second = bytes[1];
                    let rejected = if first == 0b1110_0000 && second <= 0b10_011111 {
                        Some(OverlongEncoding)
                    } else if first == 0b1110_1101 && (second & 0b11_100000) == 0b10_100000 {
                        Some(Utf16ReservedCodepoint)
                    } else {
                        None
                    };
                    match rejected {
                        Some(kind) => {
                            self.save(&bytes, 3);
                            Err(Utf8Error{ kind })
                        }
                        None => Ok(()),
                    }
                }
            },
            // four bytes
            0b1111_0000..=0b1111_0100 => match self.extra(&mut bytes, 4) {
                Err(e) => Err(e),
                Ok(()) => {
                    let second = bytes[1];
                    let rejected = if first == 0b11110_000 && second <= 0b10_001111 {
                        Some(OverlongEncoding)
                    } else if first == 0b11110_100 && second > 0b10_001111 {
                        Some(TooHighCodepoint)
                    } else {
                        None
                    };
                    match rejected {
                        Some(kind) => {
                            self.save(&bytes, 4);
                            Err(Utf8Error{ kind })
                        }
                        None => Ok(()),
                    }
                }
            },
        };
        // SAFETY: `Ok(())` is only produced after the lead byte, the
        // continuation bytes and the overlong / surrogate / too-high checks
        // above have all passed, and the unused trailing bytes are still zero.
        // NOTE(review): confirm this is exactly the contract documented on
        // `Utf8Char::from_array_unchecked()`.
        Some(verdict.map(|()| unsafe { Utf8Char::from_array_unchecked(bytes) }))
    }

    /// Cheap, inexact bounds: at most four bytes are merged into one item,
    /// and every byte (buffered or not) produces at most one item.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (inner_low, inner_high) = self.iter.size_hint();
        // The lower bound neither rounds up nor counts the buffered bytes.
        let low = inner_low / 4;
        // checked_add() turns the unlikely overflow into "no upper bound".
        let buffered = usize::from(self.after_err_leftover);
        let high = match inner_high {
            Some(bytes) => bytes.checked_add(buffered),
            None => None,
        };
        (low, high)
    }
}
impl<B: Borrow<u8>, I: Iterator<Item = B> + Debug> Debug for Utf8CharMerger<B, I> {
    /// Shows the buffered bytes in the order they will be decoded
    /// (the reverse of how they are stored), followed by the inner iterator.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let pending = usize::from(self.after_err_leftover);
        let mut in_order = [0u8; 3];
        let newest_first = self.after_err_stack[..pending].iter().rev();
        for (slot, &byte) in in_order.iter_mut().zip(newest_first) {
            *slot = byte;
        }
        fmtr.debug_struct("Utf8CharMerger")
            .field("buffered", &&in_order[..pending])
            .field("inner", &self.iter)
            .finish()
    }
}
/// Slice-only counterpart of [`Utf8CharMerger`](struct.Utf8CharMerger.html)
/// that also produces the offset and length of every item.
///
/// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices)
/// for examples and error handling.
#[derive(Default, Clone)]
pub struct Utf8CharDecoder<'a> {
    /// The bytes being decoded; iterating from the back shortens it.
    slice: &'a [u8],
    /// Offset into `slice` of the next byte to decode from the front.
    index: usize,
}
impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> {
fn from(s: &[u8]) -> Utf8CharDecoder {
Utf8CharDecoder { slice: s, index: 0 }
}
}
impl<'a> Utf8CharDecoder<'a> {
    /// Get the part of the source slice that has not been decoded yet.
    ///
    /// # Examples
    ///
    /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error
    /// are never swallowed:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices();
    /// assert!(iter.next().unwrap().1.is_err());
    /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS");
    /// ```
    pub fn as_slice(&self) -> &'a [u8] {
        let source: &'a [u8] = self.slice;
        let (_decoded, rest) = source.split_at(self.index);
        rest
    }
}
impl<'a> Iterator for Utf8CharDecoder<'a> {
    /// `(offset of the first byte, decoded character or error, bytes consumed)`
    type Item = (usize, Result<Utf8Char, Utf8Error>, usize);

    /// Decode the codepoint at the current position.
    ///
    /// A successful decode advances past the whole codepoint,
    /// an error advances by exactly one byte.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.index;
        let remaining = &self.slice[start..];
        let (outcome, consumed) = match Utf8Char::from_slice_start(remaining) {
            Ok((decoded, len)) => (Ok(decoded), len),
            // an error caused by there being nothing left means we are done
            Err(_) if remaining.is_empty() => return None,
            Err(error) => (Err(error), 1),
        };
        self.index += consumed;
        Some((start, outcome, consumed))
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let unread = self.slice.len() - self.index;
        // Cannot be exact, so keep it simple and don't round the lower bound up:
        // the slice is unlikely to be full of 4-byte codepoints, so a buffer
        // sized from the lower bound will have to grow anyway.
        let low = unread / 4;
        (low, Some(unread))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> {
    /// Decode the last codepoint of the not yet consumed part of the slice.
    ///
    /// The returned offset is relative to the start of the original slice,
    /// just like the offsets produced by `next()`.
    /// A successful decode removes the whole codepoint from the end,
    /// an error removes exactly one byte and reports that byte's offset.
    ///
    /// Producing one error per byte in both directions means that overlong and
    /// codepoint errors of multi-byte sequences are reported as
    /// `UnexpectedContinuationByte` / too-short errors when iterating backwards.
    ///
    /// Never panics on malformed input: a tail consisting solely of
    /// continuation bytes (which has no lead byte to decode from) and an
    /// error in the very first byte of the slice are both reported as errors.
    fn next_back(&mut self) -> Option<Self::Item> {
        let end = self.slice.len();
        if self.index >= end {
            return None;
        }
        // Count trailing continuation bytes, looking only at bytes the front
        // of the iterator has not consumed.
        let extras = self.slice[self.index..].iter()
            .rev()
            .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
            .count();
        let error = if extras == end - self.index {
            // Nothing but continuation bytes left: there is no lead byte
            // to start decoding from (and `end - (extras+1)` could underflow).
            Utf8Error{ kind: UnexpectedContinuationByte }
        } else {
            let starts = end - (extras + 1);
            match Utf8Char::from_slice_start(&self.slice[starts..]) {
                Ok((u8c, len)) if len == 1 + extras => {
                    self.slice = &self.slice[..starts];
                    return Some((starts, Ok(u8c), len));
                },
                // the last byte is itself the invalid (or incomplete) lead byte
                Err(e) if extras == 0 => e,
                // too many or too few continuation bytes for the lead byte
                _ => Utf8Error{ kind: UnexpectedContinuationByte },
            }
        };
        // Drop only the last byte; its offset must be computed before the
        // slice is shortened.
        let last = end - 1;
        self.slice = &self.slice[..last];
        Some((last, Err(error), 1))
    }
}
impl<'a> Debug for Utf8CharDecoder<'a> {
    /// Prints the current offset and the bytes that remain to be decoded.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let offset = self.index;
        let remaining = self.as_slice();
        write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", offset, remaining)
    }
}
/// Iterator adaptor that merges the `u16`s of another iterator into `Utf16Char`s.
///
/// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars)
/// for examples and error handling.
#[derive(Clone, Default)]
pub struct Utf16CharMerger<B, I>
where
    B: Borrow<u16>,
    I: Iterator<Item = B>,
{
    /// The wrapped unit source.
    iter: I,
    /// The unit that was read when a trailing surrogate was expected but
    /// something else was found; it can have any value and is decoded next.
    prev: Option<B>,
}
impl<B:Borrow<u16>, I:Iterator<Item=B>, T:IntoIterator<IntoIter=I,Item=B>>
From<T> for Utf16CharMerger<B,I> {
fn from(t: T) -> Self {
Utf16CharMerger { iter: t.into_iter(), prev: None }
}
}
impl<B:Borrow<u16>, I:Iterator<Item=B>> Utf16CharMerger<B,I> {
/// Extract the inner iterator.
///
/// If the last item produced was an `Err`, the first unit might be missing.
///
/// # Examples
///
/// Unit right after an error missing
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
/// let mut inner: std::slice::Iter<u16> = merger.into_inner();
/// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger
/// ```
///
/// Error that doesn't swallow any units
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate)));
/// let mut inner: std::slice::Iter<u16> = merger.into_inner();
/// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed
/// ```
pub fn into_inner(self) -> I {
self.iter
}
/// Returns an iterator over the remaining units.
/// Unlike `into_inner()` this will never drop any units.
///
/// The exact type of the returned iterator should not be depended on.
///
/// # Examples
///
/// ```
/// # use encode_unicode::IterExt;
/// # use encode_unicode::error::Utf16PairError;
/// let slice = [0xd901, 'F' as u16, 'S' as u16];
/// let mut merger = slice.iter().to_utf16chars();
/// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate)));
/// let mut remaining = merger.into_remaining_units();
/// assert_eq!(remaining.next(), Some('F' as u16).as_ref());
/// ```
pub fn into_remaining_units(self) -> Chain<option::IntoIter<B>,I> {
self.prev.into_iter().chain(self.iter)
}
}
impl<B: Borrow<u16>, I: Iterator<Item = B>> Iterator for Utf16CharMerger<B, I> {
    type Item = Result<Utf16Char, Utf16PairError>;

    /// Decode the next codepoint from one or two units, or report why the
    /// next unit cannot start one.
    ///
    /// When a leading surrogate is followed by something other than a trailing
    /// surrogate, that second unit is buffered and decoded by the next call.
    fn next(&mut self) -> Option<Self::Item> {
        // A unit buffered by the previous call takes priority over new input.
        let first = match self.prev.take() {
            Some(buffered) => buffered,
            None => self.iter.next()?,
        };
        let lead: u16 = *first.borrow();
        let item = match lead.utf16_needs_extra_unit() {
            Err(Utf16FirstUnitError) => Err(Utf16PairError::UnexpectedTrailingSurrogate),
            // SAFETY: `utf16_needs_extra_unit()` reported that `lead` is a
            // complete codepoint on its own, and the unused unit is zero.
            // NOTE(review): confirm against the contract documented on
            // `Utf16Char::from_array_unchecked()`.
            Ok(false) => Ok(unsafe { Utf16Char::from_array_unchecked([lead, 0]) }),
            Ok(true) => match self.iter.next() {
                None => Err(Utf16PairError::Incomplete),
                Some(second) => {
                    let trail: u16 = *second.borrow();
                    match trail.utf16_needs_extra_unit() {
                        // SAFETY: `lead` requires a second unit and `trail` was
                        // reported to be a trailing surrogate.
                        // NOTE(review): confirm against the contract documented
                        // on `Utf16Char::from_tuple_unchecked()`.
                        Err(Utf16FirstUnitError) => Ok(unsafe {
                            Utf16Char::from_tuple_unchecked((lead, Some(trail)))
                        }),
                        Ok(_) => {
                            // not part of this codepoint: decode it on the next call
                            self.prev = Some(second);
                            Err(Utf16PairError::UnmatchedLeadingSurrogate)
                        }
                    }
                }
            },
        };
        Some(item)
    }

    /// Cheap, inexact bounds: at most two units are merged into one item,
    /// and every unit (buffered or not) produces at most one item.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (inner_low, inner_high) = self.iter.size_hint();
        // The lower bound neither rounds up nor counts the buffered unit.
        let low = inner_low / 2;
        let buffered = if self.prev.is_some() { 1 } else { 0 };
        let high = inner_high.and_then(|units| units.checked_add(buffered));
        (low, high)
    }
}
impl<B: Borrow<u16>, I: Iterator<Item = B> + Debug> Debug for Utf16CharMerger<B, I> {
    /// Shows the value of the buffered unit (if any) and the inner iterator.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let buffered: Option<u16> = self.prev.as_ref().map(|unit| *unit.borrow());
        let mut output = fmtr.debug_struct("Utf16CharMerger");
        output.field("buffered", &buffered);
        output.field("inner", &self.iter);
        output.finish()
    }
}
/// Slice-only counterpart of [`Utf16CharMerger`](struct.Utf16CharMerger.html)
/// that also produces the offset and length of every item.
///
/// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices)
/// for examples and error handling.
#[derive(Default, Clone)]
pub struct Utf16CharDecoder<'a> {
    /// The units being decoded.
    slice: &'a [u16],
    /// Offset into `slice` of the next unit to decode.
    index: usize,
}
impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> {
fn from(s: &'a[u16]) -> Self {
Utf16CharDecoder{ slice: s, index: 0 }
}
}
impl<'a> Utf16CharDecoder<'a> {
    /// Extract the remainder of the source slice.
    ///
    /// The returned slice borrows from the original data (lifetime `'a`),
    /// not from the decoder, so it can outlive the iterator -
    /// the same as `Utf8CharDecoder::as_slice()`.
    ///
    /// # Examples
    ///
    /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed:
    /// ```
    /// # use encode_unicode::SliceExt;
    /// # use encode_unicode::error::Utf16PairError;
    /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices();
    /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1)));
    /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]);
    /// ```
    pub fn as_slice(&self) -> &'a [u16] {
        &self.slice[self.index..]
    }
}
impl<'a> Iterator for Utf16CharDecoder<'a> {
type Item = (usize,Result<Utf16Char,Utf16PairError>,usize);
#[inline]
fn next(&mut self) -> Option<Self::Item> {
let start = self.index;
match Utf16Char::from_slice_start(self.as_slice()) {
Ok((u16c,len)) => {
self.index += len;
Some((start, Ok(u16c), len))
},
Err(EmptySlice) => None,
Err(FirstIsTrailingSurrogate) => {
self.index += 1;
Some((start, Err(UnexpectedTrailingSurrogate), 1))
},
Err(SecondIsNotTrailingSurrogate) => {
self.index += 1;
Some((start, Err(UnmatchedLeadingSurrogate), 1))
},
Err(MissingSecond) => {
self.index = self.slice.len();
Some((start, Err(Incomplete), 1))
}
}
}
#[inline]
fn size_hint(&self) -> (usize,Option<usize>) {
let units = self.slice.len() - self.index;
// Cannot be exact, so KISS and don't bother rounding up.
// The slice is unlikely be full of surrogate pairs, so buffers
// allocated with the lower bound will have to be grown anyway.
(units/2, Some(units))
}
}
impl<'a> Debug for Utf16CharDecoder<'a> {
    /// Prints the current offset and the units that remain to be decoded.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let offset = self.index;
        let remaining = self.as_slice();
        write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", offset, remaining)
    }
}

309
vendor/encode_unicode/src/errors.rs vendored Normal file
View File

@@ -0,0 +1,309 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
//! Boilerplate-y error types.
//!
//! The discriminant values of the enums might change in minor releases.
//! (to reduce the size of the `Result<>` types they are returned in)
extern crate core;
use core::fmt::{self,Display,Formatter};
use core::ops::RangeInclusive;
#[cfg(feature="std")]
use std::error::Error;
// Generates the description boilerplate shared by the error types in this file.
//
// `$err` is the error type and `$desc` is an expression that is called with
// `&$err` and must produce the `&'static str` describing it.
// The macro emits:
// * `description()`: as an inherent method without the `std` feature,
//   as the implementation of `Error::description()` with it.
// * a `Display` impl that prints that same description.
macro_rules! description {($err:ty, $desc:expr) => {
// Without `std` the `Error` trait is not imported, so the method is made
// available directly on the type.
#[cfg(not(feature="std"))]
impl $err {
#[allow(missing_docs)]
pub fn description(&self) -> &'static str {
($desc)(self)
}
}
#[cfg(feature="std")]
impl Error for $err {
fn description(&self) -> &'static str {
($desc)(self)
}
}
impl Display for $err {
fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
#![allow(deprecated)] // calling our own function
write!(fmtr, "{}", self.description())
}
}
}}
// Declares a unit-struct error type `$err` that has exactly one possible cause.
//
// The attributes (doc comments) given before the name are forwarded to the
// struct, and `$desc` becomes the text returned by `description()` and
// printed by `Display` (via the `description!` macro).
macro_rules! single_cause {($(#[$doc:meta])* $err:ident => $desc:expr) => {
$(#[$doc])*
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct $err;
// the closure ignores the value because there is only one thing to describe
description!{$err, |_| $desc }
}}
// Errors with a single cause: each is a unit struct generated by
// `single_cause!`, with the string after `=>` as its description.
single_cause!{
/// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit)
/// when called on an `u16` that's a trailing surrogate.
Utf16FirstUnitError => "is a trailing surrogate"
}
single_cause!{
/// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii)
/// for bytes that are not ASCII characters.
NonAsciiError => "not an ASCII character"
}
single_cause!{
/// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp)
/// for units that are not a standalone codepoint.
NonBmpError => "not a codepoint in the basic multilingual plane"
}
single_cause!{
/// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start)
/// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start)
/// when called with an empty string.
EmptyStrError => "is empty"
}
// Declares a field-less error enum `$err` together with its descriptions.
//
// Every variant is written as `Variant => "description",` (the trailing
// comma is required, and there must be at least one variant).
// Attributes (doc comments) before the enum name and before each variant are
// forwarded to the generated items.
// `description()` and `Display` (via the `description!` macro) return the
// string of the variant the value holds.
macro_rules! simple {($(#[$tydoc:meta])* $err:ident {
$( $(#[$vardoc:meta])* $variant:ident => $string:expr, )+
} ) => {
$(#[$tydoc])*
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum $err {
$( $(#[$vardoc])* $variant, )*
}
description!{$err, |e: &$err| match *e {$($err::$variant => $string),*} }
}}
simple!{
/// Error returned when an `u32` is not a valid unicode codepoint.
///
/// `error_range()` gives the range of values each variant is returned for.
CodepointError {
/// It's reserved for UTF-16 surrogate pairs.
Utf16Reserved => "is reserved for UTF-16 surrogate pairs",
/// It's higher than the highest codepoint (which is 0x10ffff).
TooHigh => "is higher than the highest codepoint",
}}
use CodepointError::*;
impl CodepointError {
    /// Get the range of values for which this error would be given.
    ///
    /// * `Utf16Reserved`: the surrogate range `0xd800..=0xdfff`.
    /// * `TooHigh`: everything above the highest codepoint,
    ///   `0x110000..=0xffffffff`.
    pub const fn error_range(self) -> RangeInclusive<u32> {
        match self {
            Utf16Reserved => 0xd8_00..=0xdf_ff,
            // 0x10_ff_ff is itself the highest valid codepoint,
            // so the rejected values start one above it.
            TooHigh => 0x00_11_00_00..=0xff_ff_ff_ff,
        }
    }
}
simple!{
/// Error returned when an `[u16; 2]` doesn't form a valid UTF-16 codepoint.
///
/// Unlike `Utf16TupleError` there are no variants for a missing or
/// superfluous second unit.
Utf16ArrayError {
/// The first element is a trailing / low surrogate, which is never valid.
FirstIsTrailingSurrogate => "the first element is a trailing surrogate",
/// The second element is needed, but is not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate",
}}
simple!{
/// Error returned when one or two `u16`s are not valid UTF-16.
///
/// The variants are listed in descending order of precedence:
/// the condition that causes a variant to be returned is checked
/// before the conditions of the variants listed after it.
Utf16TupleError {
/// The first unit is a trailing / low surrogate, which is never valid.
FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
/// The provided second unit is not necessary.
SuperfluousSecond => "the second unit is superfluous",
/// The first and only unit requires a second unit.
MissingSecond => "the first unit requires a second unit",
/// The second unit is needed and was provided, but is not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
}}
simple!{
/// Error returned when a slice of `u16`s doesn't start with valid UTF-16.
///
/// Only the first one or two units of the slice are described by the
/// variants; none of them concerns later units.
Utf16SliceError {
/// The slice is empty.
EmptySlice => "the slice is empty",
/// The first unit is a trailing surrogate.
FirstIsTrailingSurrogate => "the first unit is a trailing surrogate",
/// The first and only unit requires a second unit.
MissingSecond => "the first and only unit requires a second one",
/// The first unit requires a second one, but it's not a trailing surrogate.
SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate",
}}
simple!{
/// Error returned by [`Utf16CharMerger`](../iterator/struct.Utf16CharMerger.html#impl-Iterator)
/// and `Utf16CharDecoder` when they encounter an invalid sequence.
Utf16PairError {
/// A trailing surrogate was not preceded by a leading surrogate.
UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate",
/// A leading surrogate was followed by a unit that was not a trailing surrogate.
UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate",
/// A trailing surrogate was expected when the end was reached.
Incomplete => "a trailing surrogate was expected when the end was reached",
}}
simple!{
/// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr)
/// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails.
///
/// Both types hold exactly one codepoint, so the string can fail by having
/// too many or too few.
FromStrError {
/// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint.
MultipleCodepoints => "contains more than one codepoint",
/// `Utf8Char` and `Utf16Char` cannot be empty.
Empty => "is empty",
}
}
/// Error returned when an invalid UTF-8 sequence is encountered.
///
/// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors
/// that this type can be returned for.
///
/// Values can be compared directly against an `Utf8ErrorKind` with `==`.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub struct Utf8Error {
// Only constructible inside this crate; read it through `kind()`.
pub(crate) kind: Utf8ErrorKind,
}
impl Utf8Error {
/// Get the type of error.
pub const fn kind(&self) -> Utf8ErrorKind {
self.kind
}
#[cfg(not(feature="std"))]
#[allow(missing_docs)]
pub const fn description(&self) -> &'static str {
utf8_error_description(self.kind)
}
}
#[cfg(feature="std")]
impl Error for Utf8Error {
    /// Returns the same static text that `Display` prints.
    fn description(&self) -> &'static str {
        let kind = self.kind;
        utf8_error_description(kind)
    }
}
impl Display for Utf8Error {
    /// Writes the static description of the error kind.
    fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result {
        let text: &'static str = utf8_error_description(self.kind);
        fmtr.write_str(text)
    }
}
/// The types of errors that can occur when decoding a UTF-8 codepoint.
///
/// The variants are more technical than what an end user is likely interested
/// in, but might be useful for deciding how to handle the error.
///
/// They can be grouped into three categories:
/// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`.
/// * Input might be binary, a different encoding or corrupted:
///   `UnexpectedContinuationByte` and `InterruptedSequence`.
///   (Broken UTF-8 sequence).
/// * Less likely to happen accidentally and might be malicious:
///   `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`.
///   Note that these can still be caused by certain valid latin-1 strings
///   such as `"Á©"` (`b"\xC1\xA9"`).
// NOTE(review): `NonUtf8Byte` is not assigned to any of the categories above,
// and `Utf8CharMerger` reports the lead byte 0xC1 of the latin-1 example as
// `NonUtf8Byte` - confirm which category it belongs in.
#[derive(Clone,Copy, Debug, PartialEq,Eq)]
pub enum Utf8ErrorKind {
/// There are too few bytes to decode the codepoint.
///
/// This can happen when a slice is empty or too short, or an iterator
/// returned `None` while in the middle of a codepoint.
/// This error is never produced by functions accepting fixed-size
/// `[u8; 4]` arrays.
///
/// If decoding text coming chunked (such as in buffers passed to `Read`),
/// the remaining bytes should be carried over into the next chunk or buffer.
/// (including the byte this error was produced for.)
TooFewBytes,
/// A byte which is never used by well-formed UTF-8 was encountered.
///
/// This means that the input is using a different encoding,
/// is corrupted or binary.
///
/// This error is returned when a byte in the following ranges
/// is encountered anywhere in an UTF-8 sequence:
///
/// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding
///   of a single-byte, ASCII, character, and should therefore never occur.
/// * `245..=247` (`0b1111_0101`, `0b1111_0110` and `0b1111_0111`):
///   Indicates a too high codepoint. (above `\u10ffff`)
/// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes.
NonUtf8Byte,
/// The first byte is not a valid start of a codepoint.
///
/// This might happen as a result of slicing into the middle of a codepoint,
/// the input not being UTF-8 encoded or being corrupted.
/// Errors of this type coming right after another error should probably
/// be ignored, unless returned more than three times in a row.
///
/// This error is returned when the first byte has a value in the range
/// `128..=191` (`0b1000_0000..=0b1011_1111`).
UnexpectedContinuationByte,
/// The byte at index 1..=3 should be a continuation byte,
/// but doesn't fit the pattern `0b10xx_xxxx`.
///
/// When the input slice or iterator has too few bytes,
/// [`TooFewBytes`](#variant.TooFewBytes) is returned instead.
InterruptedSequence,
/// The encoding of the codepoint has so many leading zeroes that it
/// could be a byte shorter.
///
/// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10):
/// Doing so could allow an attacker to circumvent input validation that
/// only checks for ASCII characters, and input characters or strings that
/// would otherwise be rejected, such as `/../`.
///
/// This error is only returned for 3 and 4-byte encodings;
/// `NonUtf8Byte` is returned for bytes that start longer or shorter
/// overlong encodings.
OverlongEncoding,
/// The codepoint is reserved for UTF-16 surrogate pairs.
///
/// (`Utf8Char` cannot be used to work with the
/// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.)
///
/// This error is returned for codepoints in the range `\ud800`..=`\udfff`.
/// (which are three bytes long as UTF-8)
Utf16ReservedCodepoint,
/// The codepoint is higher than `\u10ffff`, which is the highest codepoint
/// unicode permits.
TooHighCodepoint,
}
/// Map an error kind to the short static text used by `description()` and `Display`.
///
/// Several kinds intentionally share the same text.
const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str {
    match kind {
        Utf8ErrorKind::TooFewBytes => "too few bytes",
        Utf8ErrorKind::NonUtf8Byte
        | Utf8ErrorKind::UnexpectedContinuationByte
        | Utf8ErrorKind::InterruptedSequence => "not UTF-8",
        Utf8ErrorKind::OverlongEncoding
        | Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input",
        Utf8ErrorKind::TooHighCodepoint => "invalid character",
    }
}
impl PartialEq<Utf8ErrorKind> for Utf8Error {
fn eq(&self, kind: &Utf8ErrorKind) -> bool {
self.kind == *kind
}
}
impl PartialEq<Utf8Error> for Utf8ErrorKind {
fn eq(&self, error: &Utf8Error) -> bool {
*self == error.kind
}
}

89
vendor/encode_unicode/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,89 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
* Copyright 2018 Aljoscha Meyer
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
/*!
Miscellaneous UTF-8 and UTF-16 types and methods.
# Optional features:
* `#![no_std]`-mode: There are a few differences:
* `Error` doesn't exist, but `description()` is made available as an inherent impl.
* `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing.
* There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` don't implement `Read`.
This feature is enabled by setting `default-features=false` in `Cargo.toml`:
`encode_unicode = {version="0.3.4", default-features=false}`
* Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate:
Convert `Utf8Char` and `Utf16Char` to and from
[`ascii::AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html).
# Minimum supported Rust version
The minimum supported Rust version for 1.0.\* releases is 1.56.
Later 1.y.0 releases might require newer Rust versions, but the three most
recent stable releases at the time of publishing will always be supported.
For example this means that if the current stable Rust version is 1.66 when
`encode_unicode` 1.1.0 is released, then `encode_unicode` 1.1.\* will
not require a newer Rust version than 1.63.
[crates.io page](https://crates.io/crates/encode_unicode)
[github repository](https://github.com/tormol/encode_unicode)
*/
#![cfg_attr(not(feature="std"), no_std)]
#![warn(missing_docs, unsafe_op_in_unsafe_fn)]
#![allow(
clippy::unusual_byte_groupings,// I sometimes group into UTF-8 control part and codepoint part
clippy::derive_hash_xor_eq,// tested
clippy::len_without_is_empty,// the character types are never empty
clippy::needless_return,// `foo.bar();\n foo` looks unfinished
clippy::redundant_closure_call,// not redundant in macros
clippy::cast_lossless,// the sizes are part of the struct name and so won't change
clippy::many_single_char_names,// the variables are in different scopes
clippy::cmp_owned,// smaller than pointer, and no allocations anyway
clippy::wrong_self_convention,// smaller than pointer
clippy::needless_range_loop,// the suggested iterator chains are less intuitive
clippy::identity_op,// applying a set of operations with varying arguments to many elements looks nice
clippy::get_first,// .get(0), .get(1) is more readable
clippy::question_mark,// I prefer it very explicit
)]
#![warn(clippy::doc_markdown, clippy::manual_filter_map)]
// opt-in lints that might be interesting to recheck once in a while:
//#![warn(clippy::unwrap_used)]
mod errors;
mod traits;
mod utf8_char;
mod utf8_iterators;
mod utf16_char;
mod utf16_iterators;
mod decoding_iterators;
pub use traits::{CharExt, U8UtfExt, U16UtfExt, StrExt, IterExt, SliceExt};
pub use utf8_char::Utf8Char;
pub use utf16_char::Utf16Char;
pub mod error {// keeping the public interface in one file
//! Errors returned by various conversion methods in this crate.
pub use crate::errors::{FromStrError, EmptyStrError};
pub use crate::errors::{CodepointError, NonAsciiError, NonBmpError};
pub use crate::errors::{Utf8Error, Utf8ErrorKind};
pub use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError};
pub use crate::errors::{Utf16FirstUnitError, Utf16PairError};
}
pub mod iterator {
    //! Iterator types that you should rarely need to name

    // iterators that decode fallibly from bytes or units
    pub use crate::decoding_iterators::{
        Utf16CharDecoder, Utf16CharMerger, Utf8CharDecoder, Utf8CharMerger,
    };
    pub use crate::utf16_iterators::{
        Utf16CharIndices, Utf16CharSplitter, Utf16Chars, Utf16Iterator,
    };
    pub use crate::utf8_iterators::{
        Utf8CharIndices, Utf8CharSplitter, Utf8Chars, Utf8Iterator,
    };
}

1012
vendor/encode_unicode/src/traits.rs vendored Normal file

File diff suppressed because it is too large Load Diff

692
vendor/encode_unicode/src/utf16_char.rs vendored Normal file
View File

@@ -0,0 +1,692 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::utf16_iterators::Utf16Iterator;
use crate::traits::{CharExt, U16UtfExt};
use crate::utf8_char::Utf8Char;
use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError};
use crate::errors::{NonBmpError, EmptyStrError, FromStrError};
extern crate core;
use core::{hash,fmt};
use core::cmp::Ordering;
use core::borrow::Borrow;
use core::ops::Deref;
use core::str::FromStr;
#[cfg(feature="std")]
use core::iter::FromIterator;
#[cfg(feature="ascii")]
use core::char;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// There might not be any good default value for a character, but `char`
// implements `Default`, so this type does too.
// The derived value is two zero units, which is how `'\0'` is stored.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// When it's a single unit, the second is zero, so Eq works.
// #[derive(Ord)] however, breaks on surrogate pairs.
#[derive(PartialEq,Eq)]
#[derive(Clone,Copy)]
/// A Unicode codepoint stored as UTF-16.
///
/// It can be borrowed as a `u16` slice, and has the same size as `char`.
pub struct Utf16Char {
// The first unit, followed by the trailing surrogate,
// or by zero when the codepoint fits in a single unit.
units: [u16; 2],
}
/////////////////////
//conversion traits//
/////////////////////
impl FromStr for Utf16Char {
    type Err = FromStrError;
    /// Parse a string slice that consists of exactly one codepoint.
    ///
    /// # Errors
    ///
    /// Returns `FromStrError::Empty` for an empty string, and
    /// `FromStrError::MultipleCodepoints` if anything follows the first codepoint.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf16Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf16Char::from_str("a"), Ok(Utf16Char::from('a')));
    /// assert_eq!(Utf16Char::from_str("🂠"), Ok(Utf16Char::from('🂠')));
    /// assert_eq!(Utf16Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf16Char::from_str("ab"), Err(MultipleCodepoints));
    /// // 'e' followed by a combining mark is two codepoints
    /// assert_eq!(Utf16Char::from_str("e\u{301}"), Err(MultipleCodepoints));
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        let (u16c, consumed) = match Utf16Char::from_str_start(s) {
            Ok(decoded) => decoded,
            Err(EmptyStrError) => return Err(FromStrError::Empty),
        };
        // the whole string must have been used up by the first codepoint
        if consumed == s.len() {
            Ok(u16c)
        } else {
            Err(FromStrError::MultipleCodepoints)
        }
    }
}
impl From<char> for Utf16Char {
    /// Encode a `char` as UTF-16.
    ///
    /// The second unit is stored as zero when no surrogate pair is needed.
    fn from(c: char) -> Self {
        let units = match c.to_utf16_tuple() {
            (leading, Some(trailing)) => [leading, trailing],
            (single, None) => [single, 0],
        };
        Utf16Char{ units }
    }
}
impl From<Utf8Char> for Utf16Char {
/// Re-encode from UTF-8 to UTF-16 by moving the payload bits directly.
fn from(utf8: Utf8Char) -> Utf16Char {
let (b, utf8_len) = utf8.to_array();
match utf8_len {
// ASCII: the byte value is the unit value
1 => Utf16Char{ units: [b[0] as u16, 0] },
4 => {// need surrogate
// Start at 0xd800 with the 0x01_00_00 offset already subtracted
// (scaled down by the ten bits that go into the second unit),
// then add the upper payload bits of the codepoint.
let mut first = 0xd800 - (0x01_00_00u32 >> 10) as u16;
first += (b[0] as u16 & 0x07) << 8;
first += (b[1] as u16 & 0x3f) << 2;
first += (b[2] as u16 & 0x30) >> 4;
// The trailing surrogate holds the lowest ten payload bits.
let mut second = 0xdc00;
second |= (b[2] as u16 & 0x0f) << 6;
second |= b[3] as u16 & 0x3f;
Utf16Char{ units: [first, second] }
},
_ => { // 2 or 3
// Merge the first two bytes, then shift in the third one if there is one.
// NOTE(review): the 0x1f mask is shared by both lengths; this presumably
// relies on bit 4 of a three-byte leading byte always being zero.
let mut unit = ((b[0] as u16 & 0x1f) << 6) | (b[1] as u16 & 0x3f);
if utf8_len == 3 {
unit = (unit << 6) | (b[2] as u16 & 0x3f);
}
Utf16Char{ units: [unit, 0] }
},
}
}
}
impl From<Utf16Char> for char {
/// Decode the stored units into a `char`.
///
/// Uses the unchecked array conversion;
/// NOTE(review): this presumably relies on every `Utf16Char` holding valid
/// UTF-16, which is what the `# Safety` sections of the `*_unchecked`
/// constructors require.
fn from(uc: Utf16Char) -> char {
char::from_utf16_array_unchecked(uc.to_array())
}
}
impl IntoIterator for Utf16Char {
type Item=u16;
type IntoIter=Utf16Iterator;
/// Iterate over the units.
fn into_iter(self) -> Utf16Iterator {
Utf16Iterator::from(self)
}
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for Vec<u16> {
    /// Append the units of every character.
    ///
    /// Reserves room for one unit per character reported by the lower bound
    /// of the iterator's `size_hint()` before pushing anything.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        for Utf16Char{ units: [first, second] } in chars {
            self.push(first);
            // a zero second unit means the codepoint fit in the first one
            if second != 0 {
                self.push(second);
            }
        }
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for Vec<u16> {
    /// Append the units of every referenced character.
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().copied();
        self.extend(owned)
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for Vec<u16> {
    /// Collect the units of every character into a new vector.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut units = Vec::new();
        units.extend(iter);
        units
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for Vec<u16> {
    /// Collect the units of every referenced character into a new vector.
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        iter.into_iter().copied().collect()
    }
}
#[cfg(feature="std")]
impl Extend<Utf16Char> for String {
    /// Append every character, re-encoded as UTF-8.
    fn extend<I:IntoIterator<Item=Utf16Char>>(&mut self, iter: I) {
        let as_utf8 = iter.into_iter().map(Utf8Char::from);
        self.extend(as_utf8);
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf16Char> for String {
    /// Append every referenced character, re-encoded as UTF-8.
    fn extend<I:IntoIterator<Item=&'a Utf16Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().copied();
        self.extend(owned);
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf16Char> for String {
    /// Build a new string from the characters.
    fn from_iter<I:IntoIterator<Item=Utf16Char>>(iter: I) -> Self {
        let mut text = String::new();
        text.extend(iter);
        text
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf16Char> for String {
    /// Build a new string from the referenced characters.
    fn from_iter<I:IntoIterator<Item=&'a Utf16Char>>(iter: I) -> Self {
        iter.into_iter().copied().collect()
    }
}
/////////////////
//getter traits//
/////////////////
impl AsRef<[u16]> for Utf16Char {
    /// Borrow the one or two units that make up the character.
    #[inline]
    fn as_ref(&self) -> &[u16] {
        let used = self.len();
        &self.units[..used]
    }
}
impl Borrow<[u16]> for Utf16Char {
    /// Borrow the one or two units that make up the character.
    #[inline]
    fn borrow(&self) -> &[u16] {
        let units: &[u16] = self.as_ref();
        units
    }
}
impl Deref for Utf16Char {
    type Target = [u16];
    /// Dereference to the one or two units that make up the character.
    #[inline]
    fn deref(&self) -> &[u16] {
        &self.units[..self.len()]
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf16Char {
/// Widen the ASCII byte into a single UTF-16 unit.
#[inline]
fn from(ac: AsciiChar) -> Self {
Utf16Char{ units: [ac.as_byte() as u16, 0] }
}
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf16Char {
/// Delegates to the `u16` implementation for the first unit.
#[inline]
fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
// Only the first unit is inspected.
// NOTE(review): a leading surrogate is far above the ASCII range,
// so surrogate pairs are presumably rejected by the `u16` impl.
self.units[0].to_ascii_char()
}
/// Delegates to the unchecked `u16` implementation for the first unit.
#[inline]
unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
unsafe { self.units[0].to_ascii_char_unchecked() }
}
}
/////////////////////////////////////////////////////////
//General traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
impl hash::Hash for Utf16Char {
    /// Hashes the decoded `char` instead of the stored units.
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        let codepoint = self.to_char();
        hash::Hash::hash(&codepoint, state);
    }
}
impl fmt::Debug for Utf16Char {
    /// Formats the character the way `char` is formatted by `Debug`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let codepoint = self.to_char();
        fmt::Debug::fmt(&codepoint, fmtr)
    }
}
impl fmt::Display for Utf16Char {
    /// Converts to `Utf8Char` and lets that type write itself.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let utf8 = Utf8Char::from(*self);
        fmt::Display::fmt(&utf8, fmtr)
    }
}
// These impls cannot be derived, because a character made up of two units
// must always compare greater than one that fits in a single unit.
impl PartialOrd for Utf16Char {
    #[inline]
    fn partial_cmp(&self, rhs: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, rhs))
    }
}
impl Ord for Utf16Char {
    /// Compare two characters in codepoint order.
    #[inline]
    fn cmp(&self, rhs: &Self) -> Ordering {
        /// Map the units to an integer that sorts like the codepoint.
        ///
        /// The first unit is shifted by 0xd if there is a trailing surrogate,
        /// and by 0 otherwise. This puts every surrogate pair above 0xffff,
        /// and lets the second unit only decide when the first units are equal.
        /// A constant multiplier cannot do the same, because it would have to
        /// be both greater than 1023 and smaller than 5.5.
        /// This is less work than combining the surrogates into a codepoint.
        fn sort_key(units: [u16; 2]) -> u32 {
            let first = u32::from(units[0]);
            let second = u32::from(units[1]);
            (first << (second >> 12)) + second
        }
        sort_key(self.units).cmp(&sort_key(rhs.units))
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
impl PartialEq<char> for Utf16Char {
    /// Compares by converting the `char` to UTF-16.
    fn eq(&self, u32c: &char) -> bool {
        let other = Utf16Char::from(*u32c);
        *self == other
    }
}
impl PartialEq<Utf16Char> for char {
    /// Compares by converting the `char` to UTF-16.
    fn eq(&self, u16c: &Utf16Char) -> bool {
        let this = Utf16Char::from(*self);
        this == *u16c
    }
}
impl PartialOrd<char> for Utf16Char {
    /// Orders by converting the `char` to UTF-16.
    fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
        let other = Utf16Char::from(*u32c);
        self.partial_cmp(&other)
    }
}
impl PartialOrd<Utf16Char> for char {
    /// Orders by converting the `char` to UTF-16.
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        let this = Utf16Char::from(*self);
        this.partial_cmp(u16c)
    }
}
impl PartialEq<Utf8Char> for Utf16Char {
    /// Compares by converting the `Utf8Char` to UTF-16.
    fn eq(&self, u8c: &Utf8Char) -> bool {
        let other = Utf16Char::from(*u8c);
        *self == other
    }
}
impl PartialOrd<Utf8Char> for Utf16Char {
    /// Orders by converting the `Utf8Char` to UTF-16.
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        let other = Utf16Char::from(*u8c);
        self.partial_cmp(&other)
    }
}
// The other direction is implemented in utf8_char.rs
/// A unit is only equal to an `Utf16Char` whose codepoint fits in a single
/// unit, never to one half of a surrogate pair.
///
/// There is no impl in the opposite direction, as this should only be used to
/// compare `Utf16Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6' as u16);
/// assert!(Utf16Char::from('\u{FFFF}') == 0xffff_u16);
/// assert!(Utf16Char::from_tuple((0xd876, Some(0xdef9))).unwrap() != 0xd876_u16);
/// ```
impl PartialEq<u16> for Utf16Char {
    fn eq(&self, unit: &u16) -> bool {
        let [first, second] = self.units;
        // a non-zero second unit means this is a surrogate pair
        second == 0 && first == *unit
    }
}
/// Only considers the byte equal if the codepoint of the `Utf16Char` is <= U+FF.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert!(Utf16Char::from('6') == b'6');
/// assert!(Utf16Char::from('\u{00FF}') == b'\xff');
/// assert!(Utf16Char::from('\u{0100}') != b'\0');
/// ```
impl PartialEq<u8> for Utf16Char {
fn eq(&self, byte: &u8) -> bool {
self.units[0] == *byte as u16
}
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII never compare equal.
impl PartialEq<AsciiChar> for Utf16Char {
#[inline]
fn eq(&self, ascii: &AsciiChar) -> bool {
// Only the first unit is compared.
// NOTE(review): presumably an `AsciiChar` cast never exceeds 127, which
// the first unit of a non-ASCII character always does.
self.units[0] == *ascii as u16
}
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII never compare equal.
impl PartialEq<Utf16Char> for AsciiChar {
#[inline]
fn eq(&self, u16c: &Utf16Char) -> bool {
// mirror image of the impl in the other direction
*self as u16 == u16c.units[0]
}
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII always compare greater.
impl PartialOrd<AsciiChar> for Utf16Char {
#[inline]
fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
// Only the first unit is compared; the second one is never looked at.
self.units[0].partial_cmp(&(*ascii as u16))
}
}
#[cfg(feature = "ascii")]
/// `Utf16Char`s that are not ASCII always compare greater.
impl PartialOrd<Utf16Char> for AsciiChar {
#[inline]
fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
// mirror image of the impl in the other direction
(*self as u16).partial_cmp(&u16c.units[0])
}
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf16Char {
/// A `const fn` alternative to the trait-based `Utf16Char::from(char)`.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// const REPLACEMENT_CHARACTER: Utf16Char = Utf16Char::new('\u{fffd}');
/// ```
pub const fn new(c: char) -> Self {
    let codepoint = c as u32;
    if codepoint > 0xff_ff {
        // Remove the offset, and then split the remaining
        // twenty bits evenly between the two surrogates.
        let offset = codepoint.wrapping_sub(0x01_00_00);
        let leading = 0xd8_00 | (offset >> 10) as u16;
        let trailing = 0xdc_00 | (offset & 0x0_03_ff) as u16;
        Utf16Char{ units: [leading, trailing] }
    } else {
        Utf16Char{ units: [codepoint as u16, 0] }
    }
}
/// Create an `Utf16Char` from the first codepoint in a string slice,
/// converting from UTF-8 to UTF-16.
///
/// The returned `usize` is the number of UTF-8 bytes used from the str,
/// and not the number of UTF-16 units.
///
/// # Errors
///
/// Returns an error if the `str` is empty.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf16Char;
///
/// assert_eq!(Utf16Char::from_str_start("a"), Ok((Utf16Char::from('a'),1)));
/// assert_eq!(Utf16Char::from_str_start("ab"), Ok((Utf16Char::from('a'),1)));
/// assert_eq!(Utf16Char::from_str_start("🂠 "), Ok((Utf16Char::from('🂠'),4)));
/// // 'e' followed by a combining mark: only the 'e' is returned
/// assert_eq!(Utf16Char::from_str_start("e\u{301}"), Ok((Utf16Char::from('e'),1)));
/// assert!(Utf16Char::from_str_start("").is_err());
/// ```
pub const fn from_str_start(s: &str) -> Result<(Self,usize), EmptyStrError> {
if s.is_empty() {
return Err(EmptyStrError);
}
let b = s.as_bytes();
// The first byte decides the length. Within each arm the last byte of the
// codepoint is read first to reduce the number of unnecessary length checks.
match b[0] {
0..=127 => {// 1 byte => 1 unit
let unit = b[0] as u16;// 0b0000_0000_0xxx_xxxx
Ok((Utf16Char{ units: [unit, 0] }, 1))
},
0b1000_0000..=0b1101_1111 => {// 2 bytes => 1 unit
let unit = (((b[1] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[0] & 0x1f) as u16) << 6);// 0b0000_0xxx_xx00_0000
Ok((Utf16Char{ units: [unit, 0] }, 2))
},
0b1110_0000..=0b1110_1111 => {// 3 bytes => 1 unit
let unit = (((b[2] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[1] & 0x3f) as u16) << 6) // 0b0000_xxxx_xx00_0000
| (((b[0] & 0x0f) as u16) << 12);// 0bxxxx_0000_0000_0000
Ok((Utf16Char{ units: [unit, 0] }, 3))
},
_ => {// 4 bytes => 2 units
let second = 0xdc00 // 0b1101_1100_0000_0000
| (((b[3] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx
| (((b[2] & 0x0f) as u16) << 6);// 0b0000_00xx_xx00_0000
// 0xd800 with the 0x01_00_00 offset already subtracted
let first = 0xd800-(0x01_00_00u32>>10) as u16// 0b1101_0111_1100_0000
+ (((b[2] & 0x30) as u16) >> 4) // 0b0000_0000_0000_00xx
+ (((b[1] & 0x3f) as u16) << 2) // 0b0000_0000_xxxx_xx00
+ (((b[0] & 0x07) as u16) << 8); // 0b0000_0xxx_0000_0000
Ok((Utf16Char{ units: [first, second] }, 4))
}
}
}
/// Validate and store the first UTF-16 codepoint in the slice.
/// Also return how many units were needed.
///
/// # Errors
///
/// Passes on the error from `char::from_utf16_slice_start()` unchanged.
pub fn from_slice_start(src: &[u16]) -> Result<(Self,usize), Utf16SliceError> {
    let (_, len) = char::from_utf16_slice_start(src)?;
    // the second unit is only stored if it is part of the codepoint
    let units = if len == 2 { [src[0], src[1]] } else { [src[0], 0] };
    Ok((Utf16Char{ units }, len))
}
/// Store the first UTF-16 codepoint of the slice.
///
/// Returns the character together with the number of units it was read
/// from, which is either 1 or 2.
///
/// # Safety
///
/// The slice must be non-empty and start with a valid UTF-16 codepoint.
/// The length of the slice is never checked.
pub unsafe fn from_slice_start_unchecked(src: &[u16]) -> (Self,usize) {
unsafe {
let first = *src.get_unchecked(0);
// a leading surrogate means that a second unit is read as well
if first.is_utf16_leading_surrogate() {
(Utf16Char{ units: [first, *src.get_unchecked(1)] }, 2)
} else {
(Utf16Char{ units: [first, 0] }, 1)
}
}
}
/// Validate and store an UTF-16 array as returned from `char.to_utf16_array()`.
///
/// The second unit is ignored (and stored as zero) when the first unit is
/// not a surrogate.
///
/// # Errors
///
/// Returns `FirstIsTrailingSurrogate` if the first unit is a trailing
/// surrogate, and `SecondIsNotTrailingSurrogate` if a leading surrogate is
/// followed by anything other than a trailing surrogate.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf16Char;
/// use encode_unicode::error::Utf16ArrayError;
///
/// assert_eq!(Utf16Char::from_array(['x' as u16, 'y' as u16]), Ok(Utf16Char::from('x')));
/// assert_eq!(Utf16Char::from_array(['睷' as u16, 0]), Ok(Utf16Char::from('睷')));
/// assert_eq!(Utf16Char::from_array([0xda6f, 0xdcde]), Ok(Utf16Char::from('\u{abcde}')));
/// assert_eq!(Utf16Char::from_array([0xf111, 0xdbad]), Ok(Utf16Char::from('\u{f111}')));
/// assert_eq!(Utf16Char::from_array([0xdaaf, 0xdaaf]), Err(Utf16ArrayError::SecondIsNotTrailingSurrogate));
/// assert_eq!(Utf16Char::from_array([0xdcac, 0x9000]), Err(Utf16ArrayError::FirstIsTrailingSurrogate));
/// ```
pub const fn from_array(units: [u16; 2]) -> Result<Self,Utf16ArrayError> {
    let first = units[0];
    let second = units[1];
    if (first & 0xf8_00) != 0xd8_00 {
        // not a surrogate, so the first unit is the whole codepoint
        return Ok(Utf16Char { units: [first, 0] });
    }
    if first >= 0xdc_00 {
        return Err(Utf16ArrayError::FirstIsTrailingSurrogate);
    }
    if (second & 0xfc_00) == 0xdc_00 {
        Ok(Utf16Char { units })
    } else {
        Err(Utf16ArrayError::SecondIsNotTrailingSurrogate)
    }
}
/// Create an `Utf16Char` from an array as returned from `char.to_utf16_array()`.
///
/// The array is stored as-is, without any validation.
///
/// # Safety
///
/// The units must form a valid codepoint, and the second unit must be 0
/// when a surrogate pair is not required.
/// Violating this can easily lead to undefined behavior, although unlike
/// `char` bad `Utf16Char`s simply existing is not immediately UB.
pub const unsafe fn from_array_unchecked(units: [u16; 2]) -> Self {
Utf16Char { units }
}
/// Check that a tuple is a valid UTF-16 encoding of a single codepoint.
///
/// A lone unit must not be a surrogate, and a leading surrogate must be
/// followed by a trailing surrogate.
pub(crate) const fn validate_tuple(utf16: (u16,Option<u16>)) -> Result<(),Utf16TupleError> {
// The arms are tried in order, so the error arms only see
// the tuples that were not accepted by the first arm.
match utf16 {
(0x00_00..=0xd7_ff, None) | // single
(0xe0_00..=0xff_ff, None) | // single
(0xd8_00..=0xdb_ff, Some(0xdc_00..=0xdf_ff)) // correct surrogate
=> Ok(()),
(0xd8_00..=0xdb_ff, Some(_)) => Err(Utf16TupleError::SecondIsNotTrailingSurrogate),
(0xd8_00..=0xdb_ff, None ) => Err(Utf16TupleError::MissingSecond),
(0xdc_00..=0xdf_ff, _ ) => Err(Utf16TupleError::FirstIsTrailingSurrogate),
// only non-surrogate first units with a second unit remain
( _ , Some(_)) => Err(Utf16TupleError::SuperfluousSecond),
}
}
/// Validate and store a UTF-16 pair as returned from `char.to_utf16_tuple()`.
///
/// # Errors
///
/// Returns the error from `validate_tuple()` if the tuple is not a valid
/// encoding of one codepoint.
pub const fn from_tuple(utf16: (u16,Option<u16>)) -> Result<Self,Utf16TupleError> {
    if let Err(e) = Self::validate_tuple(utf16) {
        return Err(e);
    }
    // SAFETY: the tuple was validated right above.
    Ok(unsafe { Self::from_tuple_unchecked(utf16) })
}
/// Create an `Utf16Char` from a tuple as returned from `char.to_utf16_tuple()`.
///
/// A missing second unit is stored as zero.
///
/// # Safety
///
/// The units must form a valid codepoint, with the second being `None` when
/// a surrogate pair is not required.
/// Violating this can easily lead to undefined behavior.
pub const unsafe fn from_tuple_unchecked(utf16: (u16,Option<u16>)) -> Self {
    let (first, maybe_second) = utf16;
    let units = if let Some(second) = maybe_second {
        [first, second]
    } else {
        [first, 0]
    };
    Utf16Char { units }
}
/// Create an `Utf16Char` from a single unit.
///
/// Codepoints less than `'\u{1_00_00}'` (which fit in an `u16`)
/// are part of the basic multilingual plane
/// unless they are reserved for surrogate pairs.
///
/// # Errors
///
/// Returns `NonBmpError` if the unit is in the range `0xd800..0xe000`
/// (which means that it's part of a surrogate pair).
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf16Char;
/// assert_eq!(Utf16Char::from_bmp(0x40).unwrap(), '@');
/// assert_eq!(Utf16Char::from_bmp('ø' as u16).unwrap(), 'ø');
/// assert!(Utf16Char::from_bmp(0xdddd).is_err());
/// ```
pub const fn from_bmp(bmp_codepoint: u16) -> Result<Self,NonBmpError> {
    // all surrogates share the same top five bits
    if bmp_codepoint & 0xf800 == 0xd800 {
        Err(NonBmpError)
    } else {
        Ok(Utf16Char{ units: [bmp_codepoint, 0] })
    }
}
/// Create an `Utf16Char` from a single unit without checking that it's a
/// valid codepoint on its own.
///
/// The unit is stored as-is, with zero as the second unit.
///
/// # Safety
///
/// The unit must be less than 0xd800 or greater than 0xdfff.
/// In other words, not part of a surrogate pair.
/// Violating this can easily lead to undefined behavior.
#[inline]
pub const unsafe fn from_bmp_unchecked(bmp_codepoint: u16) -> Self {
Utf16Char{ units: [bmp_codepoint, 0] }
}
/// Checks that the codepoint is in the basic multilingual plane,
/// which means that it is stored as a single unit.
///
/// # Examples
/// ```
/// # use encode_unicode::Utf16Char;
/// assert_eq!(Utf16Char::from('e').is_bmp(), true);
/// assert_eq!(Utf16Char::from('€').is_bmp(), true);
/// assert_eq!(Utf16Char::from('𝔼').is_bmp(), false);
/// ```
#[inline]
pub const fn is_bmp(self) -> bool {
    let [_, second] = self.units;
    second == 0
}
/// The number of units this character is made up of.
///
/// Is either 1 or 2 and identical to `.to_char().len_utf16()`
/// or `.as_ref().len()`.
#[inline]
pub const fn len(self) -> usize {
// The second unit is zero for single-unit characters, while a
// trailing surrogate always has its highest bit set.
1 + (self.units[1] as usize >> 15)
}
// There is no `.is_empty()` because it would always return false.
/// Checks that the codepoint is an ASCII character.
///
/// Only the first unit needs to be looked at.
#[inline]
pub const fn is_ascii(self) -> bool {
    self.units[0] < 0x80
}
/// Checks that two characters are an ASCII case-insensitive match.
///
/// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
    if !(self.is_ascii() && other.is_ascii()) {
        // case is only ignored for ASCII, so the units must match exactly
        return self.units[0] == other.units[0] && self.units[1] == other.units[1];
    }
    let lhs = self.units[0] as u8;
    let rhs = other.units[0] as u8;
    lhs.eq_ignore_ascii_case(&rhs)
}
/// Converts the character to its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_uppercase(self) -> Self {
    let first = self.units[0];
    let (lowest, highest) = (b'a' as u16, b'z' as u16);
    if lowest <= first && first <= highest {
        // the two cases are a fixed distance apart
        Utf16Char{ units: [first - (b'a' - b'A') as u16, 0] }
    } else {
        self
    }
}
/// Converts the character to its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_lowercase(self) -> Self {
    let first = self.units[0];
    let (lowest, highest) = (b'A' as u16, b'Z' as u16);
    if lowest <= first && first <= highest {
        // the two cases are a fixed distance apart
        Utf16Char{ units: [first + (b'a' - b'A') as u16, 0] }
    } else {
        self
    }
}
/// Converts the character to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
///
/// Overwrites `self` with the result of `.to_ascii_uppercase()`.
pub fn make_ascii_uppercase(&mut self) {
*self = self.to_ascii_uppercase()
}
/// Converts the character to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
///
/// Overwrites `self` with the result of `.to_ascii_lowercase()`.
pub fn make_ascii_lowercase(&mut self) {
*self = self.to_ascii_lowercase();
}
/// Convert from UTF-16 to UTF-32.
///
/// Does the same as the `From<Utf16Char>` impl for `char`, which it calls.
pub fn to_char(self) -> char {
self.into()
}
/// Write the internal representation to a slice,
/// and then return the number of `u16`s written.
///
/// # Panics
///
/// Will panic if the buffer is too small;
/// You can get the required length from `.len()`,
/// but a buffer of length two is always large enough.
pub fn to_slice(self, dst: &mut[u16]) -> usize {
// Write the last unit first to avoid repeated length checks.
// `extra` is 1 if there is a trailing surrogate (whose highest bit is
// always set), and 0 otherwise.
let extra = self.units[1] as usize >> 15;
match dst.get_mut(extra) {
Some(first) => *first = self.units[extra],
None => panic!("The provided buffer is too small.")
}
// the index has already been shown to be in bounds by the check above
if extra != 0 {dst[0] = self.units[0];}
extra+1
}
/// Get the character represented as an array of two units.
///
/// The second `u16` is zero for codepoints that fit in one unit.
/// This is a copy of the internal representation.
#[inline]
pub const fn to_array(self) -> [u16;2] {
self.units
}
/// Get the character as a tuple of its first unit and an optional second unit.
///
/// The second `u16` is used for surrogate pairs, and is `None` otherwise.
#[inline]
pub const fn to_tuple(self) -> (u16,Option<u16>) {
    let [first, second] = self.units;
    // a trailing surrogate always has its highest bit set
    if second >> 15 == 0 {
        (first, None)
    } else {
        (first, Some(second))
    }
}
}

View File

@@ -0,0 +1,265 @@
/* Copyright 2018-2019 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::traits::CharExt;
use crate::utf16_char::Utf16Char;
use crate::errors::EmptyStrError;
extern crate core;
use core::fmt;
use core::borrow::Borrow;
// Invalid values that say the field is consumed or empty.
// A trailing surrogate can never be the first unit of a codepoint,
// and zero can never be the second unit.
const FIRST_USED: u16 = 0x_dc_00;
const SECOND_USED: u16 = 0;
/// Iterate over the units of the UTF-16 representation of a codepoint.
#[derive(Clone)]
pub struct Utf16Iterator {
// the first unit, or FIRST_USED after it has been returned
first: u16,
// the second unit, or SECOND_USED if there is none or it has been returned
second: u16,
}
impl From<char> for Utf16Iterator {
    /// Encode the `char` as UTF-16 and iterate over the units.
    fn from(c: char) -> Self {
        let encoded = c.to_utf16();
        encoded.into()
    }
}
impl From<Utf16Char> for Utf16Iterator {
    /// Iterate over the units of the character.
    fn from(uc: Utf16Char) -> Self {
        match uc.to_tuple() {
            (first, Some(second)) => Utf16Iterator{first, second},
            // a missing second unit is marked as already used
            (first, None) => Utf16Iterator{first, second: SECOND_USED},
        }
    }
}
impl Iterator for Utf16Iterator {
    type Item=u16;
    /// Return the first unit if it hasn't been returned yet,
    /// and otherwise the second one if there is one.
    fn next(&mut self) -> Option<u16> {
        if self.first != FIRST_USED {
            let unit = self.first;
            self.first = FIRST_USED;
            Some(unit)
        } else if self.second != SECOND_USED {
            let unit = self.second;
            self.second = SECOND_USED;
            Some(unit)
        } else {
            None
        }
    }
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for Utf16Iterator {
    /// The number of units that have not been returned yet.
    fn len(&self) -> usize {
        usize::from(self.first != FIRST_USED) + usize::from(self.second != SECOND_USED)
    }
}
impl fmt::Debug for Utf16Iterator {
    /// Shows the remaining units as a list of decimal numbers.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut remaining = self.clone();
        let first = remaining.next();
        let second = remaining.next();
        match first {
            None => fmtr.write_str("[]"),
            Some(a) => match second {
                None => write!(fmtr, "[{}]", a),
                Some(b) => write!(fmtr, "[{}, {}]", a, b),
            },
        }
    }
}
/// Converts an iterator of `Utf16Char` (or `&Utf16Char`)
/// to an iterator of `u16`s.
///
/// This is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator,
/// but the returned iterator is about twice as fast.
///
/// The exact number of units cannot be known in advance, but `size_hint()`
/// gives the possible range.
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{IterExt, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf16() );
/// let mut units = [0; 4];
/// iterator.to_units().zip(&mut units).for_each(|(u,dst)| *dst = u );
/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt, Utf16Char};
///
/// // (💣 takes two units)
/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
/// let units: Vec<u16> = chars.iter().to_units().collect();
/// let flat_map: Vec<u16> = chars.iter().cloned().flatten().collect();
/// assert_eq!(units, flat_map);
/// ```
#[derive(Clone)]
pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
// the source of characters
inner: I,
// The second unit of the most recently read character if it hasn't been
// returned yet, and zero otherwise.
prev_second: u16,
}
impl<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
From<I> for Utf16CharSplitter<U, I::IntoIter> {
    /// Wrap anything that can be iterated over, with no unit pending.
    fn from(iterable: I) -> Self {
        let inner = iterable.into_iter();
        Utf16CharSplitter { inner, prev_second: 0 }
    }
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
    /// Extracts the source iterator.
    ///
    /// Note that `iter.into_inner().to_units()` is not a no-op:
    /// If the last returned unit from `next()` was a leading surrogate,
    /// the trailing surrogate is lost.
    pub fn into_inner(self) -> I {
        let Utf16CharSplitter { inner, .. } = self;
        inner
    }
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
    type Item = u16;
    /// Return the pending second unit if there is one,
    /// and otherwise the first unit of the next character.
    fn next(&mut self) -> Option<Self::Item> {
        if self.prev_second != 0 {
            let trailing = self.prev_second;
            self.prev_second = 0;
            return Some(trailing);
        }
        let u16c = self.inner.next()?;
        let [first, second] = u16c.borrow().to_array();
        // zero if the character has no second unit
        self.prev_second = second;
        Some(first)
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Doesn't need to handle unlikely overflows correctly because
        // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
        let pending = usize::from(self.prev_second != 0);
        let (lower, upper) = self.inner.size_hint();
        // every remaining character is one or two units
        let upper = upper.map(|max| max.wrapping_mul(2).wrapping_add(pending) );
        (lower.wrapping_add(pending), upper)
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`,
/// together with the byte offset each of them starts at.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
// the source `str`; shrinks from the end when iterating backwards
str: &'a str,
// byte offset into `str` of the next codepoint to return from the front
index: usize,
}
impl<'a> From<&'a str> for Utf16CharIndices<'a> {
    /// Start iterating from the beginning of the `str`.
    fn from(s: &'a str) -> Self {
        Utf16CharIndices{ index: 0, str: s }
    }
}
impl<'a> Utf16CharIndices<'a> {
/// Extract the remainder of the source `str`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf16Char};
/// let mut iter = "abc".utf16char_indices();
/// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
/// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
&self.str[self.index..]
}
}
impl<'a> Iterator for Utf16CharIndices<'a> {
    type Item = (usize,Utf16Char);
    /// Decode the next codepoint, and return it together with the byte
    /// offset it starts at.
    fn next(&mut self) -> Option<(usize,Utf16Char)> {
        let start = self.index;
        // the only possible error is that nothing is left
        let (u16c, bytes) = Utf16Char::from_str_start(&self.str[start..]).ok()?;
        self.index = start + bytes;
        Some((start, u16c))
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining = self.str.len() - self.index;
        // For remaining+3 to overflow, the slice must fill all but two bytes of
        // addressable memory, and size_hint() doesn't need to be correct.
        let at_least = remaining.wrapping_add(3) / 4;
        (at_least, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
/// Decode the last codepoint of the remaining `str`, and shrink the `str`
/// to end right before it.
fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
// `index` is where `next()` continues from, so nothing is left when
// it has reached the end of the `str`.
if self.index < self.str.len() {
// Count the continuation bytes (0b10xx_xxxx) at the end of the str;
// the byte before them is the first byte of the last codepoint.
let rev = self.str.bytes().rev();
let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
let starts = self.str.len() - len;
// cannot fail, because the slice contains at least one byte
let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
self.str = &self.str[..starts];
Some((starts, u16c))
} else {
None
}
}
}
impl<'a> fmt::Debug for Utf16CharIndices<'a> {
    /// Shows the current byte offset and the remainder of the `str`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let remainder = self.as_str();
        let mut tuple = fmtr.debug_tuple("Utf16CharIndices");
        tuple.field(&self.index);
        tuple.field(&remainder);
        tuple.finish()
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
///
/// Wraps an `Utf16CharIndices` and leaves out the byte offsets.
#[derive(Clone)]
pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
impl<'a> From<&'a str> for Utf16Chars<'a> {
    /// Start iterating from the beginning of the `str`.
    fn from(s: &'a str) -> Self {
        let indices = Utf16CharIndices::from(s);
        Utf16Chars(indices)
    }
}
impl<'a> Utf16Chars<'a> {
/// Extract the remainder of the source `str`.
///
/// This is the part that has not been returned from either end yet.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf16Char};
/// let mut iter = "abc".utf16chars();
/// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
/// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
// the wrapped iterator keeps track of the remainder
self.0.as_str()
}
}
impl<'a> Iterator for Utf16Chars<'a> {
type Item = Utf16Char;
fn next(&mut self) -> Option<Utf16Char> {
self.0.next().map(|(_,u16c)| u16c )
}
fn size_hint(&self) -> (usize,Option<usize>) {
self.0.size_hint()
}
}
impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
fn next_back(&mut self) -> Option<Utf16Char> {
self.0.next_back().map(|(_,u16c)| u16c )
}
}
impl<'a> fmt::Debug for Utf16Chars<'a> {
    /// Shows the remainder of the `str`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let remainder = self.as_str();
        let mut tuple = fmtr.debug_tuple("Utf16Chars");
        tuple.field(&remainder);
        tuple.finish()
    }
}

647
vendor/encode_unicode/src/utf8_char.rs vendored Normal file
View File

@@ -0,0 +1,647 @@
/* Copyright 2016-2022 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::errors::{FromStrError, EmptyStrError, NonAsciiError, Utf8Error};
use crate::utf8_iterators::Utf8Iterator;
use crate::traits::{CharExt, U8UtfExt};
use crate::utf16_char::Utf16Char;
extern crate core;
use core::{hash, fmt, str, ptr};
use core::cmp::Ordering;
use core::borrow::Borrow;
use core::ops::Deref;
#[cfg(feature="std")]
use core::iter::FromIterator;
#[cfg(feature="ascii")]
extern crate ascii;
#[cfg(feature="ascii")]
use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError};
// There might not be any good default value for a character, but `char`
// implements `Default`, so this type does too.
// The derived value is four zero bytes.
#[derive(Default)]
// char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either.
// The default impl of Ord for arrays works out because longer codepoints
// start with more ones, so if they're equal, the length is the same,
// breaks down for values above 0x1f_ff_ff but those can only be created by unsafe code.
#[derive(PartialEq,Eq, PartialOrd,Ord)]
#[derive(Clone,Copy)]
/// A Unicode codepoint stored as UTF-8.
///
/// It can be borrowed as a `str`, and has the same size as `char`.
pub struct Utf8Char {
// the encoded bytes, with the unused ones at the end set to zero
bytes: [u8; 4],
}
/////////////////////
//conversion traits//
/////////////////////
impl str::FromStr for Utf8Char {
    type Err = FromStrError;
    /// Parse a string slice that consists of exactly one codepoint.
    ///
    /// # Errors
    ///
    /// Returns `FromStrError::Empty` for an empty string, and
    /// `FromStrError::MultipleCodepoints` if the length of the string differs
    /// from the length of its first codepoint.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::error::FromStrError::*;
    /// use encode_unicode::Utf8Char;
    /// use std::str::FromStr;
    ///
    /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a')));
    /// assert_eq!(Utf8Char::from_str("🂠"), Ok(Utf8Char::from('🂠')));
    /// assert_eq!(Utf8Char::from_str(""), Err(Empty));
    /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints));
    /// // 'e' followed by a combining mark is two codepoints
    /// assert_eq!(Utf8Char::from_str("e\u{301}"), Err(MultipleCodepoints));
    /// ```
    fn from_str(s: &str) -> Result<Self, FromStrError> {
        let src = s.as_bytes();
        let first = match src.first() {
            Some(&byte) => byte,
            None => return Err(FromStrError::Empty),
        };
        // the first byte tells how long the first codepoint is
        if src.len() != 1+first.extra_utf8_bytes_unchecked() {
            return Err(FromStrError::MultipleCodepoints);
        }
        let mut bytes = [0; 4];
        bytes[..src.len()].copy_from_slice(src);
        Ok(Utf8Char{bytes})
    }
}
impl From<Utf16Char> for Utf8Char {
/// Re-encode from UTF-16 to UTF-8 by moving the payload bits directly.
fn from(utf16: Utf16Char) -> Utf8Char {
// The arms are tried in order, so each range
// only sees the values that the arms above it didn't match.
match utf16.to_tuple() {
// one byte
(ascii @ 0..=0x00_7f, _) => {
Utf8Char{ bytes: [ascii as u8, 0, 0, 0] }
},
// two bytes
(unit @ 0..=0x07_ff, _) => {
let byte2 = 0x80 | (unit & 0x00_3f) as u8;
let byte1 = 0xc0 | ((unit & 0x07_c0) >> 6) as u8;
Utf8Char{ bytes: [byte1, byte2, 0, 0] }
},
// three bytes: the rest of the codepoints that fit in a single unit
(unit, None) => {
let byte3 = 0x80 | (unit & 0x00_3f) as u8;
let byte2 = 0x80 | ((unit & 0x0f_c0) >> 6) as u8;
let byte1 = 0xe0 | ((unit & 0xf0_00) >> 12) as u8;
Utf8Char{ bytes: [byte1, byte2, byte3, 0] }
},
// four bytes: surrogate pair
(first, Some(second)) => {
// Add back the 0x01_00_00 offset, scaled down by the ten bits that
// are stored in the second unit.
let first = first + (0x01_00_00u32 >> 10) as u16;
let byte4 = 0x80 | (second & 0x00_3f) as u8;
// this byte gets bits from both units
let byte3 = 0x80 | ((second & 0x03_c0) >> 6) as u8
| (( first & 0x00_03) << 4) as u8;
let byte2 = 0x80 | (( first & 0x00_fc) >> 2) as u8;
let byte1 = 0xf0 | (( first & 0x07_00) >> 8) as u8;
Utf8Char{ bytes: [byte1, byte2, byte3, byte4] }
}
}
}
}
impl From<char> for Utf8Char {
/// Encode a `char` as UTF-8; delegates to `Utf8Char::new()`.
fn from(c: char) -> Self {
Utf8Char::new(c)
}
}
impl From<Utf8Char> for char {
/// Decode into a `char`; delegates to `Utf8Char::to_char()`.
fn from(uc: Utf8Char) -> char {
uc.to_char()
}
}
impl IntoIterator for Utf8Char {
type Item=u8;
type IntoIter=Utf8Iterator;
/// Iterate over the byte values.
fn into_iter(self) -> Utf8Iterator {
Utf8Iterator::from(self)
}
}
#[cfg(feature="std")]
impl Extend<Utf8Char> for Vec<u8> {
    /// Append the UTF-8 bytes of every character produced by `iter`.
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        let chars = iter.into_iter();
        // every character contributes at least one byte
        let (at_least, _) = chars.size_hint();
        self.reserve(at_least);
        chars.for_each(|u8c| {
            // pushing byte by byte is twice as fast as
            // self.extend_from_slice(u8c.as_bytes());
            self.push(u8c.bytes[0]);
            // unused trailing bytes are zero and must be skipped
            u8c.bytes[1..].iter()
                .filter(|&&extra| extra != 0)
                .for_each(|&extra| self.push(extra));
        });
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for Vec<u8> {
    /// Append the UTF-8 bytes of every referenced character.
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().cloned();
        // forward to the by-value implementation
        Extend::<Utf8Char>::extend(self, owned)
    }
}
#[cfg(feature="std")]
impl Extend<Utf8Char> for String {
    /// Append every character produced by `iter` to the string.
    fn extend<I:IntoIterator<Item=Utf8Char>>(&mut self, iter: I) {
        // SAFETY: relies on every `Utf8Char` holding one complete, valid
        // UTF-8 sequence, so the string stays valid UTF-8.
        let raw = unsafe { self.as_mut_vec() };
        raw.extend(iter)
    }
}
#[cfg(feature="std")]
impl<'a> Extend<&'a Utf8Char> for String {
    /// Append every referenced character to the string.
    fn extend<I:IntoIterator<Item=&'a Utf8Char>>(&mut self, iter: I) {
        let owned = iter.into_iter().cloned();
        // forward to the by-value implementation
        Extend::<Utf8Char>::extend(self, owned)
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for String {
    /// Collect the characters into a new `String`.
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> String {
        let mut collected = String::new();
        Extend::<Utf8Char>::extend(&mut collected, iter);
        collected
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for String {
    /// Collect the referenced characters into a new `String`.
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> String {
        let owned = iter.into_iter().cloned();
        owned.collect::<String>()
    }
}
#[cfg(feature="std")]
impl FromIterator<Utf8Char> for Vec<u8> {
    /// Collect the UTF-8 bytes of the characters into a new vector.
    fn from_iter<I:IntoIterator<Item=Utf8Char>>(iter: I) -> Self {
        // build the text first and then take over its buffer
        let text: String = iter.into_iter().collect();
        text.into_bytes()
    }
}
#[cfg(feature="std")]
impl<'a> FromIterator<&'a Utf8Char> for Vec<u8> {
    /// Collect the UTF-8 bytes of the referenced characters into a new vector.
    fn from_iter<I:IntoIterator<Item=&'a Utf8Char>>(iter: I) -> Self {
        // build the text first and then take over its buffer
        let text: String = iter.into_iter().cloned().collect();
        text.into_bytes()
    }
}
/////////////////
//getter traits//
/////////////////
impl AsRef<[u8]> for Utf8Char {
    /// View the used bytes of the internal array.
    fn as_ref(&self) -> &[u8] {
        let used = self.len();
        &self.bytes[..used]
    }
}
impl AsRef<str> for Utf8Char {
    /// View the character as a string slice of one codepoint.
    fn as_ref(&self) -> &str {
        let utf8 = <Self as AsRef<[u8]>>::as_ref(self);
        // SAFETY: relies on the type's invariant that the used bytes form
        // one valid UTF-8 sequence.
        unsafe { str::from_utf8_unchecked(utf8) }
    }
}
impl Borrow<[u8]> for Utf8Char {
    /// Same view as `AsRef<[u8]>`.
    fn borrow(&self) -> &[u8] {
        <Self as AsRef<[u8]>>::as_ref(self)
    }
}
impl Borrow<str> for Utf8Char {
    /// Same view as `AsRef<str>`.
    fn borrow(&self) -> &str {
        <Self as AsRef<str>>::as_ref(self)
    }
}
impl Deref for Utf8Char {
    type Target = str;
    /// Dereference to the one-codepoint string slice.
    fn deref(&self) -> &str {
        <Self as AsRef<str>>::as_ref(self)
    }
}
////////////////
//ascii traits//
////////////////
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl From<AsciiChar> for Utf8Char {
    /// An ASCII character is its own one-byte UTF-8 sequence.
    fn from(ac: AsciiChar) -> Self {
        let mut bytes = [0; 4];
        bytes[0] = ac.as_byte();
        Utf8Char{bytes}
    }
}
#[cfg(feature="ascii")]
/// Requires the feature "ascii".
impl ToAsciiChar for Utf8Char {
    /// Converts based on the first byte alone.
    fn to_ascii_char(self) -> Result<AsciiChar, ToAsciiCharError> {
        let first = self.bytes[0];
        first.to_ascii_char()
    }
    /// Unchecked variant; the requirement is passed on to the first byte.
    unsafe fn to_ascii_char_unchecked(self) -> AsciiChar {
        let first = self.bytes[0];
        // SAFETY: the caller upholds the contract of this method, which is
        // forwarded unchanged to the `u8` implementation.
        unsafe { first.to_ascii_char_unchecked() }
    }
}
/////////////////////////////////////////////////////////
//General traits that cannot be derived to emulate char//
/////////////////////////////////////////////////////////
impl hash::Hash for Utf8Char {
    /// Hashes the decoded `char`, so the result equals that of the `char`.
    fn hash<H : hash::Hasher>(&self, state: &mut H) {
        let codepoint: char = self.to_char();
        hash::Hash::hash(&codepoint, state)
    }
}
impl fmt::Debug for Utf8Char {
    /// Formats exactly like the equivalent `char`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let as_char = self.to_char();
        <char as fmt::Debug>::fmt(&as_char, fmtr)
    }
}
impl fmt::Display for Utf8Char {
    /// Writes the character, honoring width, fill/alignment and precision
    /// the same way `char` and `str` do.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // `pad()` applies the formatter's flags; a plain `write_str()`
        // silently ignores them, so `{:>4}` would not pad.
        fmtr.pad(self.as_str())
    }
}
////////////////////////////////
//Comparisons with other types//
////////////////////////////////
impl PartialEq<char> for Utf8Char {
    /// Compares by encoding the `char` and comparing the two `Utf8Char`s.
    fn eq(&self, u32c: &char) -> bool {
        let encoded = Utf8Char::from(*u32c);
        PartialEq::<Utf8Char>::eq(self, &encoded)
    }
}
impl PartialEq<Utf8Char> for char {
    /// Compares by encoding the `char` and comparing the two `Utf8Char`s.
    fn eq(&self, u8c: &Utf8Char) -> bool {
        let encoded = Utf8Char::from(*self);
        PartialEq::<Utf8Char>::eq(&encoded, u8c)
    }
}
impl PartialOrd<char> for Utf8Char {
fn partial_cmp(&self, u32c: &char) -> Option<Ordering> {
self.partial_cmp(&Self::from(*u32c))
}
}
impl PartialOrd<Utf8Char> for char {
    /// Orders by encoding the `char` and comparing the two `Utf8Char`s.
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        let encoded = Utf8Char::from(*self);
        PartialOrd::<Utf8Char>::partial_cmp(&encoded, u8c)
    }
}
impl PartialEq<Utf16Char> for Utf8Char {
    /// Compares by converting the `Utf16Char` to UTF-8 first.
    fn eq(&self, u16c: &Utf16Char) -> bool {
        let converted = Self::from(*u16c);
        PartialEq::<Utf8Char>::eq(self, &converted)
    }
}
impl PartialOrd<Utf16Char> for Utf8Char {
    /// Orders by converting the `Utf16Char` to UTF-8 first.
    fn partial_cmp(&self, u16c: &Utf16Char) -> Option<Ordering> {
        let converted = Self::from(*u16c);
        PartialOrd::<Utf8Char>::partial_cmp(self, &converted)
    }
}
// The other direction is implemented in utf16_char.rs
/// Only considers the byte equal if both it and the `Utf8Char` represent ASCII characters.
///
/// There is no impl in the opposite direction, as this should only be used to
/// compare `Utf8Char`s against constants.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert!(Utf8Char::from('8') == b'8');
/// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1);
/// assert!(Utf8Char::from('\u{ff}') != 0xff);
/// assert!(Utf8Char::from('\u{80}') != 0x80);
/// ```
impl PartialEq<u8> for Utf8Char {
    fn eq(&self, byte: &u8) -> bool {
        let [first, second, _, _] = self.bytes;
        // A zero second byte means the character is a single byte long.
        first == *byte && second == 0
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<AsciiChar> for Utf8Char {
    #[inline]
    fn eq(&self, ascii: &AsciiChar) -> bool {
        // only the first byte needs to be looked at
        let first = self.bytes[0];
        first == ascii.as_byte()
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII never compare equal.
impl PartialEq<Utf8Char> for AsciiChar {
    #[inline]
    fn eq(&self, u8c: &Utf8Char) -> bool {
        // reuse the impl in the other direction
        PartialEq::<AsciiChar>::eq(u8c, self)
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<AsciiChar> for Utf8Char {
    #[inline]
    fn partial_cmp(&self, ascii: &AsciiChar) -> Option<Ordering> {
        // only the first byte takes part in the comparison
        let first = self.bytes[0];
        PartialOrd::<AsciiChar>::partial_cmp(&first, ascii)
    }
}
#[cfg(feature = "ascii")]
/// `Utf8Char`s that are not ASCII always compare greater.
impl PartialOrd<Utf8Char> for AsciiChar {
    #[inline]
    fn partial_cmp(&self, u8c: &Utf8Char) -> Option<Ordering> {
        // only the first byte takes part in the comparison
        let first = &u8c.bytes[0];
        PartialOrd::<u8>::partial_cmp(self, first)
    }
}
///////////////////////////////////////////////////////
//pub impls that should be together for nicer rustdoc//
///////////////////////////////////////////////////////
impl Utf8Char {
/// A `const fn` alternative to the trait-based `Utf8Char::from(char)`.
///
/// # Example
///
/// ```
/// # use encode_unicode::Utf8Char;
/// const REPLACEMENT_CHARACTER: Utf8Char = Utf8Char::new('\u{fffd}');
/// ```
pub const fn new(c: char) -> Self {
if c.is_ascii() {
Utf8Char{bytes: [c as u8, 0, 0, 0]}
} else {
// How many extra UTF-8 bytes are needed to represent an
// UTF-32 codepoint with a given number of bits.
// Stored as a bit-packed array using two bits per value,
// indexed by the number of bits used:
// 0..=7 bits = no extra bytes
// +4 = 8..=11 bits = one extra byte (5+6 bits)
// +5 = 12..=16 bits = two extra bytes (4+6+6 bits)
// +5 = 17..=21 bits = three extra bytes (3+6+6+6 bits)
const EXTRA_BYTES: u64 = 0b11_11_11_11_11__10_10_10_10_10__01_01_01_01__00_00_00_00_00_00_00__00;
let bits_used = 32 - (c as u32).leading_zeros();
let len = 1 + ((EXTRA_BYTES >> (bits_used*2)) & 0b11);
// copied from CharExt::to_utf8_array()
let mut c = c as u32;
let mut parts = 0;// convert to 6-bit bytes
// Each step moves the next six bits of `c` into the lowest byte of
// `parts`, so the lowest six bits of `c` end up in the highest byte.
parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f; c>>=6;
parts<<=8; parts |= c & 0x3f;
parts |= 0x80_80_80_80;// set the most significant bit
parts >>= 8*(4-len);// right-align bytes
// Now, unused bytes are zero, (which matters for Utf8Char.eq())
// and the rest are 0b10xx_xxxx
// set header on first byte
parts |= (0xff_00u32 >> len) & 0xff;// store length
parts &= !(1u32 << (7-len));// clear the next bit after it
// the first byte of the sequence is the least significant byte of `parts`
Utf8Char {bytes: parts.to_le_bytes()}
}
}
/// Create an `Utf8Char` from the first codepoint in a `str`,
/// and also return the number of bytes that codepoint occupies.
///
/// # Errors
///
/// Returns an error if the `str` is empty.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
///
/// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1)));
/// assert_eq!(Utf8Char::from_str_start("🂠 "), Ok((Utf8Char::from('🂠'),4)));
/// assert_eq!(Utf8Char::from_str_start("e\u{301}"), Ok((Utf8Char::from('e'),1)));// 'e' + combining acute accent
/// assert!(Utf8Char::from_str_start("").is_err());
/// ```
pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> {
    if src.is_empty() {
        return Err(EmptyStrError);
    }
    // SAFETY: the slice is non-empty, and being the bytes of a `str` it
    // starts with a complete, valid UTF-8 sequence.
    Ok(unsafe { Utf8Char::from_slice_start_unchecked(src.as_bytes()) })
}
/// Create an `Utf8Char` of the first codepoint in an UTF-8 slice.
/// Also returns the length of the UTF-8 sequence for the codepoint.
///
/// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation.
///
/// # Errors
///
/// Returns an `Err` if the slice is empty, doesn't start with a valid
/// UTF-8 sequence or is too short for the sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::Utf8ErrorKind::*;
///
/// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1)));
/// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2)));
///
/// assert_eq!(Utf8Char::from_slice_start(&[]).unwrap_err().kind(), TooFewBytes);
/// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]).unwrap_err().kind(), TooFewBytes);
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]).unwrap_err().kind(), InterruptedSequence);
/// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]).unwrap_err().kind(), InterruptedSequence);
/// ```
pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),Utf8Error> {
    // The decoded `char` is only needed for validation; the bytes are
    // copied as they are.
    let (_, len) = char::from_utf8_slice_start(src)?;
    let mut bytes = [0; 4];
    bytes[..len].copy_from_slice(&src[..len]);
    Ok((Utf8Char{bytes}, len))
}
/// A `from_slice_start()` that doesn't validate the codepoint.
///
/// # Safety
///
/// The slice must be non-empty and start with a valid UTF-8 codepoint.
/// Invalid or incomplete values might cause reads of uninitialized memory.
pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) {
    let mut bytes = [0u8; 4];
    // SAFETY: the caller guarantees that `src` is non-empty and holds the
    // whole sequence announced by its first byte.
    let len = unsafe {
        let len = 1 + src.get_unchecked(0).extra_utf8_bytes_unchecked();
        ptr::copy_nonoverlapping(src.as_ptr(), bytes.as_mut_ptr(), len);
        len
    };
    (Utf8Char{bytes}, len)
}
/// Create an `Utf8Char` from a byte array after validating it.
///
/// The codepoint must start at the first byte.
/// Unused bytes are set to zero by this function and so can be anything.
///
/// # Errors
///
/// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence.
///
/// # Examples
///
/// ```
/// use encode_unicode::Utf8Char;
/// use encode_unicode::error::Utf8ErrorKind::*;
///
/// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}')));
/// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A')));
/// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0')));
///
/// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]).unwrap_err().kind(), InterruptedSequence);
/// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]).unwrap_err().kind(), NonUtf8Byte);
/// assert_eq!(Utf8Char::from_array([0xe0, 0x9a, 0xbf, 0]).unwrap_err().kind(), OverlongEncoding);
/// assert_eq!(Utf8Char::from_array([0xf4, 0xaa, 0x99, 0x88]).unwrap_err().kind(), TooHighCodepoint);
/// ```
pub fn from_array(utf8: [u8;4]) -> Result<Self,Utf8Error> {
// perform all validation
char::from_utf8_array(utf8)?;
let extra = utf8[0].extra_utf8_bytes_unchecked() as u32;
// zero unused bytes in one operation by reinterpreting the array as a
// native-endian u32, applying an endian-corrected mask and converting back
// (the mask is written for little endian and keeps the first 1+extra bytes)
let mask = u32::from_le(0xff_ff_ff_ff >> (8*(3-extra)));
let unused_zeroed = mask & u32::from_ne_bytes(utf8); // native endian
Ok(Utf8Char{ bytes: unused_zeroed.to_ne_bytes() })
}
/// Zero-cost constructor.
///
/// # Safety
///
/// Must contain a valid codepoint starting at the first byte, with the
/// unused bytes zeroed.
/// Bad values can easily lead to undefined behavior.
#[inline]
pub const unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self {
Utf8Char{ bytes: utf8 }
}
/// Create an `Utf8Char` from a single byte.
///
/// The byte must be an ASCII character.
///
/// # Errors
///
/// Returns `NonAsciiError` if the byte is greater than 127.
///
/// # Examples
///
/// ```
/// # use encode_unicode::Utf8Char;
/// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a');
/// assert!(Utf8Char::from_ascii(128).is_err());
/// ```
pub const fn from_ascii(ascii: u8) -> Result<Self,NonAsciiError> {
    // the most significant bit is set for every byte above 127
    if ascii >> 7 == 0 {
        Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] })
    } else {
        Err(NonAsciiError)
    }
}
/// Create an `Utf8Char` from a single byte without checking that it's a
/// valid codepoint on its own, which is only true for ASCII characters.
///
/// # Safety
///
/// The byte must be less than 128.
#[inline]
pub const unsafe fn from_ascii_unchecked(ascii: u8) -> Self {
Utf8Char{ bytes: [ascii, 0, 0, 0] }
}
/// The number of bytes this character needs.
///
/// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or
/// `.to_char().len_utf8()`.
#[inline]
pub const fn len(self) -> usize {
// Invariants of the extra bytes enable algorithms that
// `u8.extra_utf8_bytes_unchecked()` cannot use.
// Some of them turned out to require fewer x86 instructions:
// Exploits that unused bytes are zero and calculates the number of
// trailing zero bytes.
// Setting a bit in the first byte prevents the function from returning
// 0 for '\0' (which has 32 leading zeros).
// trailing and leading is swapped below to optimize for little-endian
// architectures.
(4 - (u32::from_le_bytes(self.bytes)|1).leading_zeros()/8) as usize
// Alternative that is not used:
// Exploits that the extra bytes have their most significant bit set if
// in use.
// Takes fewer instructions than the one above if popcnt can be used,
// (which it cannot by default,
// set RUSTFLAGS='-C target-cpu=native' to enable)
//let all = u32::from_ne_bytes(self.bytes);
//let msb_mask = u32::from_be(0x00808080);
//let add_one = u32::from_be(0x80000000);
//((all & msb_mask) | add_one).count_ones() as usize
}
// There is no .is_empty() because this type is never empty.
/// Checks that the codepoint is an ASCII character.
pub const fn is_ascii(self) -> bool {
    // only single-byte sequences start with a byte below 128
    self.bytes[0] <= 0x7f
}
/// Checks that two characters are an ASCII case-insensitive match.
///
/// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`.
pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
    if !self.is_ascii() {
        // Case does not apply, so all bytes must match.
        // [u8; 4] can't be const compared as of Rust 1.60, but u32 can
        let mine = u32::from_le_bytes(self.bytes);
        let theirs = u32::from_le_bytes(other.bytes);
        return mine == theirs;
    }
    let first = self.bytes[0];
    first.eq_ignore_ascii_case(&other.bytes[0])
}
/// Converts the character to its ASCII upper case equivalent.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_uppercase(mut self) -> Self {
self.bytes[0] = self.bytes[0].to_ascii_uppercase();
self
}
/// Converts the character to its ASCII lower case equivalent.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
pub const fn to_ascii_lowercase(mut self) -> Self {
self.bytes[0] = self.bytes[0].to_ascii_lowercase();
self
}
/// Converts the character to its ASCII upper case equivalent in-place.
///
/// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
/// but non-ASCII letters are unchanged.
#[inline]
pub fn make_ascii_uppercase(&mut self) {
    let first = &mut self.bytes[0];
    *first = first.to_ascii_uppercase();
}
/// Converts the character to its ASCII lower case equivalent in-place.
///
/// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
/// but non-ASCII letters are unchanged.
#[inline]
pub fn make_ascii_lowercase(&mut self) {
    let first = &mut self.bytes[0];
    *first = first.to_ascii_lowercase();
}
/// Convert from UTF-8 to UTF-32
pub fn to_char(self) -> char {
    let used = &self.bytes[..self.len()];
    // SAFETY: relies on the type's invariant that the used bytes are
    // exactly one valid UTF-8 sequence.
    unsafe { char::from_utf8_exact_slice_unchecked(used) }
}
/// Write the internal representation to a slice,
/// and then returns the number of bytes written.
///
/// # Panics
///
/// Will panic if the buffer is too small;
/// You can get the required length from `.len()`,
/// but a buffer of length four is always large enough.
pub fn to_slice(self, dst: &mut[u8]) -> usize {
    let len = self.len();
    match dst.get_mut(..len) {
        Some(target) => target.copy_from_slice(&self.bytes[..len]),
        None => panic!("The provided buffer is too small."),
    }
    len
}
/// Expose the internal array and the number of used bytes.
pub const fn to_array(self) -> ([u8;4],usize) {
    let used = self.len();
    (self.bytes, used)
}
/// Return a `str` view of the array the codepoint is stored as.
///
/// Is an unambiguous version of `.as_ref()`.
pub fn as_str(&self) -> &str {
    // goes through the `Deref<Target=str>` impl
    &**self
}
}

View File

@@ -0,0 +1,346 @@
/* Copyright 2018-2020 Torbjørn Birch Moltu
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use crate::utf8_char::Utf8Char;
use crate::errors::EmptyStrError;
extern crate core;
use core::{u32, u64};
use core::ops::Not;
use core::fmt;
use core::borrow::Borrow;
#[cfg(feature="std")]
use std::io::{Read, Error as ioError};
/// Read or iterate over the bytes of the UTF-8 representation of a codepoint.
///
/// Created from an `Utf8Char` or a `char` through `From`.
// The remaining bytes are packed into the `u32` with the next byte in the
// least significant position. Positions without a byte hold 0xff, which
// `next()` treats as the end marker.
#[derive(Clone)]
pub struct Utf8Iterator (u32);
impl From<Utf8Char> for Utf8Iterator {
    /// Pack the used bytes into the iterator state and fill the rest with 0xff.
    fn from(uc: Utf8Char) -> Self {
        let (bytes, _) = uc.to_array();
        let packed = u32::from_le_bytes(bytes);
        // uses u64 because shifting an u32 by 32 bits is a no-op.
        let filler = (u64::MAX << (uc.len() as u64*8)) as u32;
        Utf8Iterator(packed | filler)
    }
}
impl From<char> for Utf8Iterator {
    /// Encode the `char` as UTF-8 and iterate over the resulting bytes.
    fn from(c: char) -> Self {
        let encoded = Utf8Char::from(c);
        encoded.into()
    }
}
impl Iterator for Utf8Iterator {
    type Item = u8;
    /// Return the next byte, or `None` once the 0xff end marker is reached.
    fn next(&mut self) -> Option<u8> {
        match self.0 as u8 {
            0xff => None,
            byte => {
                // shift in another end marker from the top
                self.0 = (self.0 >> 8) | 0xff_00_00_00;
                Some(byte)
            },
        }
    }
    /// The exact number of remaining bytes.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }
}
impl ExactSizeIterator for Utf8Iterator {
    /// Count the bytes that are not end markers.
    fn len(&self) -> usize {// not straightforward, but possible
        // Exhausted positions are 0xff, so after inverting they show up
        // as leading zero bytes.
        let inverted = !self.0;
        4 - (inverted.leading_zeros() / 8) as usize
    }
}
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Copy as many remaining bytes as fit into `buf`.
    ///
    /// Always returns Ok
    fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
        let mut written = 0;
        // Only call self.next() when there is room for the result,
        // otherwise a byte would be lost.
        while written < buf.len() {
            match self.next() {
                Some(byte) => buf[written] = byte,
                None => break,
            }
            written += 1;
        }
        Ok(written)
    }
}
impl fmt::Debug for Utf8Iterator {
    /// Show the bytes that have not been returned yet as a slice.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut remaining = [0u8; 4];
        let mut count = 0;
        // iterate over a copy so that formatting doesn't consume anything
        for (slot, byte) in remaining.iter_mut().zip(self.clone()) {
            *slot = byte;
            count += 1;
        }
        write!(fmtr, "{:?}", &remaining[..count])
    }
}
/// Converts an iterator of `Utf8Char` (or `&Utf8Char`)
/// to an iterator of `u8`s.
///
/// Is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator,
/// but the returned iterator is ~40% faster.
///
/// The iterator also implements `Read` (if the `std` feature isn't disabled).
/// Reading will never produce an error, and calls to `.read()` and `.next()`
/// can be mixed.
///
/// The exact number of bytes cannot be known in advance, but `size_hint()`
/// gives the possible range.
/// (min: all remaining characters are ASCII, max: all require four bytes)
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{IterExt, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf8() );
/// let mut bytes = [0; 4];
/// iterator.to_bytes().zip(&mut bytes).for_each(|(b,dst)| *dst = b );
/// assert_eq!(&bytes, b"foo\0");
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt, Utf8Char};
///
/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect();
/// let bytes: Vec<u8> = chars.iter().to_bytes().collect();
/// let flat_map: Vec<u8> = chars.iter().cloned().flatten().collect();
/// assert_eq!(bytes, flat_map);
/// ```
///
/// `Read`ing from it:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{IterExt, CharExt};
/// use std::io::Read;
///
/// let s = "Ååh‽";
/// assert_eq!(s.len(), 8);
/// let mut buf = [b'E'; 9];
/// let mut reader = s.chars().map(|c| c.to_utf8() ).to_bytes();
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
/// assert_eq!(&buf[..8], s.as_bytes());
/// assert_eq!(buf[8], b'E');
/// ```
#[derive(Clone)]
pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
// The source of characters.
inner: I,
// Bytes of the most recently fetched character that have not been
// returned yet, with the next byte in the least significant position.
// Zero when nothing is pending.
prev: u32,
}
impl<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
From<I> for Utf8CharSplitter<U,I::IntoIter> {
    /// Wrap anything iterable over `Utf8Char`s, with no pending bytes.
    fn from(iterable: I) -> Self {
        let inner = iterable.into_iter();
        Utf8CharSplitter { inner, prev: 0 }
    }
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
/// Extracts the source iterator.
///
/// Note that `iter.into_inner().to_bytes()` is not a no-op:
/// If the last returned byte from `next()` was not an ASCII character,
/// the remaining bytes of that codepoint is lost.
pub fn into_inner(self) -> I {
self.inner
}
}
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
    type Item = u8;
    /// Return the next pending byte, or the first byte of the next character.
    fn next(&mut self) -> Option<Self::Item> {
        if self.prev != 0 {
            // still inside a multi-byte character
            let byte = self.prev as u8;
            self.prev >>= 8;
            return Some(byte);
        }
        let u8c = self.inner.next()?;
        let array = u8c.borrow().to_array().0;
        // keep the bytes after the first one for the following calls
        self.prev = u32::from_le_bytes(array) >> 8;
        Some(array[0])
    }
    /// Between one and four bytes per remaining character, plus the pending bytes.
    fn size_hint(&self) -> (usize,Option<usize>) {
        // Doesn't need to handle unlikely overflows correctly because
        // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
        let pending = 4 - (self.prev.leading_zeros() / 8) as usize;
        let (fewest, most) = self.inner.size_hint();
        let lower = fewest.wrapping_add(pending);
        let upper = most.map(|chars| chars.wrapping_mul(4).wrapping_add(pending) );
        (lower, upper)
    }
}
#[cfg(feature="std")]
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> {
    /// Fill `buf` with bytes, stopping when it is full or the source is exhausted.
    ///
    /// Always returns `Ok`
    fn read(&mut self, buf: &mut[u8]) -> Result<usize, ioError> {
        let mut filled = 0;
        // write remaining bytes of previous codepoint
        while self.prev != 0 && filled < buf.len() {
            buf[filled] = self.prev as u8;
            self.prev >>= 8;
            filled += 1;
        }
        // write whole characters
        while filled < buf.len() {
            let bytes = match self.inner.next() {
                Some(u8c) => u8c.borrow().to_array().0,
                None => break
            };
            // the number of leading ones in the first byte is the length
            // of a multi-byte sequence
            let len = if bytes[1] == 0 {
                1
            } else {
                bytes[0].not().leading_zeros() as usize
            };
            let take = len.min(buf.len() - filled);
            buf[filled..filled+take].copy_from_slice(&bytes[..take]);
            filled += take;
            if take < len {
                // the buffer is full; keep the rest for the next call
                self.prev = u32::from_le_bytes(bytes) >> (8*take);
                return Ok(filled);
            }
        }
        Ok(filled)
    }
}
/// An iterator over the `Utf8Char` of a string slice, and their positions.
///
/// This struct is created by the `utf8char_indices()` method from [`StrExt`](../trait.StrExt.html)
/// trait. See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a>{
// The source string; `next_back()` shortens it from the end.
str: &'a str,
// Byte offset into `str` of the codepoint `next()` will return.
index: usize,
}
impl<'a> From<&'a str> for Utf8CharIndices<'a> {
    /// Start iterating at the beginning of the string.
    fn from(s: &'a str) -> Self {
        Utf8CharIndices { index: 0, str: s }
    }
}
impl<'a> Utf8CharIndices<'a> {
    /// Extract the remainder of the source `str`.
    ///
    /// # Examples
    ///
    /// ```
    /// use encode_unicode::{StrExt, Utf8Char};
    /// let mut iter = "abc".utf8char_indices();
    /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c'))));
    /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a'))));
    /// assert_eq!(iter.as_str(), "b");
    /// ```
    pub fn as_str(&self) -> &'a str {
        // everything before `index` has already been returned
        let (_, rest) = self.str.split_at(self.index);
        rest
    }
}
impl<'a> Iterator for Utf8CharIndices<'a> {
    type Item = (usize,Utf8Char);
    /// Return the next codepoint together with its byte offset.
    fn next(&mut self) -> Option<(usize,Utf8Char)> {
        // an empty remainder is the only possible error
        let (u8c, len) = Utf8Char::from_str_start(&self.str[self.index..]).ok()?;
        let start = self.index;
        self.index = start + len;
        Some((start, u8c))
    }
    /// Between a quarter of the remaining bytes (rounded up) and all of them.
    fn size_hint(&self) -> (usize,Option<usize>) {
        let remaining = self.str.len() - self.index;
        // For remaining+3 to overflow, the slice must fill all but two bytes of
        // addressable memory, and size_hint() doesn't need to be correct.
        let fewest = remaining.wrapping_add(3) / 4;
        (fewest, Some(remaining))
    }
}
impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
    /// Return the last codepoint together with its byte offset.
    fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
        if self.index >= self.str.len() {
            return None;
        }
        // walk backwards over continuation bytes (0b10xx_xxxx) to find
        // where the last codepoint starts
        let continuation = self.str.bytes().rev()
            .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 )
            .count();
        let starts = self.str.len() - (1 + continuation);
        let (head, tail) = self.str.split_at(starts);
        // Cannot refactor out the unwrap without switching to ::from_slice()
        // since slicing the str panics if not on a boundary.
        let (u8c,_) = Utf8Char::from_str_start(tail).unwrap();
        self.str = head;
        Some((starts, u8c))
    }
}
impl<'a> fmt::Debug for Utf8CharIndices<'a> {
    /// Show the current offset and the remainder of the string.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let mut tuple = fmtr.debug_tuple("Utf8CharIndices");
        tuple.field(&self.index);
        tuple.field(&self.as_str());
        tuple.finish()
    }
}
/// An iterator over the codepoints in a `str` represented as `Utf8Char`.
///
/// Wraps an `Utf8CharIndices` and drops the index from every item.
#[derive(Clone)]
pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
impl<'a> From<&'a str> for Utf8Chars<'a> {
fn from(s: &str) -> Utf8Chars {
Utf8Chars(Utf8CharIndices::from(s))
}
}
impl<'a> Utf8Chars<'a> {
/// Extract the remainder of the source `str`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf8Char};
/// let mut iter = "abc".utf8chars();
/// assert_eq!(iter.next(), Some(Utf8Char::from('a')));
/// assert_eq!(iter.next_back(), Some(Utf8Char::from('c')));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
self.0.as_str()
}
}
impl<'a> Iterator for Utf8Chars<'a> {
type Item = Utf8Char;
fn next(&mut self) -> Option<Utf8Char> {
self.0.next().map(|(_,u8c)| u8c )
}
fn size_hint(&self) -> (usize,Option<usize>) {
self.0.size_hint()
}
}
impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
    /// Return the last codepoint, without its offset.
    fn next_back(&mut self) -> Option<Utf8Char> {
        let (_, u8c) = self.0.next_back()?;
        Some(u8c)
    }
}
impl<'a> fmt::Debug for Utf8Chars<'a> {
    /// Formats as `Utf8Chars("<remaining str>")`.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        // The tuple must carry this type's own name, not the name of the
        // `Utf8CharIndices` it wraps.
        fmtr.debug_tuple("Utf8Chars")
            .field(&self.as_str())
            .finish()
    }
}