chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,44 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module provides a data structure for a space-efficient and time-efficient lookup of
//! sequences of 16-bit units (commonly but not necessarily UTF-16 code units)
//! which map to integer values.
//!
//! It is an implementation of the existing [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
//! / [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
//!
//! ## Architecture
//!
//! ICU4X [`Char16Trie`] is designed to provide a read-only view of `UCharsTrie` data that is exported from ICU4C.
//!
//! ## Examples
//!
//! ### Querying a `Char16Trie`
//!
//! ```rust
//! use icu::collections::char16trie::{Char16Trie, TrieResult};
//! use zerovec::ZeroVec;
//!
//! // A Char16Trie containing the ASCII strings "a" (mapped to 1) and "ab"
//! // (mapped to 100).
//! let trie_data = [48, 97, 176, 98, 32868];
//! let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
//!
//! let mut iter = trie.iter();
//! let res = iter.next('a');
//! assert_eq!(res, TrieResult::Intermediate(1));
//! let res = iter.next('b');
//! assert_eq!(res, TrieResult::FinalValue(100));
//! let res = iter.next('c');
//! assert_eq!(res, TrieResult::NoMatch);
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
mod trie;
pub use trie::Char16Trie;
pub use trie::Char16TrieIterator;
pub use trie::TrieResult;

View File

@@ -0,0 +1,493 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use zerofrom::ZeroFrom;
use zerovec::{ZeroSlice, ZeroVec};
// Match-node lead unit values, after masking off intermediate-value bits:
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next unit.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5;
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
const MIN_LINEAR_MATCH: u16 = 0x30;
const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10;
// Match-node lead unit bits 14..6 for the optional intermediate value.
// If these bits are 0, then there is no intermediate value.
// Otherwise, see the *NodeValue* constants below.
const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40
const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f
// A final-value node has bit 15 set.
const VALUE_IS_FINAL: u16 = 0x8000;
// Compact value: After testing and masking off bit 15 (`VALUE_IS_FINAL`), use the
// following thresholds. A lead unit below `MIN_TWO_UNIT_VALUE_LEAD` holds the whole value,
// one below `THREE_UNIT_VALUE_LEAD` is followed by one more unit, and
// `THREE_UNIT_VALUE_LEAD` itself is followed by two more units.
const MAX_ONE_UNIT_VALUE: u16 = 0x3fff;
const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000
// Intermediate (node) values use the same one/two/three-unit scheme, but the thresholds are
// compared against the full lead unit, whose low 6 bits still carry the node type.
const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff;
const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040
const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0;
const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff;
// Compact delta integers. A lead unit below `MIN_TWO_UNIT_DELTA_LEAD` is the delta itself,
// `THREE_UNIT_DELTA_LEAD` is followed by two more units, and any other lead unit is
// followed by one more unit.
const MAX_ONE_UNIT_DELTA: u16 = 0xfbff;
const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00
const THREE_UNIT_DELTA_LEAD: u16 = 0xffff;
/// Returns the position just past a compact value whose lead unit (with bit 15 already
/// masked off) is `lead`, given that `pos` is the position right after that lead unit.
fn skip_value(pos: usize, lead: u16) -> usize {
    // Number of units that follow the lead unit in the encoding.
    let trailing_units = match lead {
        l if l < MIN_TWO_UNIT_VALUE_LEAD => 0,
        l if l < THREE_UNIT_VALUE_LEAD => 1,
        _ => 2,
    };
    pos + trailing_units
}
/// Returns the position just past an intermediate (node) value whose lead unit is `lead`,
/// given that `pos` is the position right after that lead unit.
fn skip_node_value(pos: usize, lead: u16) -> usize {
    if lead >= THREE_UNIT_NODE_VALUE_LEAD {
        // Two further units hold the value.
        pos + 2
    } else if lead >= MIN_TWO_UNIT_NODE_VALUE_LEAD {
        // One further unit holds the low bits of the value.
        pos + 1
    } else {
        // The value is fully contained in the lead unit.
        pos
    }
}
/// This struct represents a de-serialized `Char16Trie` that was exported from
/// ICU binary data.
///
/// It is a read-only view over the serialized trie, which maps sequences of
/// 16-bit units to integer values. The struct only stores the serialized units;
/// all traversal state lives in the [`Char16TrieIterator`] returned by
/// [`Char16Trie::iter`].
///
/// For more information:
/// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
/// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = icu_collections::char16trie))]
#[derive(Clone, Debug, PartialEq, Eq, ZeroFrom)]
pub struct Char16Trie<'data> {
/// The serialized trie, as a sequence of `u16` units.
#[cfg_attr(feature = "serde", serde(borrow))]
#[doc(hidden)] // #2417
pub data: ZeroVec<'data, u16>,
}
impl<'data> Char16Trie<'data> {
/// Returns a new [`Char16Trie`] with ownership of the provided data.
#[inline]
pub fn new(data: ZeroVec<'data, u16>) -> Self {
Self { data }
}
/// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data
#[inline]
pub fn iter(&self) -> Char16TrieIterator<'_> {
Char16TrieIterator::new(&self.data)
}
}
/// This struct represents an iterator over a [`Char16Trie`].
///
/// It is not a [`core::iter::Iterator`]: the caller feeds it one input unit or code point
/// at a time (`next`, `next32`, `next16`) and receives a [`TrieResult`] for the sequence
/// consumed so far.
#[derive(Clone)]
pub struct Char16TrieIterator<'a> {
/// A reference to the Char16Trie data to iterate over.
trie: &'a ZeroSlice<u16>,
/// Index of next trie unit to read, or `None` if there are no more matches.
/// Once this is `None`, `next16` returns `TrieResult::NoMatch` without reading the trie.
pos: Option<usize>,
/// Remaining length of a linear-match node, minus 1, or `None` if not in
/// such a node.
remaining_match_length: Option<usize>,
}
/// An enum representing the return value from a lookup in [`Char16Trie`].
///
/// Equality is total (every payload is an `i32`), so the type implements [`Eq`] in
/// addition to [`PartialEq`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TrieResult {
    /// The input unit(s) did not continue a matching string.
    /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()`
    /// will also return `TrieResult::NoMatch`.
    NoMatch,
    /// The input unit(s) matched a string but there is no value for the string
    /// so far. (It is a prefix of a longer string.)
    NoValue,
    /// The input unit(s) continued a matching string and there is a value for
    /// the string so far. No further input unit can continue a matching
    /// string.
    FinalValue(i32),
    /// The input unit(s) continued a matching string and there is a value for
    /// the string so far. Another input unit can continue a matching
    /// string.
    Intermediate(i32),
}
/// Computes the UTF-16 lead surrogate (0xd800..=0xdbff) of a supplementary code point.
///
/// `supplementary` is expected to be in the range U+10000..=U+10ffff; other inputs are not
/// checked and yield a truncated, meaningless unit.
fn u16_lead(supplementary: i32) -> u16 {
    // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the 0x10000 bias into the surrogate base.
    let high_bits = supplementary >> 10;
    (high_bits + 0xd7c0) as u16
}
/// Computes the UTF-16 trail surrogate (0xdc00..=0xdfff) of a supplementary code point.
///
/// `supplementary` is expected to be in the range U+10000..=U+10ffff; only its low
/// 10 bits contribute to the result.
fn u16_tail(supplementary: i32) -> u16 {
    let low_bits = supplementary & 0x3ff;
    (low_bits | 0xdc00) as u16
}
/// Evaluates to the value inside the given `Option`, or makes the *enclosing function*
/// return `TrieResult::NoMatch` when the option is `None`.
///
/// A `None` is considered unexpected (it indicates malformed trie data), so debug builds
/// additionally trip a `debug_assert!`.
/// This could perhaps be done with `std::ops::Try` once stabilized.
macro_rules! trie_unwrap {
    ($option:expr) => {
        if let Some(value) = $option {
            value
        } else {
            // Unexpected
            debug_assert!(false);
            return TrieResult::NoMatch;
        }
    };
}
impl<'a> Char16TrieIterator<'a> {
/// Creates a [`Char16TrieIterator`] over the borrowed `trie` units, positioned at the
/// first unit and not inside any linear-match node.
#[inline]
pub fn new(trie: &'a ZeroSlice<u16>) -> Self {
    let start = Some(0);
    Self {
        remaining_match_length: None,
        pos: start,
        trie,
    }
}
/// Traverses the trie from the current state for this input char.
///
/// # Examples
///
/// ```
/// use icu::collections::char16trie::{Char16Trie, TrieResult};
/// use zerovec::ZeroVec;
///
/// // A Char16Trie containing the ASCII characters 'a' and 'b'.
/// let trie_data = [48, 97, 176, 98, 32868];
/// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
///
/// let mut iter = trie.iter();
/// let res = iter.next('a');
/// assert_eq!(res, TrieResult::Intermediate(1));
/// let res = iter.next('b');
/// assert_eq!(res, TrieResult::FinalValue(100));
/// let res = iter.next('c');
/// assert_eq!(res, TrieResult::NoMatch);
/// ```
pub fn next(&mut self, c: char) -> TrieResult {
if (c as u32) <= 0xffff {
self.next16(c as u16)
} else {
match self.next16(u16_lead(c as i32)) {
TrieResult::NoValue | TrieResult::Intermediate(_) => {
self.next16(u16_tail(c as i32))
}
_ => TrieResult::NoMatch,
}
}
}
/// Traverses the trie from the current state for this input code point.
///
/// Code points up to U+FFFF are consumed as one 16-bit unit; supplementary code points
/// (U+10000..=U+10FFFF) are consumed as a lead/trail surrogate pair. Values above
/// U+10FFFF are not code points and have no surrogate-pair encoding, so they never match:
/// the iterator is stopped and `TrieResult::NoMatch` is returned.
///
/// # Examples
///
/// ```
/// use icu::collections::char16trie::{Char16Trie, TrieResult};
/// use zerovec::ZeroVec;
///
/// // A Char16Trie mapping "a" to 1 and "ab" to 100.
/// let trie_data = [48, 97, 176, 98, 32868];
/// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
///
/// let mut iter = trie.iter();
/// let res = iter.next32('a' as u32);
/// assert_eq!(res, TrieResult::Intermediate(1));
/// let res = iter.next32('b' as u32);
/// assert_eq!(res, TrieResult::FinalValue(100));
/// let res = iter.next32('c' as u32);
/// assert_eq!(res, TrieResult::NoMatch);
///
/// // Values beyond U+10FFFF never match.
/// assert_eq!(trie.iter().next32(0x110000), TrieResult::NoMatch);
/// ```
pub fn next32(&mut self, c: u32) -> TrieResult {
    if c <= 0xffff {
        return self.next16(c as u16);
    }
    if c > 0x10ffff {
        // The surrogate helpers are only defined for U+10000..=U+10FFFF. Feeding them a
        // larger value would truncate to arbitrary 16-bit units that could alias real
        // entries in the trie, so reject the input outright.
        self.stop();
        return TrieResult::NoMatch;
    }
    match self.next16(u16_lead(c as i32)) {
        // The lead surrogate can be continued: the trail surrogate decides the result.
        TrieResult::NoValue | TrieResult::Intermediate(_) => self.next16(u16_tail(c as i32)),
        // The lead surrogate did not match, or nothing can follow it.
        TrieResult::NoMatch | TrieResult::FinalValue(_) => TrieResult::NoMatch,
    }
}
/// Traverses the trie from the current state for this input 16-bit unit.
///
/// Returns `TrieResult::NoMatch` immediately if the iterator has already been stopped.
///
/// # Examples
///
/// ```
/// use icu::collections::char16trie::{Char16Trie, TrieResult};
/// use zerovec::ZeroVec;
///
/// // A Char16Trie mapping "a" to 1 and "ab" to 100.
/// let trie_data = [48, 97, 176, 98, 32868];
/// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
///
/// let mut iter = trie.iter();
/// let res = iter.next16('a' as u16);
/// assert_eq!(res, TrieResult::Intermediate(1));
/// let res = iter.next16('b' as u16);
/// assert_eq!(res, TrieResult::FinalValue(100));
/// let res = iter.next16('c' as u16);
/// assert_eq!(res, TrieResult::NoMatch);
/// ```
pub fn next16(&mut self, c: u16) -> TrieResult {
    let current = match self.pos {
        None => return TrieResult::NoMatch,
        Some(index) => index,
    };
    let remaining = match self.remaining_match_length {
        Some(remaining) => remaining,
        // Not inside a linear-match node: interpret the node at the current position.
        None => return self.next_impl(current, c),
    };

    // Remaining part of a linear-match node
    let expected = trie_unwrap!(self.trie.get(current));
    if c != expected {
        self.stop();
        return TrieResult::NoMatch;
    }

    let following = current + 1;
    self.pos = Some(following);
    if remaining > 0 {
        // More units of this linear-match node are still to be matched.
        self.remaining_match_length = Some(remaining - 1);
        return TrieResult::NoValue;
    }

    // That was the last unit of the linear-match node; report a value if one follows.
    self.remaining_match_length = None;
    let node = trie_unwrap!(self.trie.get(following));
    if node >= MIN_VALUE_LEAD {
        self.value_result(following)
    } else {
        TrieResult::NoValue
    }
}
/// Handles a branch node: selects the sub-node for `in_unit` among the branch's entries.
///
/// `pos` is the position right after the branch node's lead unit and `length` is that lead
/// unit's value (0 means the actual length is stored in the next unit). On a match,
/// `self.pos` is updated and the result for the selected entry is returned; otherwise the
/// iterator is stopped and `TrieResult::NoMatch` is returned.
fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult {
let mut pos = pos;
let mut length = length;
if length == 0 {
// The length did not fit into the lead unit; it is stored in the following unit.
length = trie_unwrap!(self.trie.get(pos)) as usize;
pos += 1;
}
// The encoded length is one less than the number of entries.
length += 1;
// The length of the branch is the number of units to select from.
// The data structure encodes a binary search.
while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH {
if in_unit < trie_unwrap!(self.trie.get(pos)) {
// Lower half: follow the delta stored after the comparison unit.
length >>= 1;
pos = trie_unwrap!(self.jump_by_delta(pos + 1));
} else {
// Upper half: it starts right after the delta, so just skip over the delta.
length = length - (length >> 1);
pos = trie_unwrap!(self.skip_delta(pos + 1));
}
}
// Drop down to linear search for the last few units.
// length>=2 is presumed here: the loop body above sees
// length>MAX_BRANCH_LINEAR_SUB_NODE_LENGTH>=3 and divides length by 2.
// NOTE(review): this also relies on the data never encoding a branch with fewer than
// 2 entries -- confirm against the exporter.
loop {
if in_unit == trie_unwrap!(self.trie.get(pos)) {
pos += 1;
let mut node = trie_unwrap!(self.trie.get(pos));
if node & VALUE_IS_FINAL != 0 {
// The entry carries a final value; leave the position on it and report it.
self.pos = Some(pos);
return self.value_result(pos);
}
// Use the non-final value as the jump delta.
pos += 1;
if node < MIN_TWO_UNIT_VALUE_LEAD {
// One-unit delta: the lead unit is the delta.
pos += node as usize;
} else if node < THREE_UNIT_VALUE_LEAD {
// Two-unit delta: high bits from the lead unit, low 16 bits from the next unit.
// The right-hand side reads `pos` before the addition; the trailing `+= 1`
// accounts for the extra unit that was read.
pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize
| trie_unwrap!(self.trie.get(pos)) as usize;
pos += 1;
} else {
// Three-unit delta: the two units after the lead unit hold the full delta.
pos += ((trie_unwrap!(self.trie.get(pos)) as usize) << 16)
| trie_unwrap!(self.trie.get(pos + 1)) as usize;
pos += 2;
}
// `pos` now addresses the node that the delta jumped to.
node = trie_unwrap!(self.trie.get(pos));
self.pos = Some(pos);
if node >= MIN_VALUE_LEAD {
return self.value_result(pos);
}
return TrieResult::NoValue;
}
// No match on this entry: skip its comparison unit and its value, then try the next one.
length -= 1;
pos = trie_unwrap!(self.skip_value(pos + 1));
if length <= 1 {
break;
}
}
// Last entry: it has no value/delta of its own; its sub-node follows the unit directly.
if in_unit == trie_unwrap!(self.trie.get(pos)) {
pos += 1;
self.pos = Some(pos);
let node = trie_unwrap!(self.trie.get(pos));
if node >= MIN_VALUE_LEAD {
return self.value_result(pos);
}
TrieResult::NoValue
} else {
self.stop();
TrieResult::NoMatch
}
}
/// Interprets the node at `pos` and tries to continue the match with `in_unit`.
///
/// Intermediate values in front of the actual match node are skipped. A branch node is
/// delegated to `branch_next`; a linear-match node is compared against its first unit.
/// If the input cannot continue a match, the iterator is stopped and
/// `TrieResult::NoMatch` is returned.
fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult {
    let mut lead = trie_unwrap!(self.trie.get(pos));
    let mut cursor = pos + 1;
    loop {
        if lead < MIN_LINEAR_MATCH {
            // Branch node.
            return self.branch_next(cursor, lead as usize, in_unit);
        }
        if lead >= MIN_VALUE_LEAD {
            if (lead & VALUE_IS_FINAL) != 0 {
                // No further matching units.
                break;
            }
            // Skip intermediate value, then look at the node type that shares its lead unit.
            cursor = skip_node_value(cursor, lead);
            lead &= NODE_TYPE_MASK;
            continue;
        }

        // Linear-match node: match the first of length+1 units.
        let extra_units = lead - MIN_LINEAR_MATCH;
        if in_unit != trie_unwrap!(self.trie.get(cursor)) {
            // No match
            break;
        }
        let after_unit = cursor + 1;
        if extra_units != 0 {
            // The rest of the node is matched by subsequent calls to `next16`.
            self.remaining_match_length = Some(extra_units as usize - 1);
            self.pos = Some(after_unit);
            return TrieResult::NoValue;
        }
        self.remaining_match_length = None;
        self.pos = Some(after_unit);
        let following = trie_unwrap!(self.trie.get(after_unit));
        return if following >= MIN_VALUE_LEAD {
            self.value_result(after_unit)
        } else {
            TrieResult::NoValue
        };
    }
    self.stop();
    TrieResult::NoMatch
}
/// Marks the iterator as stopped by clearing the position, so that every subsequent
/// `next16` call returns `TrieResult::NoMatch` without reading the trie.
fn stop(&mut self) {
self.pos = None;
}
/// Reads the compact delta starting at `pos` and returns the position reached by jumping
/// that many units forward from the end of the delta, or `None` if the data ends early.
#[inline(always)] // 1 call site and we want the Option to go away
fn jump_by_delta(&self, pos: usize) -> Option<usize> {
    let lead = self.trie.get(pos)?;
    if lead < MIN_TWO_UNIT_DELTA_LEAD {
        // One-unit delta: the lead unit is the delta itself.
        return Some(pos + 1 + lead as usize);
    }
    if lead == THREE_UNIT_DELTA_LEAD {
        // Three-unit delta: the two following units hold the full delta.
        let high = self.trie.get(pos + 1)? as usize;
        let low = self.trie.get(pos + 2)? as usize;
        return Some(pos + ((high << 16) | low) + 3);
    }
    // Two-unit delta: high bits come from the lead unit, the low 16 bits from the next unit.
    let high = (lead - MIN_TWO_UNIT_DELTA_LEAD) as usize;
    let low = self.trie.get(pos + 1)? as usize;
    Some(pos + ((high << 16) | low) + 2)
}
#[inline(always)] // 1 call site and we want the Option to go away
fn skip_value(&self, pos: usize) -> Option<usize> {
let lead_unit = self.trie.get(pos)?;
Some(skip_value(pos + 1, lead_unit & 0x7fff))
}
/// Returns the position just past the compact delta that starts at `pos`, without
/// following it, or `None` if `pos` is outside the trie data.
#[inline(always)] // 1 call site and we want the Option to go away
fn skip_delta(&self, pos: usize) -> Option<usize> {
    let lead = self.trie.get(pos)?;
    // Total number of units occupied by the delta, including its lead unit.
    let encoded_len = if lead == THREE_UNIT_DELTA_LEAD {
        3
    } else if lead >= MIN_TWO_UNIT_DELTA_LEAD {
        2
    } else {
        1
    };
    Some(pos + encoded_len)
}
/// Reads the value stored at `pos` and wraps it in the matching [`TrieResult`] variant.
///
/// If the value cannot be read (unexpected; the data ended early), debug builds assert
/// and release builds return `TrieResult::NoMatch`.
fn value_result(&self, pos: usize) -> TrieResult {
    if let Some(result) = self.get_value(pos) {
        result
    } else {
        // Unexpected
        debug_assert!(false);
        TrieResult::NoMatch
    }
}
/// Decodes the value whose lead unit is at `pos`.
///
/// A lead unit with bit 15 set yields `TrieResult::FinalValue`, any other lead unit yields
/// `TrieResult::Intermediate`. Returns `None` if the data ends before the value does.
#[inline(always)] // 1 call site and we want the Option to go away
fn get_value(&self, pos: usize) -> Option<TrieResult> {
    let lead = self.trie.get(pos)?;
    let payload_pos = pos + 1;
    if lead & VALUE_IS_FINAL != 0 {
        // Strip the final-value flag before decoding the compact value.
        let value = self.read_value(payload_pos, lead & !VALUE_IS_FINAL)?;
        Some(TrieResult::FinalValue(value))
    } else {
        let value = self.read_node_value(payload_pos, lead)?;
        Some(TrieResult::Intermediate(value))
    }
}
/// Decodes a compact value from its lead unit (`lead_unit`, with bit 15 already masked
/// off) and the units starting at `pos`, the position right after the lead unit.
///
/// Returns `None` if a required trailing unit lies outside the trie data.
#[inline(always)] // 1 call site and we want the Option to go away
fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
    if lead_unit < MIN_TWO_UNIT_VALUE_LEAD {
        // One-unit value: the lead unit is the value.
        return Some(i32::from(lead_unit));
    }
    let (high, low) = if lead_unit < THREE_UNIT_VALUE_LEAD {
        // Two-unit value: high bits in the lead unit, low 16 bits in the next unit.
        (lead_unit - MIN_TWO_UNIT_VALUE_LEAD, self.trie.get(pos)?)
    } else {
        // Three-unit value: both halves are stored after the lead unit.
        (self.trie.get(pos)?, self.trie.get(pos + 1)?)
    };
    Some((i32::from(high) << 16) | i32::from(low))
}
/// Decodes an intermediate (node) value from its full lead unit `lead_unit` and the units
/// starting at `pos`, the position right after the lead unit.
///
/// Callers only invoke this for lead units of at least `MIN_VALUE_LEAD`. Returns `None` if
/// a required trailing unit lies outside the trie data.
#[inline(always)] // 1 call site and we want the Option to go away
fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
    if lead_unit < MIN_TWO_UNIT_NODE_VALUE_LEAD {
        // One-unit value: stored (plus one) in the bits above the 6 node-type bits.
        let one_unit = (lead_unit >> 6) - 1;
        return Some(i32::from(one_unit));
    }
    if lead_unit >= THREE_UNIT_NODE_VALUE_LEAD {
        // Three-unit value: both halves are stored after the lead unit.
        let high = i32::from(self.trie.get(pos)?);
        let low = i32::from(self.trie.get(pos + 1)?);
        return Some((high << 16) | low);
    }
    // Two-unit value: high bits in the lead unit (node-type bits masked off), low 16 bits
    // in the next unit.
    let high = i32::from((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD);
    let low = i32::from(self.trie.get(pos)?);
    Some((high << 10) | low)
}
}

View File

@@ -0,0 +1,986 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use alloc::vec;
use alloc::vec::Vec;
use core::{char, cmp::Ordering, ops::RangeBounds};
use potential_utf::PotentialCodePoint;
use crate::codepointinvlist::{utils::deconstruct_range, CodePointInversionList};
use zerovec::{ule::AsULE, ZeroVec};
/// A builder for [`CodePointInversionList`].
///
/// Accumulates a set of code points through its add/remove/retain/complement methods and
/// converts the result into a [`CodePointInversionList`] with `build`.
#[derive(Default)]
pub struct CodePointInversionListBuilder {
// A sorted list of even length, with values <= char::MAX + 1.
// Consecutive pairs are the start (inclusive) and end (exclusive) of a range in the set.
intervals: Vec<u32>,
}
impl CodePointInversionListBuilder {
/// Returns a [`CodePointInversionListBuilder`] that contains no code points.
pub const fn new() -> Self {
    Self {
        intervals: Vec::new(),
    }
}
/// Consumes the [`CodePointInversionListBuilder`] and returns the
/// [`CodePointInversionList`] holding the accumulated code points.
pub fn build(self) -> CodePointInversionList<'static> {
    let endpoints = self
        .intervals
        .into_iter()
        .map(PotentialCodePoint::from_u24);
    let inv_list: ZeroVec<PotentialCodePoint> = endpoints.collect();
    #[expect(clippy::unwrap_used)] // by invariant
    CodePointInversionList::try_from_inversion_list(inv_list).unwrap()
}
/// Abstraction for adding/removing a range from start..end
///
/// If add is true add, else remove
///
/// Does nothing when the range is empty (`start >= end`) or when `end` exceeds
/// `char::MAX as u32 + 1`.
fn add_remove_middle(&mut self, start: u32, end: u32, add: bool) {
if start >= end || end > char::MAX as u32 + 1 {
return;
}
// Locate both endpoints: `Ok(i)` means the value already is the endpoint at index `i`,
// `Err(i)` is the index at which it would have to be inserted to keep the list sorted.
let start_res = self.intervals.binary_search(&start);
let end_res = self.intervals.binary_search(&end);
let mut start_ind = start_res.unwrap_or_else(|x| x);
let mut end_ind = end_res.unwrap_or_else(|x| x);
// Endpoints are stored as (start, end) pairs, so even indices hold range starts and odd
// indices hold range ends. The parity test is inverted for removal (`add == false`).
let start_pos_check = (start_ind % 2 == 0) == add;
let end_pos_check = (end_ind % 2 == 0) == add;
let start_eq_end = start_ind == end_ind;
#[expect(clippy::indexing_slicing)] // all indices are binary search results
if start_eq_end && start_pos_check && end_res.is_err() {
// Both endpoints map to the same index, the start-side parity check passes and `end` is
// not an existing endpoint: insert the pair at that index (the spliced range is empty).
self.intervals.splice(start_ind..end_ind, [start, end]);
} else {
if start_pos_check {
// Reuse the slot at `start_ind` as the new boundary and keep it out of the drain below.
self.intervals[start_ind] = start;
start_ind += 1;
}
if end_pos_check {
if end_res.is_ok() {
// `end` coincides with an existing endpoint: include that endpoint in the drained span.
end_ind += 1;
} else {
// Reuse the slot just before the insertion point as the new `end` boundary.
end_ind -= 1;
self.intervals[end_ind] = end;
}
}
// Drop every endpoint left between the boundaries that are kept.
if start_ind < end_ind {
self.intervals.drain(start_ind..end_ind);
}
}
}
/// Add the range `start..end` (end exclusive) to the [`CodePointInversionListBuilder`]
///
/// An empty or reversed range is ignored. Adding to an empty builder simply stores the
/// pair; otherwise the endpoints are located by binary search and the affected intervals
/// are merged in place, which is `O(N)` in the number of endpoints `N` in the general case.
fn add(&mut self, start: u32, end: u32) {
    if start >= end {
        return;
    }
    if !self.intervals.is_empty() {
        self.add_remove_middle(start, end, true);
        return;
    }
    // First range ever added: nothing to merge with.
    self.intervals.extend_from_slice(&[start, end]);
}
/// Add the character to the [`CodePointInversionListBuilder`]
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_char('a');
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('a'));
/// ```
pub fn add_char(&mut self, c: char) {
    // A single character is the half-open range `c..c + 1`.
    let code_point = u32::from(c);
    self.add(code_point, code_point + 1);
}
/// Add the code point value to the [`CodePointInversionListBuilder`]
///
/// Values greater than `char::MAX` are ignored.
///
/// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
/// values, there is an important difference. A [`u32`] can take values up to
/// a very large integer value, while a [`prim@char`] in Rust is defined to be in
/// the range from 0 to the maximum valid Unicode Scalar Value.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add32(0x41);
/// let check = builder.build();
/// assert!(check.contains32(0x41));
/// ```
pub fn add32(&mut self, c: u32) {
    // we already know 0 <= c because c: u32
    if c > char::MAX as u32 {
        return;
    }
    self.add(c, c + 1);
}
/// Add the range of characters to the [`CodePointInversionListBuilder`]
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='Z');
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('A'));
/// ```
pub fn add_range(&mut self, range: impl RangeBounds<char>) {
    let (first, limit) = deconstruct_range(range);
    self.add(first, limit);
}
/// Add the range of characters, represented as u32, to the [`CodePointInversionListBuilder`]
///
/// The range is ignored if it is reversed or extends beyond `char::MAX`.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range32(0xd800..=0xdfff);
/// let check = builder.build();
/// assert!(check.contains32(0xd900));
/// ```
pub fn add_range32(&mut self, range: impl RangeBounds<u32>) {
    let (first, limit) = deconstruct_range(range);
    // Sets that include char::MAX need to allow an end value of MAX + 1
    let max_limit = char::MAX as u32 + 1;
    if first > limit || limit > max_limit {
        return;
    }
    self.add(first, limit);
}
/// Add the [`CodePointInversionList`] reference to the [`CodePointInversionListBuilder`]
///
/// Every range of `set` is added in turn, so code points already present in the builder
/// are kept.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::{
///     CodePointInversionList, CodePointInversionListBuilder,
/// };
/// let mut builder = CodePointInversionListBuilder::new();
/// let set = CodePointInversionList::try_from_u32_inversion_list_slice(&[
///     0x41, 0x4C,
/// ])
/// .unwrap();
/// builder.add_set(&set);
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('A'));
/// ```
#[expect(clippy::indexing_slicing)] // chunks
pub fn add_set(&mut self, set: &CodePointInversionList) {
    // The inversion list stores (start, end) endpoint pairs back to back.
    for pair in set.as_inversion_list().as_ule_slice().chunks(2) {
        let start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
        let end = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
        self.add(start, end);
    }
}
/// Removes the range `start..end` (end exclusive) from the [`CodePointInversionListBuilder`]
///
/// An empty or reversed range, or an empty builder, leaves everything unchanged. A range
/// covering all stored intervals clears the builder; otherwise the affected intervals are
/// located by binary search and adjusted in place, in `O(N)` for `N` endpoints.
fn remove(&mut self, start: u32, end: u32) {
    if start >= end {
        return;
    }
    let first = self.intervals.first().copied();
    let last = self.intervals.last().copied();
    // Both are `None` exactly when the builder is empty, in which case nothing is removed.
    if let (Some(first), Some(last)) = (first, last) {
        let covers_everything = start <= first && end >= last;
        if covers_everything {
            self.intervals.clear();
        } else {
            self.add_remove_middle(start, end, false);
        }
    }
}
/// Remove the character from the [`CodePointInversionListBuilder`]
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='Z');
/// builder.remove_char('A');
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('B'));
/// ```
pub fn remove_char(&mut self, c: char) {
    let code_point = u32::from(c);
    self.remove32(code_point)
}
/// See [`Self::remove_char`]
///
/// Values greater than `char::MAX` are not code points and can never be in the builder,
/// so they are ignored.
pub fn remove32(&mut self, c: u32) {
    // The guard also keeps `c + 1` from overflowing when `c == u32::MAX`.
    if c <= char::MAX as u32 {
        self.remove(c, c + 1);
    }
}
/// Remove the range of characters from the [`CodePointInversionListBuilder`]
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='Z');
/// builder.remove_range('A'..='C');
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('D'));
/// ```
pub fn remove_range(&mut self, range: impl RangeBounds<char>) {
    let (first, limit) = deconstruct_range(range);
    self.remove(first, limit);
}
/// See [`Self::remove_range`]
pub fn remove_range32(&mut self, range: impl RangeBounds<u32>) {
    let (first, limit) = deconstruct_range(range);
    self.remove(first, limit);
}
/// Remove the [`CodePointInversionList`] from the [`CodePointInversionListBuilder`]
///
/// Every range of `set` is removed in turn.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::{CodePointInversionList, CodePointInversionListBuilder};
/// let mut builder = CodePointInversionListBuilder::new();
/// let set = CodePointInversionList::try_from_u32_inversion_list_slice(&[0x41, 0x46]).unwrap();
/// builder.add_range('A'..='Z');
/// builder.remove_set(&set); // removes 'A'..='E'
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('F'));
/// ```
#[expect(clippy::indexing_slicing)] // chunks
pub fn remove_set(&mut self, set: &CodePointInversionList) {
    // The inversion list stores (start, end) endpoint pairs back to back.
    for pair in set.as_inversion_list().as_ule_slice().chunks(2) {
        let start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
        let end = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
        self.remove(start, end);
    }
}
/// Retain the specified character in the [`CodePointInversionListBuilder`] if it exists
///
/// Every other character is removed from the builder.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='Z');
/// builder.retain_char('A');
/// let set = builder.build();
/// let mut check = set.iter_chars();
/// assert_eq!(check.next(), Some('A'));
/// assert_eq!(check.next(), None);
/// ```
pub fn retain_char(&mut self, c: char) {
    let code_point = u32::from(c);
    self.retain32(code_point)
}
/// See [`Self::retain_char`]
///
/// Removes everything below `c` and everything above it. For a `c` greater than
/// `char::MAX` this leaves the builder empty.
pub fn retain32(&mut self, c: u32) {
    self.remove(0, c);
    // `c + 1` would overflow for `u32::MAX`; saturating keeps the upper range empty
    // (start >= end) in that case, which `remove` ignores.
    self.remove(c.saturating_add(1), (char::MAX as u32) + 1);
}
/// Retain the range of characters located within the [`CodePointInversionListBuilder`]
///
/// Everything below the start of the range and everything from its end onwards is removed.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='Z');
/// builder.retain_range('A'..='B');
/// let set = builder.build();
/// let mut check = set.iter_chars();
/// assert_eq!(check.next(), Some('A'));
/// assert_eq!(check.next(), Some('B'));
/// assert_eq!(check.next(), None);
/// ```
pub fn retain_range(&mut self, range: impl RangeBounds<char>) {
    let (first, limit) = deconstruct_range(range);
    let max_limit = (char::MAX as u32) + 1;
    self.remove(0, first);
    self.remove(limit, max_limit);
}
/// See [`Self::retain_range`]
pub fn retain_range32(&mut self, range: impl RangeBounds<u32>) {
    let (first, limit) = deconstruct_range(range);
    let max_limit = (char::MAX as u32) + 1;
    self.remove(0, first);
    self.remove(limit, max_limit);
}
/// Retain the elements in the specified set within the [`CodePointInversionListBuilder`]
///
/// This removes every gap between the ranges of `set`, including the gap before its first
/// range and the one after its last range.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::{
///     CodePointInversionList, CodePointInversionListBuilder,
/// };
/// let mut builder = CodePointInversionListBuilder::new();
/// let set =
///     CodePointInversionList::try_from_u32_inversion_list_slice(&[65, 70])
///         .unwrap();
/// builder.add_range('A'..='Z');
/// builder.retain_set(&set); // retains 'A'..='E'
/// let check = builder.build();
/// assert!(check.contains('A'));
/// assert!(!check.contains('G'));
/// ```
#[expect(clippy::indexing_slicing)] // chunks
pub fn retain_set(&mut self, set: &CodePointInversionList) {
    // Walk the ranges of `set`, carrying along where the current gap started.
    let last_gap_start = set
        .as_inversion_list()
        .as_ule_slice()
        .chunks(2)
        .fold(0, |gap_start, pair| {
            let kept_start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
            let kept_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
            self.remove(gap_start, kept_start);
            kept_limit
        });
    self.remove(last_gap_start, (char::MAX as u32) + 1);
}
/// Toggles the membership of every element described by `set_iter`: elements that are
/// not yet in the builder are added and elements that are already in it are removed.
/// See public functions for examples.
///
/// `set_iter` must yield sorted inversion-list endpoints. Both endpoint lists are merged
/// and endpoints occurring in both cancel out, which takes `O(B + S)` for `B` endpoints in
/// the builder and `S` endpoints in the argument.
fn complement_list(&mut self, set_iter: impl IntoIterator<Item = u32>) {
    let mut merged: Vec<u32> = Vec::new(); // the merge is not done in place
    let mut ours = self.intervals.iter().copied().peekable();
    let mut theirs = set_iter.into_iter().peekable();
    while let (Some(mine), Some(other)) = (ours.peek().copied(), theirs.peek().copied()) {
        match mine.cmp(&other) {
            Ordering::Less => {
                merged.push(mine);
                ours.next();
            }
            Ordering::Greater => {
                merged.push(other);
                theirs.next();
            }
            Ordering::Equal => {
                // Shared endpoint: it cancels out.
                ours.next();
                theirs.next();
            }
        }
    }
    // At most one of the two still has endpoints left; they are taken over unchanged.
    merged.extend(ours);
    merged.extend(theirs);
    self.intervals = merged;
}
/// Computes the complement of the builder, inverting the builder (any elements in the builder are removed,
/// while any elements not in the builder are added)
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::{
///     CodePointInversionList, CodePointInversionListBuilder,
/// };
/// let mut builder = CodePointInversionListBuilder::new();
/// let set = CodePointInversionList::try_from_u32_inversion_list_slice(&[
///     0x0,
///     0x41,
///     0x46,
///     (std::char::MAX as u32) + 1,
/// ])
/// .unwrap();
/// builder.add_set(&set);
/// builder.complement();
/// let check = builder.build();
/// assert_eq!(check.iter_chars().next(), Some('A'));
/// ```
pub fn complement(&mut self) {
    let limit = char::MAX as u32 + 1;
    if self.intervals.is_empty() {
        // The complement of the empty set is the full range.
        self.intervals.extend_from_slice(&[0, limit]);
        return;
    }
    // Toggle the boundary at 0 ...
    if self.intervals.first() == Some(&0) {
        self.intervals.remove(0);
    } else {
        self.intervals.insert(0, 0);
    }
    // ... and the boundary at the upper limit.
    if self.intervals.last() == Some(&limit) {
        self.intervals.pop();
    } else {
        self.intervals.push(limit);
    }
}
/// Complements the character in the builder, adding it if not in the builder, and removing it otherwise.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='D');
/// builder.complement_char('A');
/// builder.complement_char('E');
/// let check = builder.build();
/// assert!(check.contains('E'));
/// assert!(!check.contains('A'));
/// ```
pub fn complement_char(&mut self, c: char) {
    let code_point = u32::from(c);
    self.complement32(code_point);
}
/// See [`Self::complement_char`]
///
/// Values greater than `char::MAX` are not code points and are ignored.
pub fn complement32(&mut self, c: u32) {
    // Without the guard, endpoints above `char::MAX + 1` would enter the interval list
    // (breaking its invariant), and `c + 1` would overflow for `u32::MAX`.
    if c <= char::MAX as u32 {
        self.complement_list([c, c + 1]);
    }
}
/// Complements the range in the builder, adding any elements in the range if not in the builder, and
/// removing them otherwise.
///
/// An empty or reversed range leaves the builder unchanged.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// builder.add_range('A'..='D');
/// builder.complement_range('C'..='F');
/// let check = builder.build();
/// assert!(check.contains('F'));
/// assert!(!check.contains('C'));
/// ```
pub fn complement_range(&mut self, range: impl RangeBounds<char>) {
    let (start, end) = deconstruct_range(range);
    // The merge in `complement_list` relies on sorted endpoints; an empty or reversed
    // range would insert duplicate or out-of-order endpoints into the interval list.
    if start < end {
        self.complement_list([start, end]);
    }
}
/// See [`Self::complement_range`]
///
/// The range is ignored if it is empty, reversed, or extends beyond `char::MAX`.
pub fn complement_range32(&mut self, range: impl RangeBounds<u32>) {
    let (start, end) = deconstruct_range(range);
    // The merge in `complement_list` relies on sorted endpoints that do not exceed
    // `char::MAX + 1`; anything else would corrupt the interval list.
    if start < end && end <= char::MAX as u32 + 1 {
        self.complement_list([start, end]);
    }
}
/// Complements the set in the builder, adding any elements in the set if not in the builder, and
/// removing them otherwise.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::{
///     CodePointInversionList, CodePointInversionListBuilder,
/// };
/// let mut builder = CodePointInversionListBuilder::new();
/// let set = CodePointInversionList::try_from_u32_inversion_list_slice(&[
///     0x41, 0x46, 0x4B, 0x5A,
/// ])
/// .unwrap();
/// builder.add_range('C'..='N'); // 67 - 78
/// builder.complement_set(&set);
/// let check = builder.build();
/// assert!(check.contains('Q')); // 81
/// assert!(!check.contains('N')); // 78
/// ```
pub fn complement_set(&mut self, set: &CodePointInversionList) {
    // Feed the set's endpoints, converted to `u32`, straight into the merge.
    self.complement_list(set.as_inversion_list().iter().map(u32::from));
}
/// Returns whether the builder is empty, i.e. whether it currently holds no intervals.
///
/// # Examples
///
/// ```
/// use icu::collections::codepointinvlist::CodePointInversionListBuilder;
/// let mut builder = CodePointInversionListBuilder::new();
/// assert!(builder.is_empty());
/// builder.add_char('a');
/// assert!(!builder.is_empty());
/// ```
pub fn is_empty(&self) -> bool {
self.intervals.is_empty()
}
}
// Unit tests for `CodePointInversionListBuilder`.
//
// Most tests inspect the builder's private `intervals` field directly. As the
// expectations below show (e.g. adding 'A'..='Z' yields [0x41, 0x5B]), it holds a flat
// list of boundaries: inclusive start, exclusive end, inclusive start, ...
#[cfg(test)]
mod tests {
use super::{CodePointInversionList, CodePointInversionListBuilder};
use core::char;
// Builds a builder pre-populated with the inversion list `ex`.
// Panics (via `unwrap`) if `ex` is not accepted as an inversion list.
fn generate_tester(ex: &[u32]) -> CodePointInversionListBuilder {
let check = CodePointInversionList::try_from_u32_inversion_list_slice(ex).unwrap();
let mut builder = CodePointInversionListBuilder::new();
builder.add_set(&check);
builder
}
// ---- construction / build ----
#[test]
fn test_new() {
let ex = CodePointInversionListBuilder::new();
assert!(ex.intervals.is_empty());
}
#[test]
fn test_build() {
let mut builder = CodePointInversionListBuilder::new();
builder.add(0x41, 0x42);
let check: CodePointInversionList = builder.build();
assert_eq!(check.iter_chars().next(), Some('A'));
}
#[test]
fn test_empty_build() {
let builder = CodePointInversionListBuilder::new();
let check: CodePointInversionList = builder.build();
assert!(check.is_empty());
}
// ---- add ----
#[test]
fn test_add_to_empty() {
let mut builder = CodePointInversionListBuilder::new();
builder.add(0x0, 0xA);
assert_eq!(builder.intervals, [0x0, 0xA]);
}
// Empty (start == end) and reversed (start > end) ranges must leave the builder untouched.
#[test]
fn test_add_invalid() {
let mut builder = CodePointInversionListBuilder::new();
builder.add(0x0, 0x0);
builder.add(0x5, 0x0);
assert!(builder.intervals.is_empty());
}
#[test]
fn test_add_to_start() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x0, 0x5);
let expected = [0x0, 0x5, 0xA, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_start_overlap() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x0, 0xE);
let expected = [0x0, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_end() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x3C, 0x46);
// 60 == 0x3C and 70 == 0x46: the new range is appended unchanged.
let expected = [0xA, 0x14, 0x28, 0x32, 60, 70];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_end_overlap() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x2B, 0x46);
let expected = [0xA, 0x14, 0x28, 0x46];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_middle_no_overlap() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x19, 0x1B);
let expected = [0xA, 0x14, 0x19, 0x1B, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
// Adding a range that is already fully present is a no-op.
#[test]
fn test_add_to_middle_inside() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0xA, 0x14);
let expected = [0xA, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_middle_left_overlap() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0xF, 0x19);
let expected = [0xA, 0x19, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_middle_right_overlap() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x1E, 0x28);
let expected = [0xA, 0x14, 0x1E, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_full_encompass() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x0, 0x3C);
let expected = [0x0, 0x3C];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_to_partial_encompass() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x0, 0x23);
let expected = [0x0, 0x23, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
// The "aligned" tests add a range whose boundary coincides with an existing boundary;
// the touching ranges are expected to merge into one.
#[test]
fn test_add_aligned_front() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(5, 10);
let expected = [5, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_aligned_back() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x32, 0x37);
let expected = [0xA, 0x14, 0x28, 0x37];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_aligned_start_middle() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x14, 0x19);
let expected = [0xA, 0x19, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_aligned_end_middle() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x23, 0x28);
let expected = [0xA, 0x14, 0x23, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_aligned_in_between_end() {
let mut builder = generate_tester(&[0xA, 0x14, 0x1E, 0x28, 0x32, 0x3C]);
builder.add(0xF, 0x1E);
let expected = [0xA, 0x28, 0x32, 0x3C];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_aligned_in_between_start() {
let mut builder = generate_tester(&[0xA, 0x14, 0x1E, 0x28, 0x32, 0x3C]);
// 20 == 0x14 (an existing end boundary), 35 == 0x23 (inside the second range).
builder.add(20, 35);
let expected = [0xA, 0x28, 0x32, 0x3C];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_adjacent_ranges() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.add(0x13, 0x14);
builder.add(0x14, 0x15);
builder.add(0x15, 0x16);
let expected = [0xA, 0x16, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_codepointinversionlist() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
let check = CodePointInversionList::try_from_u32_inversion_list_slice(&[
0x5, 0xA, 0x16, 0x21, 0x2C, 0x33,
])
.unwrap();
builder.add_set(&check);
let expected = [0x5, 0x14, 0x16, 0x21, 0x28, 0x33];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_char() {
let mut builder = CodePointInversionListBuilder::new();
builder.add_char('a');
let expected = [0x61, 0x62];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_range() {
let mut builder = CodePointInversionListBuilder::new();
builder.add_range('A'..='Z');
let expected = [0x41, 0x5B];
assert_eq!(builder.intervals, expected);
}
// The `u32` variant accepts the surrogate range, which cannot be expressed with `char`.
#[test]
fn test_add_range32() {
let mut builder = CodePointInversionListBuilder::new();
builder.add_range32(0xd800..=0xdfff);
let expected = [0xd800, 0xe000];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_add_invalid_range() {
let mut builder = CodePointInversionListBuilder::new();
builder.add_range('Z'..='A');
assert!(builder.intervals.is_empty());
}
// ---- remove ----
#[test]
fn test_remove_empty() {
let mut builder = CodePointInversionListBuilder::new();
builder.remove(0x0, 0xA);
assert!(builder.intervals.is_empty());
}
#[test]
fn test_remove_entire_builder() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0xA, 0x32);
assert!(builder.intervals.is_empty());
}
#[test]
fn test_remove_entire_range() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0xA, 0x14);
let expected = [0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_partial_range_left() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0xA, 0x2B);
let expected = [0x2B, 0x32];
assert_eq!(builder.intervals, expected);
}
// Removing exactly the gap between two ranges (nothing is stored there) is a no-op.
#[test]
fn test_remove_ne_range() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0x14, 0x28);
let expected = [0xA, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_partial_range_right() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0xF, 0x37);
let expected = [0xA, 0xF];
assert_eq!(builder.intervals, expected);
}
// Removing from the interior of a range splits it in two.
#[test]
fn test_remove_middle_range() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0xC, 0x12);
let expected = [0xA, 0xC, 0x12, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_ne_middle_range() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0x19, 0x1B);
let expected = [0xA, 0x14, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_encompassed_range() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32, 70, 80]);
builder.remove(0x19, 0x37);
// 0x46 == 70 and 0x50 == 80: the last range is untouched.
let expected = [0xA, 0x14, 0x46, 0x50];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_adjacent_ranges() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.remove(0x27, 0x28);
builder.remove(0x28, 0x29);
builder.remove(0x29, 0x2A);
let expected = [0xA, 0x14, 0x2A, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_char() {
let mut builder = generate_tester(&[0x41, 0x46]);
builder.remove_char('A'); // 65
let expected = [0x42, 0x46];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_range() {
let mut builder = generate_tester(&[0x41, 0x5A]);
builder.remove_range('A'..'L'); // 65 - 76
let expected = [0x4C, 0x5A];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_remove_set() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32, 70, 80]);
let remove =
CodePointInversionList::try_from_u32_inversion_list_slice(&[0xA, 0x14, 0x2D, 0x4B])
.unwrap();
builder.remove_set(&remove);
let expected = [0x28, 0x2D, 0x4B, 0x50];
assert_eq!(builder.intervals, expected);
}
// ---- retain (keep only what is also in the argument) ----
#[test]
fn test_retain_char() {
let mut builder = generate_tester(&[0x41, 0x5A]);
builder.retain_char('A'); // 65
let expected = [0x41, 0x42];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_retain_range() {
let mut builder = generate_tester(&[0x41, 0x5A]);
builder.retain_range('C'..'F'); // 67 - 70
let expected = [0x43, 0x46];
assert_eq!(builder.intervals, expected);
}
// The builder holds 0x41..0x46 (exclusive end), so 'F' (0x46) onwards shares nothing with it.
#[test]
fn test_retain_range_empty() {
let mut builder = generate_tester(&[0x41, 0x46]);
builder.retain_range('F'..'Z');
assert!(builder.intervals.is_empty());
}
#[test]
fn test_retain_set() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32, 70, 80]);
let retain = CodePointInversionList::try_from_u32_inversion_list_slice(&[
0xE, 0x14, 0x19, 0x37, 0x4D, 0x51,
])
.unwrap();
builder.retain_set(&retain);
let expected = [0xE, 0x14, 0x28, 0x32, 0x4D, 0x50];
assert_eq!(builder.intervals, expected);
}
// ---- complement ----
// `(char::MAX as u32) + 1` is the exclusive upper boundary of the whole code point space.
#[test]
fn test_complement() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.complement();
let expected = [0x0, 0xA, 0x14, 0x28, 0x32, (char::MAX as u32) + 1];
assert_eq!(builder.intervals, expected);
}
// Complementing twice must round-trip back to the empty builder.
#[test]
fn test_complement_empty() {
let mut builder = generate_tester(&[]);
builder.complement();
let expected = [0x0, (char::MAX as u32) + 1];
assert_eq!(builder.intervals, expected);
builder.complement();
let expected: [u32; 0] = [];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_zero_max() {
let mut builder = generate_tester(&[0x0, 0xA, 0x5A, (char::MAX as u32) + 1]);
builder.complement();
let expected = [0xA, 0x5A];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_interior() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.complement_list([0xE, 0x14]);
let expected = [0xA, 0xE, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_exterior() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.complement_list([0x19, 0x23]);
let expected = [0xA, 0x14, 0x19, 0x23, 0x28, 0x32];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_larger_list() {
let mut builder = generate_tester(&[0xA, 0x14, 0x28, 0x32]);
builder.complement_list([0x1E, 0x37, 0x3C, 0x46]);
let expected = [0xA, 0x14, 0x1E, 0x28, 0x32, 0x37, 0x3C, 0x46];
assert_eq!(builder.intervals, expected);
}
// 'A' is present and gets removed; 'L' (0x4C) is absent and gets added.
#[test]
fn test_complement_char() {
let mut builder = generate_tester(&[0x41, 0x4C]); // A - K
builder.complement_char('A');
builder.complement_char('L');
let expected = [0x42, 0x4D];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_range() {
let mut builder = generate_tester(&[0x46, 0x4C]); // F - K
builder.complement_range('A'..='Z');
let expected = [0x41, 0x46, 0x4C, 0x5B];
assert_eq!(builder.intervals, expected);
}
#[test]
fn test_complement_set() {
let mut builder = generate_tester(&[0x43, 0x4E]);
let set =
CodePointInversionList::try_from_u32_inversion_list_slice(&[0x41, 0x46, 0x4B, 0x5A])
.unwrap();
builder.complement_set(&set);
let expected = [0x41, 0x43, 0x46, 0x4B, 0x4E, 0x5A];
assert_eq!(builder.intervals, expected);
}
// ---- is_empty ----
#[test]
fn test_is_empty() {
let builder = CodePointInversionListBuilder::new();
assert!(builder.is_empty());
}
}

View File

@@ -0,0 +1,180 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::iter::FromIterator;
use core::{
convert::TryFrom,
ops::{Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive},
};
use super::RangeError;
use crate::codepointinvlist::utils::deconstruct_range;
use crate::codepointinvlist::CodePointInversionList;
use crate::codepointinvlist::CodePointInversionListBuilder;
use potential_utf::PotentialCodePoint;
use zerovec::ZeroVec;
/// Shared implementation behind the `TryFrom<Range*<char>>` impls.
///
/// Deconstructs `range` into `(inclusive start, exclusive end)` and builds a two-element
/// inversion list from it.
///
/// # Errors
///
/// Returns [`RangeError`] carrying both bounds when the range is empty or reversed
/// (start is not strictly below end).
fn try_from_range<'data>(
    range: impl RangeBounds<char>,
) -> Result<CodePointInversionList<'data>, RangeError> {
    let (from, till) = deconstruct_range(range);
    // Guard clause: reject empty and reversed ranges up front.
    if from >= till {
        return Err(RangeError(from, till));
    }
    let boundaries = [from, till].map(PotentialCodePoint::from_u24);
    let inv_list: ZeroVec<PotentialCodePoint> = ZeroVec::alloc_from_slice(&boundaries);
    #[expect(clippy::unwrap_used)] // valid
    Ok(CodePointInversionList::try_from_inversion_list(inv_list).unwrap())
}
/// Converts a half-open `start..end` range of `char`s into the set of those code points.
/// Fails with [`RangeError`] when the range is empty or reversed.
impl TryFrom<Range<char>> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(bounds: Range<char>) -> Result<Self, RangeError> {
        try_from_range(bounds)
    }
}
/// Converts a `start..` range of `char`s into the set of all code points from `start` upwards.
/// Delegates to the shared range conversion, which reports failures as [`RangeError`].
impl TryFrom<RangeFrom<char>> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(bounds: RangeFrom<char>) -> Result<Self, RangeError> {
        try_from_range(bounds)
    }
}
/// Converts the unbounded range `..` into the set returned by `Self::all()`.
/// This conversion always succeeds; the error type only mirrors the sibling impls.
impl TryFrom<RangeFull> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(_full: RangeFull) -> Result<Self, RangeError> {
        let everything = Self::all();
        Ok(everything)
    }
}
/// Converts a closed `start..=end` range of `char`s into the set of those code points.
/// Fails with [`RangeError`] when the range is reversed.
impl TryFrom<RangeInclusive<char>> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(bounds: RangeInclusive<char>) -> Result<Self, RangeError> {
        try_from_range(bounds)
    }
}
/// Converts a `..end` range of `char`s into the set of all code points below `end`.
/// Fails with [`RangeError`] when the range is empty (`end` is U+0000).
impl TryFrom<RangeTo<char>> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(bounds: RangeTo<char>) -> Result<Self, RangeError> {
        try_from_range(bounds)
    }
}
/// Converts a `..=end` range of `char`s into the set of all code points up to and
/// including `end`. Delegates to the shared range conversion, which reports failures
/// as [`RangeError`].
impl TryFrom<RangeToInclusive<char>> for CodePointInversionList<'_> {
    type Error = RangeError;

    fn try_from(bounds: RangeToInclusive<char>) -> Result<Self, RangeError> {
        try_from_range(bounds)
    }
}
/// Collects inclusive `u32` ranges into one [`CodePointInversionList`] by adding each range
/// to a [`CodePointInversionListBuilder`] and building the result.
impl FromIterator<RangeInclusive<u32>> for CodePointInversionList<'_> {
    fn from_iter<I: IntoIterator<Item = RangeInclusive<u32>>>(iter: I) -> Self {
        let mut builder = CodePointInversionListBuilder::new();
        iter.into_iter()
            .for_each(|range| builder.add_range32(range));
        builder.build()
    }
}
// Unit tests for the range -> `CodePointInversionList` conversions.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codepointinvlist::CodePointInversionList;
    use core::{char, convert::TryFrom};

    #[test]
    fn test_try_from_range() {
        let check: Vec<char> = CodePointInversionList::try_from('A'..'B')
            .unwrap()
            .iter_chars()
            .collect();
        assert_eq!(vec!['A'], check);
    }

    // An empty half-open range is rejected and both bounds are reported.
    #[test]
    fn test_try_from_range_error() {
        let check = CodePointInversionList::try_from('A'..'A');
        assert!(matches!(check, Err(RangeError(65, 65))));
    }

    // A reversed half-open range is rejected as well.
    #[test]
    fn test_try_from_range_reversed_err() {
        let check = CodePointInversionList::try_from('B'..'A');
        assert!(matches!(check, Err(RangeError(66, 65))));
    }

    #[test]
    fn test_try_from_range_inclusive() {
        let check: Vec<char> = CodePointInversionList::try_from('A'..='A')
            .unwrap()
            .iter_chars()
            .collect();
        assert_eq!(vec!['A'], check);
    }

    // The inclusive end 'A' (65) becomes the exclusive end 66, so the reversed inclusive
    // range `'B'..='A'` deconstructs to the empty range 66..66.
    #[test]
    fn test_try_from_range_inclusive_err() {
        let check = CodePointInversionList::try_from('B'..='A');
        assert!(matches!(check, Err(RangeError(66, 66))));
    }

    #[test]
    fn test_try_from_range_from() {
        let uset = CodePointInversionList::try_from('A'..).unwrap();
        let check: usize = uset.size();
        let expected: usize = (char::MAX as usize) + 1 - 65;
        assert_eq!(expected, check);
    }

    #[test]
    fn test_try_from_range_to() {
        let uset = CodePointInversionList::try_from(..'A').unwrap();
        let check: usize = uset.size();
        let expected: usize = 65;
        assert_eq!(expected, check);
    }

    // `..'\0'` contains nothing, so it is rejected like any other empty range.
    #[test]
    fn test_try_from_range_to_err() {
        let check = CodePointInversionList::try_from(..(0x0 as char));
        assert!(matches!(check, Err(RangeError(0, 0))));
    }

    #[test]
    fn test_try_from_range_to_inclusive() {
        let uset = CodePointInversionList::try_from(..='A').unwrap();
        let check: usize = uset.size();
        let expected: usize = 66;
        assert_eq!(expected, check);
    }

    #[test]
    fn test_try_from_range_full() {
        let uset = CodePointInversionList::try_from(..).unwrap();
        let check: usize = uset.size();
        let expected: usize = (char::MAX as usize) + 1;
        assert_eq!(expected, check);
    }

    // Four adjacent ranges must merge into the single range 0x0..0x1_0000.
    #[test]
    fn test_from_range_iterator() {
        let ranges = [
            0..=0x3FFF,
            0x4000..=0x7FFF,
            0x8000..=0xBFFF,
            0xC000..=0xFFFF,
        ];
        let expected =
            CodePointInversionList::try_from_u32_inversion_list_slice(&[0x0, 0x1_0000]).unwrap();
        let actual = CodePointInversionList::from_iter(ranges);
        assert_eq!(expected, actual);
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,80 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module provides necessary functionality for highly efficient querying of sets of Unicode characters.
//!
//! It is an implementation of the code point portion of the existing
//! [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
//!
//! # Architecture
//! ICU4X [`CodePointInversionList`] is split up into independent levels, with [`CodePointInversionList`] representing the membership/query API,
//! and [`CodePointInversionListBuilder`] representing the builder API.
//!
//! # Examples:
//!
//! ## Creating a `CodePointInversionList`
//!
//! A [`CodePointInversionList`] can be created from a serialized [`CodePointInversionList`]
//! (represented as an [inversion list](http://userguide.icu-project.org/strings/properties)),
//! from the [`CodePointInversionListBuilder`], or from the Properties API.
//!
//! ```
//! use icu::collections::codepointinvlist::{
//! CodePointInversionList, CodePointInversionListBuilder,
//! };
//!
//! let mut builder = CodePointInversionListBuilder::new();
//! builder.add_range('A'..='Z');
//! let set: CodePointInversionList = builder.build();
//!
//! assert!(set.contains('A'));
//! ```
//!
//! ## Querying a `CodePointInversionList`
//!
//! Currently, you can check if a character/range of characters exists in the [`CodePointInversionList`], or iterate through the characters.
//!
//! ```
//! use icu::collections::codepointinvlist::{
//! CodePointInversionList, CodePointInversionListBuilder,
//! };
//!
//! let mut builder = CodePointInversionListBuilder::new();
//! builder.add_range('A'..='Z');
//! let set: CodePointInversionList = builder.build();
//!
//! assert!(set.contains('A'));
//! assert!(set.contains_range('A'..='C'));
//! assert_eq!(set.iter_chars().next(), Some('A'));
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
#[macro_use]
mod builder;
#[cfg(feature = "alloc")]
mod conversions;
mod cpinvlist;
mod utils;
#[cfg(feature = "alloc")]
pub use builder::CodePointInversionListBuilder;
pub use cpinvlist::CodePointInversionList;
pub use cpinvlist::CodePointInversionListULE;
use displaydoc::Display;
// Error returned when an inversion list is rejected as invalid.
// The payload (the offending list) and the custom `displaydoc` message exist only when
// the `alloc` feature is enabled; without it this is an empty tuple struct.
// NOTE(review): without `alloc`, displaydoc presumably derives the Display text from the
// `///` line below, so treat that line as a runtime string — confirm before rewording it.
#[derive(Display, Debug)]
/// A CodePointInversionList was constructed with an invalid inversion list
#[cfg_attr(feature = "alloc", displaydoc("Invalid set: {0:?}"))]
pub struct InvalidSetError(
#[cfg(feature = "alloc")] pub alloc::vec::Vec<potential_utf::PotentialCodePoint>,
);
/// A CodePointInversionList was constructed from an invalid range
// The two fields are the bounds that were rejected, displayed as `{0}..{1}`.
// In this module's range conversions they are the (inclusive start, exclusive end)
// pair produced by `deconstruct_range`.
#[derive(Display, Debug)]
#[displaydoc("Invalid range: {0}..{1}")]
pub struct RangeError(pub u32, pub u32);

View File

@@ -0,0 +1,118 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::{
char,
ops::{Bound::*, RangeBounds},
};
use potential_utf::PotentialCodePoint;
use zerovec::ule::AsULE;
use zerovec::ZeroVec;
/// Checks whether `inv_list_zv` is a well-formed inversion list.
///
/// An empty list is always accepted. Otherwise the list must have an even number of
/// elements, be strictly ascending, and its last element must not exceed
/// `char::MAX + 1` (i.e. `0x10FFFF + 1`).
#[expect(clippy::indexing_slicing)] // windows
pub fn is_valid_zv(inv_list_zv: &ZeroVec<'_, PotentialCodePoint>) -> bool {
    // No last element means the list is empty, which is valid by definition.
    let Some(last) = inv_list_zv.last() else {
        return true;
    };
    if inv_list_zv.len() % 2 != 0 || u32::from(last) > char::MAX as u32 + 1 {
        return false;
    }
    // Every neighbouring pair must be strictly increasing.
    inv_list_zv.as_ule_slice().windows(2).all(|pair| {
        let lower = <PotentialCodePoint as AsULE>::from_unaligned(pair[0]);
        let upper = <PotentialCodePoint as AsULE>::from_unaligned(pair[1]);
        lower < upper
    })
}
/// Returns start (inclusive) and end (exclusive) bounds of [`RangeBounds`]
///
/// - An unbounded start maps to `0`.
/// - An unbounded end maps to `char::MAX + 1`.
/// - An inclusive end is converted to the exclusive end `end + 1`.
/// - An exclusive start is converted to the inclusive start `start + 1`. The built-in
///   range syntax never produces one, but `(Bound::Excluded(a), Bound::Included(b))`
///   is a valid [`RangeBounds`] value.
///
/// No validation is performed: the returned pair may describe an empty or reversed range
/// (`start >= end`), which callers are expected to handle.
pub fn deconstruct_range<T>(range: impl RangeBounds<T>) -> (u32, u32)
where
    T: Into<u32> + Copy,
{
    let from = match range.start_bound() {
        Included(b) => (*b).into(),
        // The first value actually inside the range is the one after the excluded bound.
        Excluded(b) => {
            let excluded: u32 = (*b).into();
            excluded + 1
        }
        Unbounded => 0,
    };
    let till = match range.end_bound() {
        // Inclusive end -> exclusive end.
        Included(b) => {
            let included: u32 = (*b).into();
            included + 1
        }
        Excluded(b) => (*b).into(),
        Unbounded => (char::MAX as u32) + 1,
    };
    (from, till)
}
// Unit tests for `is_valid_zv` and `deconstruct_range`.
#[cfg(test)]
mod tests {
    use super::{deconstruct_range, is_valid_zv, PotentialCodePoint};
    use core::char;
    use zerovec::ZeroVec;

    // Converts raw `u32` boundaries into the `ZeroVec` form that `is_valid_zv` expects.
    fn make_zv(slice: &[u32]) -> ZeroVec<'_, PotentialCodePoint> {
        slice
            .iter()
            .copied()
            .map(PotentialCodePoint::from_u24)
            .collect()
    }

    #[test]
    fn test_is_valid_zv() {
        let check = make_zv(&[0x2, 0x3, 0x4, 0x5]);
        assert!(is_valid_zv(&check));
    }

    #[test]
    fn test_is_valid_zv_empty() {
        let check = make_zv(&[]);
        assert!(is_valid_zv(&check));
    }

    #[test]
    fn test_is_valid_zv_overlapping() {
        let check = make_zv(&[0x2, 0x5, 0x4, 0x6]);
        assert!(!is_valid_zv(&check));
    }

    #[test]
    fn test_is_valid_zv_out_of_order() {
        let check = make_zv(&[0x5, 0x4, 0x5, 0x6, 0x7]);
        assert!(!is_valid_zv(&check));
    }

    #[test]
    fn test_is_valid_zv_duplicate() {
        let check = make_zv(&[0x1, 0x2, 0x3, 0x3, 0x5]);
        assert!(!is_valid_zv(&check));
    }

    #[test]
    fn test_is_valid_zv_odd() {
        let check = make_zv(&[0x1, 0x2, 0x3, 0x4, 0x5]);
        assert!(!is_valid_zv(&check));
    }

    // `char::MAX + 1` is the largest allowed (exclusive) end boundary.
    #[test]
    fn test_is_valid_zv_upper_bound() {
        let check = make_zv(&[0x1, 0x2, 0x3, (char::MAX as u32) + 1]);
        assert!(is_valid_zv(&check));
    }

    // Even-length, strictly ascending list whose only defect is that its last boundary
    // lies beyond `char::MAX + 1`. The list must have even length, otherwise the test
    // would pass because of the length check rather than the range check.
    #[test]
    fn test_is_valid_zv_out_of_range() {
        let check = make_zv(&[0x1, 0x2, 0x3, (char::MAX as u32) + 2]);
        assert!(!is_valid_zv(&check));
    }

    // deconstruct_range
    #[test]
    fn test_deconstruct_range() {
        let expected = (0x41, 0x45);
        let check = deconstruct_range('A'..'E'); // Range
        assert_eq!(check, expected);
        let check = deconstruct_range('A'..='D'); // Range Inclusive
        assert_eq!(check, expected);
        let check = deconstruct_range('A'..); // Range From
        assert_eq!(check, (0x41, (char::MAX as u32) + 1));
        let check = deconstruct_range(..'A'); // Range To
        assert_eq!(check, (0x0, 0x41));
        let check = deconstruct_range(..='A'); // Range To Inclusive
        assert_eq!(check, (0x0, 0x42));
        let check = deconstruct_range::<char>(..); // Range Full
        assert_eq!(check, (0x0, (char::MAX as u32) + 1));
    }
}

View File

@@ -0,0 +1,385 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module provides functionality for querying of sets of Unicode code points and strings.
//!
//! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
//! it also maintains a list of strings in the set.
//!
//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
#[cfg(feature = "alloc")]
use crate::codepointinvlist::CodePointInversionListBuilder;
use crate::codepointinvlist::{CodePointInversionList, CodePointInversionListULE};
#[cfg(feature = "alloc")]
use alloc::string::{String, ToString};
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use displaydoc::Display;
use yoke::Yokeable;
use zerofrom::ZeroFrom;
use zerovec::{VarZeroSlice, VarZeroVec};
/// A data structure providing a concrete implementation of a set of code points and strings,
/// using an inversion list for the code points.
///
/// This is what ICU4C calls a `UnicodeSet`.
// The attributes below generate the variable-length ULE counterpart
// (`CodePointInversionListAndStringListULE`) and the serde/yoke/zerofrom impls.
#[zerovec::make_varule(CodePointInversionListAndStringListULE)]
#[zerovec::skip_derive(Ord)]
#[zerovec::derive(Debug)]
#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
#[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
// Valid to auto-derive Deserialize because the invariants are weakly held
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
pub struct CodePointInversionListAndStringList<'data> {
// The single-code-point members of the set.
#[cfg_attr(feature = "serde", serde(borrow))]
#[zerovec::varule(CodePointInversionListULE)]
cp_inv_list: CodePointInversionList<'data>,
// The string members of the set (looked up by binary search in `contains_str`).
// Invariants (weakly held):
// - no input string is length 1 (a length 1 string should be a single code point)
// - the string list is sorted
// - the elements in the string list are unique
#[cfg_attr(feature = "serde", serde(borrow))]
str_list: VarZeroVec<'data, str>,
}
// Databake support: emits Rust source that reconstructs this value.
#[cfg(feature = "databake")]
impl databake::Bake for CodePointInversionListAndStringList<'_> {
// Bakes both fields and wraps them in a call to `from_parts_unchecked`.
fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
// Register the crate that the generated path below refers to.
env.insert("icu_collections");
let cp_inv_list = self.cp_inv_list.bake(env);
let str_list = self.str_list.bake(env);
// Safe because our parts are safe.
databake::quote! {
icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list)
}
}
}
#[cfg(feature = "databake")]
impl databake::BakeSize for CodePointInversionListAndStringList<'_> {
    /// Adds up the borrowed sizes reported by the code point list and the string list.
    fn borrows_size(&self) -> usize {
        let from_code_points = self.cp_inv_list.borrows_size();
        let from_strings = self.str_list.borrows_size();
        from_code_points + from_strings
    }
}
impl<'data> CodePointInversionListAndStringList<'data> {
    /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
    /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
    ///
    /// # Errors
    ///
    /// Returns an [`InvalidStringList`] error if `str_list` violates one of its invariants:
    ///
    /// - [`InvalidStringList::InvalidStringLength`] if any string (including the last one)
    ///   has a length of exactly 1,
    /// - [`InvalidStringList::StringListNotUnique`] if two neighbouring strings are equal,
    /// - [`InvalidStringList::StringListNotSorted`] if a string sorts after its successor.
    pub fn try_from(
        cp_inv_list: CodePointInversionList<'data>,
        str_list: VarZeroVec<'data, str>,
    ) -> Result<Self, InvalidStringList> {
        // Verify invariants:
        // Do so by using the equivalent of str_list.iter().windows(2) to get
        // overlapping windows of size 2. The above putative code is not possible
        // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
        // because the non-fixed size elements necessitate at least some type
        // of allocation.
        //
        // NOTE(review): `str::len` is the UTF-8 byte length, so a single non-ASCII code
        // point (e.g. "ቔ") is not rejected here even though `contains_str` treats it as
        // a code point — confirm whether that is intended.
        {
            let mut it = str_list.iter();
            if let Some(mut x) = it.next() {
                if x.len() == 1 {
                    return Err(InvalidStringList::InvalidStringLength(
                        #[cfg(feature = "alloc")]
                        x.to_string(),
                    ));
                }
                for y in it {
                    if x == y {
                        return Err(InvalidStringList::StringListNotUnique(
                            #[cfg(feature = "alloc")]
                            x.to_string(),
                        ));
                    } else if x > y {
                        return Err(InvalidStringList::StringListNotSorted(
                            #[cfg(feature = "alloc")]
                            x.to_string(),
                            #[cfg(feature = "alloc")]
                            y.to_string(),
                        ));
                    } else if y.len() == 1 {
                        // Check the newly seen element rather than the previous one, so
                        // that the final string in the list is validated as well.
                        return Err(InvalidStringList::InvalidStringLength(
                            #[cfg(feature = "alloc")]
                            y.to_string(),
                        ));
                    }
                    // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
                    x = y;
                }
            }
        }
        Ok(CodePointInversionListAndStringList {
            cp_inv_list,
            str_list,
        })
    }

    #[doc(hidden)] // databake internal
    pub const fn from_parts_unchecked(
        cp_inv_list: CodePointInversionList<'data>,
        str_list: VarZeroVec<'data, str>,
    ) -> Self {
        // No invariant checks are performed here.
        CodePointInversionListAndStringList {
            cp_inv_list,
            str_list,
        }
    }

    /// Returns the number of elements in this set (its cardinality).
    /// Note that the elements of a set may include both individual
    /// codepoints and strings.
    pub fn size(&self) -> usize {
        self.cp_inv_list.size() + self.str_list.len()
    }

    /// Return true if this set contains multi-code point strings or the empty string.
    pub fn has_strings(&self) -> bool {
        !self.str_list.is_empty()
    }

    /// Returns whether the set contains the string `s`.
    ///
    /// A string consisting of exactly one code point is looked up among the code points;
    /// every other string (including the empty string) is looked up in the string list.
    ///
    /// # Examples
    /// ```
    /// use icu::collections::codepointinvlist::CodePointInversionList;
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
    /// use zerovec::VarZeroVec;
    ///
    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
    /// let cp_list =
    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
    /// let str_list = VarZeroVec::<str>::from(str_slice);
    ///
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
    ///
    /// assert!(cpilsl.contains_str("bmp_max"));
    /// assert!(cpilsl.contains_str(""));
    /// assert!(cpilsl.contains_str("A"));
    /// assert!(cpilsl.contains_str("ቔ")); // U+1254 ETHIOPIC SYLLABLE QHEE
    /// assert!(!cpilsl.contains_str("bazinga!"));
    /// ```
    pub fn contains_str(&self, s: &str) -> bool {
        let mut chars = s.chars();
        if let Some(first_char) = chars.next() {
            if chars.next().is_none() {
                // Exactly one code point: it lives in the inversion list, not the string list.
                return self.contains(first_char);
            }
        }
        self.str_list.binary_search(s).is_ok()
    }

    /// Returns whether the set contains the code point `cp`, given as a `u32`.
    ///
    /// # Examples
    /// ```
    /// use icu::collections::codepointinvlist::CodePointInversionList;
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
    /// use zerovec::VarZeroVec;
    ///
    /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
    /// let cp_list =
    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
    /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
    /// let str_list = VarZeroVec::<str>::from(str_slice);
    ///
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
    ///
    /// assert!(cpilsl.contains32(0));
    /// assert!(cpilsl.contains32(0x0042));
    /// assert!(!cpilsl.contains32(0x0080));
    /// ```
    pub fn contains32(&self, cp: u32) -> bool {
        self.cp_inv_list.contains32(cp)
    }

    /// Returns whether the set contains the code point `ch`.
    ///
    /// # Examples
    /// ```
    /// use icu::collections::codepointinvlist::CodePointInversionList;
    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
    /// use zerovec::VarZeroVec;
    ///
    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
    /// let cp_list =
    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
    /// let str_list = VarZeroVec::<str>::from(str_slice);
    ///
    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
    ///
    /// assert!(cpilsl.contains('A'));
    /// assert!(cpilsl.contains('ቔ')); // U+1254 ETHIOPIC SYLLABLE QHEE
    /// assert!(!cpilsl.contains('\u{1_0000}'));
    /// assert!(!cpilsl.contains('🨫')); // U+1FA2B NEUTRAL CHESS TURNED QUEEN
    /// ```
    pub fn contains(&self, ch: char) -> bool {
        self.contains32(ch as u32)
    }

    /// Access the underlying [`CodePointInversionList`].
    pub fn code_points(&self) -> &CodePointInversionList<'data> {
        &self.cp_inv_list
    }

    /// Access the contained strings.
    pub fn strings(&self) -> &VarZeroSlice<str> {
        &self.str_list
    }
}
#[cfg(feature = "alloc")]
/// ✨ *Enabled with the `alloc` Cargo feature.*
impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
    /// Builds the set from string slices: a string made of exactly one code point is added
    /// to the code point list, every other string (including the empty one) to the
    /// string list.
    fn from_iter<I>(it: I) -> Self
    where
        I: IntoIterator<Item = &'a str>,
    {
        let mut builder = CodePointInversionListBuilder::new();
        let mut strings = Vec::<&str>::new();
        for s in it {
            let mut chars = s.chars();
            // Peek at the first two code points to tell single-code-point strings apart.
            match (chars.next(), chars.next()) {
                (Some(only_char), None) => builder.add_char(only_char),
                _ => strings.push(s),
            }
        }
        // `contains_str` binary-searches the string list, so it has to be sorted; the
        // uniqueness invariant is established by removing duplicates afterwards.
        strings.sort_unstable();
        strings.dedup();
        CodePointInversionListAndStringList {
            cp_inv_list: builder.build(),
            str_list: VarZeroVec::<str>::from(&strings),
        }
    }
}
/// Custom Errors for [`CodePointInversionListAndStringList`].
// Returned by `CodePointInversionListAndStringList::try_from` when the string list
// violates one of its invariants. The `String` payloads (the offending strings) and the
// custom `displaydoc` messages exist only with the `alloc` feature enabled.
// NOTE(review): without `alloc`, displaydoc presumably derives each variant's Display
// text from its `///` line, so treat those lines as runtime strings — confirm before
// rewording them.
#[derive(Display, Debug)]
pub enum InvalidStringList {
/// A string in the string list had an invalid length
#[cfg_attr(feature = "alloc", displaydoc("Invalid string length for string: {0}"))]
InvalidStringLength(#[cfg(feature = "alloc")] String),
/// A string in the string list appears more than once
#[cfg_attr(feature = "alloc", displaydoc("String list has duplicate: {0}"))]
StringListNotUnique(#[cfg(feature = "alloc")] String),
/// Two strings in the string list compare to each other opposite of sorted order
// Payload order: (earlier element, later element) as they appeared in the list.
#[cfg_attr(
feature = "alloc",
displaydoc("Strings in string list not in sorted order: ({0}, {1})")
)]
StringListNotSorted(
#[cfg(feature = "alloc")] String,
#[cfg(feature = "alloc")] String,
),
}
// Unit tests for `CodePointInversionListAndStringList`: validation performed
// by `try_from` and the normalization performed by `from_iter`.
#[cfg(test)]
mod tests {
use super::*;
// Four single-code-point ranges plus four strings: `size()` reports 8, i.e.
// code points and strings are counted together.
#[test]
fn test_size_has_strings() {
let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"];
let str_list = VarZeroVec::<str>::from(str_slice);
let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
assert!(cpilsl.has_strings());
assert_eq!(8, cpilsl.size());
}
// The empty string is accepted as a member of the string list and counts
// towards `size()` (4 code points + 5 strings = 9).
#[test]
fn test_empty_string_allowed() {
let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
let str_list = VarZeroVec::<str>::from(str_slice);
let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
assert!(cpilsl.has_strings());
assert_eq!(9, cpilsl.size());
}
// A one-character string in the string list is rejected with
// `InvalidStringLength`.
#[test]
fn test_invalid_string() {
let cp_slice = &[0, 1];
let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
let str_slice = &["a"];
let str_list = VarZeroVec::<str>::from(str_slice);
let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
assert!(matches!(
cpilsl,
Err(InvalidStringList::InvalidStringLength(_))
));
}
// A repeated string is rejected with `StringListNotUnique`.
#[test]
fn test_invalid_string_list_has_duplicate() {
let cp_slice = &[0, 1];
let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
let str_slice = &["abc", "abc"];
let str_list = VarZeroVec::<str>::from(str_slice);
let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
assert!(matches!(
cpilsl,
Err(InvalidStringList::StringListNotUnique(_))
));
}
// Strings given in descending order are rejected with `StringListNotSorted`.
#[test]
fn test_invalid_string_list_not_sorted() {
let cp_slice = &[0, 1];
let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
let str_slice = &["xyz", "abc"];
let str_list = VarZeroVec::<str>::from(str_slice);
let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
assert!(matches!(
cpilsl,
Err(InvalidStringList::StringListNotSorted(_, _))
));
}
// `from_iter` is insensitive to input order and duplicates: both inputs
// produce equal sets. The one-character string "a" ends up in the code point
// list, leaving two strings, for a total size of 3.
#[test]
fn test_from_iter_invariants() {
let in_strs_1 = ["a", "abc", "xyz", "abc"];
let in_strs_2 = ["xyz", "abc", "a", "abc"];
let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1);
let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2);
assert_eq!(cpilsl_1, cpilsl_2);
assert!(cpilsl_1.has_strings());
assert!(cpilsl_1.contains_str("abc"));
assert!(cpilsl_1.contains_str("xyz"));
assert!(!cpilsl_1.contains_str("def"));
assert_eq!(1, cpilsl_1.cp_inv_list.size());
assert!(cpilsl_1.contains('a'));
assert!(!cpilsl_1.contains('0'));
assert!(!cpilsl_1.contains('q'));
assert_eq!(3, cpilsl_1.size());
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Custom error type(s) for the parent module.
use displaydoc::Display;
/// A custom error type for [`CodePointTrie`](super::CodePointTrie).
// The `Display` text of every variant is given explicitly by its
// `#[displaydoc(...)]` attribute.
#[derive(Copy, Clone, Display, Debug, PartialEq)]
// `non_exhaustive`: code outside this crate must match with a wildcard arm,
// which leaves room for adding variants later.
#[non_exhaustive]
pub enum Error {
/// Could not construct [`CodePointTrie`](super::CodePointTrie) from deserialized values
#[displaydoc("Could not construct CodePointTrie from deserialized values: {reason}")]
FromDeserialized {
/// Reason for inability to deserialize values.
reason: &'static str,
},
/// [`CodePointTrie`](super::CodePointTrie) must be constructed from data vector with at least one element
#[displaydoc("CodePointTrie must be constructed from data vector with at least one element")]
EmptyDataVector,
/// [`CodePointTrie`](super::CodePointTrie) must be constructed from index vector long enough to accommodate fast-path access
#[displaydoc("CodePointTrie must be constructed from index vector long enough to accommodate fast-path access")]
IndexTooShortForFastAccess,
/// [`CodePointTrie`](super::CodePointTrie) must be constructed from data vector long enough to accommodate fast-path access
#[displaydoc("CodePointTrie must be constructed from data vector long enough to accommodate fast-path access")]
DataTooShortForFastAccess,
}
// Empty impl: only the trait's default methods are used.
impl core::error::Error for Error {}

View File

@@ -0,0 +1,76 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Shift amount that relates a code point to its fast-path data block: a
/// block holds `1 << FAST_TYPE_SHIFT` entries.
pub const FAST_TYPE_SHIFT: i32 = 6;
/// Number of entries in a data block for code points below the fast limit. 64=0x40
pub const FAST_TYPE_DATA_BLOCK_LENGTH: u32 = 1 << FAST_TYPE_SHIFT;
/// Mask for getting the lower bits for the in-fast-data-block offset.
pub const FAST_TYPE_DATA_MASK: u32 = FAST_TYPE_DATA_BLOCK_LENGTH - 1;
/// Fast indexing limit for "fast"-type trie
pub const FAST_TYPE_FAST_INDEXING_MAX: u32 = 0xffff;
/// Fast indexing limit for "small"-type trie
pub const SMALL_TYPE_FAST_INDEXING_MAX: u32 = 0xfff;
/// Offset from dataLength (to be subtracted) for fetching the
/// value returned for out-of-range code points and ill-formed UTF-8/16.
pub const ERROR_VALUE_NEG_DATA_OFFSET: u32 = 1;
/// Offset from dataLength (to be subtracted) for fetching the
/// value returned for code points highStart..U+10FFFF.
pub const HIGH_VALUE_NEG_DATA_OFFSET: u32 = 2;
/// The length of the BMP index table. 1024=0x400
pub const BMP_INDEX_LENGTH: u32 = 0x10000 >> FAST_TYPE_SHIFT;
/// One past `SMALL_TYPE_FAST_INDEXING_MAX`. 4096=0x1000
// NOTE(review): presumably the exclusive upper bound of the code point range
// covered by the "small"-type fast index — confirm against the ICU4C sources.
pub const SMALL_LIMIT: u32 = 0x1000;
/// `SMALL_LIMIT >> FAST_TYPE_SHIFT`, the "small"-type counterpart of
/// `BMP_INDEX_LENGTH`. 64=0x40
pub const SMALL_INDEX_LENGTH: u32 = SMALL_LIMIT >> FAST_TYPE_SHIFT;
/// Shift size for getting the index-3 table offset.
pub const SHIFT_3: u32 = 4;
/// Shift size for getting the index-2 table offset.
pub const SHIFT_2: u32 = 5 + SHIFT_3;
/// Shift size for getting the index-1 table offset.
pub const SHIFT_1: u32 = 5 + SHIFT_2;
/// Difference between two shift sizes,
/// for getting an index-2 offset from an index-3 offset. 5=9-4
pub const SHIFT_2_3: u32 = SHIFT_2 - SHIFT_3;
/// Difference between two shift sizes,
/// for getting an index-1 offset from an index-2 offset. 5=14-9
pub const SHIFT_1_2: u32 = SHIFT_1 - SHIFT_2;
/// Number of index-1 entries for the BMP. (4)
/// This part of the index-1 table is omitted from the serialized form.
pub const OMITTED_BMP_INDEX_1_LENGTH: u32 = 0x10000 >> SHIFT_1;
/// Number of entries in an index-2 block. 32=0x20
pub const INDEX_2_BLOCK_LENGTH: u32 = 1 << SHIFT_1_2;
/// Mask for getting the lower bits for the in-index-2-block offset.
pub const INDEX_2_MASK: u32 = INDEX_2_BLOCK_LENGTH - 1;
/// Number of code points per index-2 table entry. 512=0x200
pub const CP_PER_INDEX_2_ENTRY: u32 = 1 << SHIFT_2;
/// Number of entries in an index-3 block. 32=0x20
pub const INDEX_3_BLOCK_LENGTH: u32 = 1 << SHIFT_2_3;
/// Mask for getting the lower bits for the in-index-3-block offset.
pub const INDEX_3_MASK: u32 = INDEX_3_BLOCK_LENGTH - 1;
/// Number of entries in a small data block. 16=0x10
pub const SMALL_DATA_BLOCK_LENGTH: u32 = 1 << SHIFT_3;
/// Mask for getting the lower bits for the in-small-data-block offset.
pub const SMALL_DATA_MASK: u32 = SMALL_DATA_BLOCK_LENGTH - 1;
/// The highest Unicode code point, U+10FFFF.
pub const CODE_POINT_MAX: u32 = 0x10ffff;

View File

@@ -0,0 +1,53 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module provides a data structure for time-efficient lookup of values
//! associated with code points.
//!
//! It is an implementation of the existing [ICU4C UCPTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ucptrie_8h.html)
//! / [ICU4J CodePointTrie](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/) API.
//!
//! # Architecture
//!
//! ICU4X [`CodePointTrie`] is designed to provide a read-only view of [`CodePointTrie`] data that is exported
//! from ICU4C. Detailed information about the design of the data structure can be found in the documentation
//! for the [`CodePointTrie`] struct.
//!
//! # Examples
//!
//! ## Querying a `CodePointTrie`
//!
//! ```
//! use icu::collections::codepointtrie::planes;
//! let trie = planes::get_planes_trie();
//!
//! assert_eq!(0, trie.get32(0x41)); // 'A' as u32
//! assert_eq!(0, trie.get32(0x13E0)); // 'Ꮰ' as u32
//! assert_eq!(1, trie.get32(0x10044)); // '𐁄' as u32
//! ```
//!
//! [`ICU4X`]: ../icu/index.html
mod cptrie;
mod error;
mod impl_const;
pub mod planes;
#[cfg(feature = "serde")]
pub mod toml;
#[cfg(feature = "serde")]
mod serde;
pub use cptrie::CodePointMapRange;
pub use cptrie::CodePointMapRangeIterator;
pub use cptrie::CodePointTrie;
pub use cptrie::CodePointTrieHeader;
pub use cptrie::FastCodePointTrie;
pub use cptrie::SmallCodePointTrie;
pub use cptrie::TrieType;
pub use cptrie::TrieValue;
pub use cptrie::Typed;
pub use cptrie::TypedCodePointTrie;
pub use error::Error as CodePointTrieError;

View File

@@ -0,0 +1,296 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Sample data for [`CodePointTrie`] that returns the code point's plane number.
use crate::codepointtrie::cptrie::*;
use zerovec::ZeroVec;
const INDEX_ARRAY_AS_BYTES: &[u8] = &[
0x0, 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x88, 0x2, 0x90, 0x2,
0x90, 0x2, 0x90, 0x2, 0xb0, 0x2, 0xb0, 0x2, 0xb0, 0x2, 0xb0, 0x2, 0xd0, 0x2, 0xd0, 0x2, 0xd0,
0x2, 0xd0, 0x2, 0xf0, 0x2, 0xf0, 0x2, 0xf0, 0x2, 0xf0, 0x2, 0x10, 0x3, 0x10, 0x3, 0x10, 0x3,
0x10, 0x3, 0x30, 0x3, 0x30, 0x3, 0x30, 0x3, 0x30, 0x3, 0x50, 0x3, 0x50, 0x3, 0x50, 0x3, 0x50,
0x3, 0x70, 0x3, 0x70, 0x3, 0x70, 0x3, 0x70, 0x3, 0x90, 0x3, 0x90, 0x3, 0x90, 0x3, 0x90, 0x3,
0xb0, 0x3, 0xb0, 0x3, 0xb0, 0x3, 0xb0, 0x3, 0xd0, 0x3, 0xd0, 0x3, 0xd0, 0x3, 0xd0, 0x3, 0xf0,
0x3, 0xf0, 0x3, 0xf0, 0x3, 0xf0, 0x3, 0x10, 0x4, 0x10, 0x4, 0x10, 0x4, 0x10, 0x4, 0x30, 0x4,
0x30, 0x4, 0x30, 0x4, 0x30, 0x4, 0x50, 0x4, 0x50, 0x4, 0x50, 0x4, 0x50, 0x4, 0x70, 0x4, 0x70,
0x4, 0x70, 0x4, 0x70, 0x4, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0, 0x40, 0x0, 0x50, 0x0,
0x60, 0x0, 0x70, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20,
0x0, 0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0,
0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30,
0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0, 0x0, 0x0, 0x10, 0x0, 0x20, 0x0, 0x30, 0x0,
0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80,
0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0,
0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80,
0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x80, 0x0, 0x90, 0x0, 0x90, 0x0,
0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90,
0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0,
0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90,
0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0x90, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0,
0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0,
0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0,
0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0, 0x0, 0xa0,
0x0, 0xa0, 0x0, 0xa0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0,
0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0,
0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0,
0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0, 0x0, 0xb0,
0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0,
0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0,
0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0,
0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xc0, 0x0, 0xd0, 0x0, 0xd0,
0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0,
0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0,
0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0,
0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xd0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0,
0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0,
0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0,
0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0,
0xe0, 0x0, 0xe0, 0x0, 0xe0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0,
0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0,
0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0,
0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0, 0xf0, 0x0,
0xf0, 0x0, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0,
0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1,
0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0,
0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10,
0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1,
0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10,
0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1, 0x10, 0x1,
0x10, 0x1, 0x10, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20,
0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1,
0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20,
0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1, 0x20, 0x1,
0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30,
0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1,
0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30,
0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x30, 0x1, 0x40, 0x1, 0x40, 0x1,
0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40,
0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1,
0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40,
0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1,
0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50,
0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1,
0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50, 0x1, 0x50,
0x1, 0x50, 0x1, 0x50, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1,
0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60,
0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1,
0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60, 0x1, 0x60,
0x1, 0x80, 0x0, 0x88, 0x0, 0x88, 0x0, 0x88, 0x0, 0x88, 0x0, 0x88, 0x0, 0x88, 0x0, 0x88, 0x0,
0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2,
0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0,
0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0x2,
0x0, 0x2, 0x0, 0x2, 0x0, 0x2, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8,
0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0,
0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8,
0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0, 0xa8, 0x0,
0xa8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8,
0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0,
0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8,
0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xc8, 0x0, 0xe8, 0x0,
0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8,
0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0,
0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8,
0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0xe8, 0x0, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8,
0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1,
0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8,
0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1, 0x8, 0x1,
0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28,
0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1,
0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28,
0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x28, 0x1, 0x48, 0x1, 0x48, 0x1,
0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48,
0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1,
0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48,
0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x48, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1,
0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68,
0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1,
0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68, 0x1, 0x68,
0x1, 0x68, 0x1, 0x68, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1,
0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88,
0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1,
0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88, 0x1, 0x88,
0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1,
0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8,
0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1,
0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xa8, 0x1, 0xc8, 0x1, 0xc8,
0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1,
0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8,
0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1,
0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xc8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8,
0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1,
0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8,
0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1,
0xe8, 0x1, 0xe8, 0x1, 0xe8, 0x1, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2,
0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8,
0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2,
0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x8, 0x2, 0x28, 0x2, 0x28, 0x2,
0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28,
0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2,
0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28,
0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x28, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2,
0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48,
0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2,
0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48, 0x2, 0x48,
0x2, 0x48, 0x2, 0x48, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2,
0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68,
0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2,
0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68, 0x2, 0x68,
0x2,
];
/// Build a sample [`CodePointTrie`] that maps every code point to its Unicode
/// plane number, an integer from 0 to 16 inclusive.
///
/// The resulting trie does not correspond to any Unicode property; it is
/// offered because it can be handy for testing or for other uses of
/// `CodePointTrie`. See <https://www.unicode.org/glossary/#plane>.
pub fn get_planes_trie() -> CodePointTrie<'static, u8> {
    // 8-bit data array of the trie; the index array lives in
    // `INDEX_ARRAY_AS_BYTES`.
    let data_8_array: &[u8] = &[
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb,
        0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc,
        0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd,
        0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe,
        0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
        0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0,
    ];
    #[expect(clippy::unwrap_used)] // valid bytes
    let index = ZeroVec::<u16>::parse_bytes(INDEX_ARRAY_AS_BYTES).unwrap();
    #[expect(clippy::unwrap_used)] // valid bytes
    let data = ZeroVec::<u8>::parse_bytes(data_8_array).unwrap();
    let header = CodePointTrieHeader {
        high_start: 0x100000,
        shifted12_high_start: 0x100,
        index3_null_offset: 0x2,
        data_null_offset: 0x0,
        null_value: 0x0,
        trie_type: TrieType::Small,
    };
    #[expect(clippy::unwrap_used)] // valid data
    CodePointTrie::try_new(header, index, data).unwrap()
}
#[cfg(test)]
mod tests {
use zerovec::ZeroVec;
const INDEX_ARRAY: &[u16] = &[
0, 0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0x288, 0x290, 0x290, 0x290, 0x2b0, 0x2b0, 0x2b0, 0x2b0, 0x2d0, 0x2d0, 0x2d0,
0x2d0, 0x2f0, 0x2f0, 0x2f0, 0x2f0, 0x310, 0x310, 0x310, 0x310, 0x330, 0x330, 0x330, 0x330,
0x350, 0x350, 0x350, 0x350, 0x370, 0x370, 0x370, 0x370, 0x390, 0x390, 0x390, 0x390, 0x3b0,
0x3b0, 0x3b0, 0x3b0, 0x3d0, 0x3d0, 0x3d0, 0x3d0, 0x3f0, 0x3f0, 0x3f0, 0x3f0, 0x410, 0x410,
0x410, 0x410, 0x430, 0x430, 0x430, 0x430, 0x450, 0x450, 0x450, 0x450, 0x470, 0x470, 0x470,
0x470, 0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0, 0x10, 0x20, 0x30, 0, 0x10, 0x20,
0x30, 0, 0x10, 0x20, 0x30, 0, 0x10, 0x20, 0x30, 0, 0x10, 0x20, 0x30, 0, 0x10, 0x20, 0x30,
0, 0x10, 0x20, 0x30, 0, 0x10, 0x20, 0x30, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0xa0, 0xa0, 0xa0, 0xa0,
0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0,
0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xa0, 0xb0, 0xb0,
0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0,
0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0, 0xb0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
0xc0, 0xc0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0,
0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0, 0xd0,
0xd0, 0xd0, 0xd0, 0xd0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100,
0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100,
0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100,
0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110,
0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x110,
0x110, 0x110, 0x110, 0x110, 0x110, 0x110, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120,
0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120,
0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x120, 0x130,
0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130,
0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130, 0x130,
0x130, 0x130, 0x130, 0x130, 0x130, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140,
0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140,
0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x140, 0x150, 0x150,
0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150,
0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150, 0x150,
0x150, 0x150, 0x150, 0x150, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160,
0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160,
0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x160, 0x80, 0x88, 0x88,
0x88, 0x88, 0x88, 0x88, 0x88, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8,
0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8,
0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xa8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8,
0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8,
0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xc8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8,
0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8,
0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0xe8, 0x108, 0x108,
0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108,
0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108, 0x108,
0x108, 0x108, 0x108, 0x108, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128,
0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128,
0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x128, 0x148, 0x148, 0x148,
0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148,
0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148, 0x148,
0x148, 0x148, 0x148, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168,
0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168,
0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x168, 0x188, 0x188, 0x188, 0x188,
0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188,
0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188, 0x188,
0x188, 0x188, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8,
0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8,
0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1a8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8,
0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8,
0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8, 0x1c8,
0x1c8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8,
0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8,
0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x1e8, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208,
0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208,
0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208, 0x208,
0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228,
0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x228,
0x228, 0x228, 0x228, 0x228, 0x228, 0x228, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248,
0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248,
0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x248, 0x268,
0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268,
0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268, 0x268,
0x268, 0x268, 0x268, 0x268, 0x268,
];
// The byte-literal form of the index array must parse to the same `ZeroVec`
// as the `u16` literal form.
#[test]
fn test_index_byte_array_literal() {
    let parsed_from_bytes =
        ZeroVec::<u16>::parse_bytes(super::INDEX_ARRAY_AS_BYTES).expect("infallible");
    let built_from_u16s = ZeroVec::<u16>::from_slice_or_alloc(INDEX_ARRAY);
    assert_eq!(parsed_from_bytes, built_from_u16s);
}
}

View File

@@ -0,0 +1,74 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::codepointtrie::{CodePointTrie, CodePointTrieHeader, TrieValue};
use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer};
use zerofrom::ZeroFrom;
use zerovec::ZeroVec;
/// Serde-facing mirror of [`CodePointTrie`] that carries only the `header`,
/// `index`, and `data` fields.
///
/// The `Serialize` and `Deserialize` impls for [`CodePointTrie`] in this file
/// convert through this type; field names and order here therefore define the
/// serialized form.
#[derive(Serialize, Deserialize)]
pub struct CodePointTrieSerde<'trie, T: TrieValue> {
    /// Trie header, stored by value.
    header: CodePointTrieHeader,
    /// Index array; `#[serde(borrow)]` allows it to borrow from the input.
    #[serde(borrow)]
    index: ZeroVec<'trie, u16>,
    /// Data array; `#[serde(borrow)]` allows it to borrow from the input.
    #[serde(borrow)]
    data: ZeroVec<'trie, T>,
}
impl<T: TrieValue + Serialize> Serialize for CodePointTrie<'_, T> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let ser = CodePointTrieSerde {
header: self.header,
index: ZeroFrom::zero_from(&self.index),
data: ZeroFrom::zero_from(&self.data),
};
ser.serialize(serializer)
}
}
impl<'de, 'trie, T: TrieValue + Deserialize<'de>> Deserialize<'de> for CodePointTrie<'trie, T>
where
'de: 'trie,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let de = CodePointTrieSerde::deserialize(deserializer)?;
// SAFETY:
// `validate_fields` upholds the invariants for the fields that
// fast-path access without bound checks relies on.
let error_value = match CodePointTrie::validate_fields(&de.header, &de.index, &de.data) {
Ok(v) => v,
Err(e) => {
match e {
super::CodePointTrieError::FromDeserialized { reason } => {
// Not supposed to be returned by `validate_fields`.
debug_assert!(false);
return Err(D::Error::custom(reason));
}
super::CodePointTrieError::EmptyDataVector => {
return Err(D::Error::custom("CodePointTrie must be constructed from data vector with at least one element"));
}
super::CodePointTrieError::IndexTooShortForFastAccess => {
return Err(D::Error::custom("CodePointTrie must be constructed from index vector long enough to accommodate fast-path access"));
}
super::CodePointTrieError::DataTooShortForFastAccess => {
return Err(D::Error::custom("CodePointTrie must be constructed from data vector long enough to accommodate fast-path access"));
}
}
}
};
// Field invariants upheld: Checked by `validate_fields` above.
Ok(CodePointTrie {
header: de.header,
index: de.index,
data: de.data,
error_value,
})
}
}

View File

@@ -0,0 +1,123 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Utilities for reading CodePointTrie data from TOML files.
use crate::codepointtrie::error::Error;
use crate::codepointtrie::CodePointTrie;
use crate::codepointtrie::CodePointTrieHeader;
use crate::codepointtrie::TrieType;
use crate::codepointtrie::TrieValue;
use alloc::string::String;
use alloc::vec::Vec;
use core::convert::TryFrom;
use zerovec::ZeroVec;
/// A Serde-compatible struct for reading serialized [`CodePointTrie`] TOML files
/// generated by ICU4C.
///
/// Use `TryInto` to convert [`CodePointTrieToml`] to a proper [`CodePointTrie`].
///
/// Fields marked `#[serde(skip)]` are not read from the input. Fields with a
/// `#[serde(rename = "...")]` attribute are read from the key named there.
#[derive(serde::Deserialize)]
pub struct CodePointTrieToml {
    // Skipped on deserialization.
    #[serde(skip)]
    _short_name: String,
    // Skipped on deserialization.
    #[serde(skip)]
    _long_name: String,
    // Skipped on deserialization.
    #[serde(skip)]
    _name: String,
    // Index array; exposed through `index_slice`.
    index: Vec<u16>,
    // Data array, present under one of three keys depending on element width.
    // `data_slice` checks them in the order 8, 16, 32 and returns the first
    // one that is present.
    data_8: Option<Vec<u8>>,
    data_16: Option<Vec<u16>>,
    data_32: Option<Vec<u32>>,
    // Skipped on deserialization.
    #[serde(skip)]
    _index_length: u32,
    // Skipped on deserialization.
    #[serde(skip)]
    _data_length: u32,
    // Copied verbatim into `CodePointTrieHeader::high_start`.
    #[serde(rename = "highStart")]
    high_start: u32,
    // Copied verbatim into `CodePointTrieHeader::shifted12_high_start`.
    #[serde(rename = "shifted12HighStart")]
    shifted12_high_start: u16,
    // Raw numeric trie type; converted with `TrieType::try_from` when the
    // header is built.
    #[serde(rename = "type")]
    trie_type_enum_val: u8,
    // Read from the input but not consulted by the conversions in this file.
    #[serde(rename = "valueWidth")]
    _value_width_enum_val: u8,
    // Copied verbatim into `CodePointTrieHeader::index3_null_offset`.
    #[serde(rename = "index3NullOffset")]
    index3_null_offset: u16,
    // Copied verbatim into `CodePointTrieHeader::data_null_offset`.
    #[serde(rename = "dataNullOffset")]
    data_null_offset: u32,
    // Copied verbatim into `CodePointTrieHeader::null_value`.
    #[serde(rename = "nullValue")]
    null_value: u32,
}
/// Data slice from a [`CodePointTrie`] TOML.
///
/// ICU4C exports data as either `u8`, `u16`, or `u32`, which may be converted
/// to other types as appropriate.
#[allow(clippy::exhaustive_enums)] // based on a stable serialized form
pub enum CodePointDataSlice<'a> {
    /// A serialized [`CodePointTrie`] data array of 8-bit values.
    U8(&'a [u8]),
    /// A serialized [`CodePointTrie`] data array of 16-bit values.
    U16(&'a [u16]),
    /// A serialized [`CodePointTrie`] data array of 32-bit values.
    U32(&'a [u32]),
}
impl CodePointTrieToml {
/// Gets the `index` slice.
pub fn index_slice(&self) -> &[u16] {
self.index.as_slice()
}
/// Gets the `data` slice.
pub fn data_slice(&self) -> Result<CodePointDataSlice<'_>, Error> {
if let Some(data_8) = &self.data_8 {
Ok(CodePointDataSlice::U8(data_8.as_slice()))
} else if let Some(data_16) = &self.data_16 {
Ok(CodePointDataSlice::U16(data_16.as_slice()))
} else if let Some(data_32) = &self.data_32 {
Ok(CodePointDataSlice::U32(data_32.as_slice()))
} else {
Err(Error::FromDeserialized {
reason: "Did not find data array for CodePointTrie in TOML",
})
}
}
}
impl TryFrom<&CodePointTrieToml> for CodePointTrieHeader {
type Error = Error;
fn try_from(cpt_data: &CodePointTrieToml) -> Result<Self, Self::Error> {
let trie_type_enum: TrieType = TrieType::try_from(cpt_data.trie_type_enum_val)?;
Ok(CodePointTrieHeader {
high_start: cpt_data.high_start,
shifted12_high_start: cpt_data.shifted12_high_start,
index3_null_offset: cpt_data.index3_null_offset,
data_null_offset: cpt_data.data_null_offset,
null_value: cpt_data.null_value,
trie_type: trie_type_enum,
})
}
}
impl<T: TrieValue> TryFrom<&CodePointTrieToml> for CodePointTrie<'static, T> {
type Error = Error;
fn try_from(cpt_data: &CodePointTrieToml) -> Result<CodePointTrie<'static, T>, Self::Error> {
use CodePointDataSlice::*;
let header = CodePointTrieHeader::try_from(cpt_data)?;
let index: ZeroVec<u16> = ZeroVec::alloc_from_slice(&cpt_data.index);
let data: Result<ZeroVec<'static, T>, T::TryFromU32Error> = match cpt_data.data_slice()? {
U8(s) => s.iter().map(|i| T::try_from_u32(*i as u32)).collect(),
U16(s) => s.iter().map(|i| T::try_from_u32(*i as u32)).collect(),
U32(s) => s.iter().map(|i| T::try_from_u32(*i)).collect(),
};
let data = data.map_err(|_| Error::FromDeserialized {
reason: "Could not parse data array to typed array",
})?;
CodePointTrie::<T>::try_new(header, index, data)
}
}

View File

@@ -0,0 +1,188 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::codepointtrie::CodePointMapRange;
/// This is an iterator that coalesces adjacent ranges in an iterator over code
/// point ranges
///
/// Two consecutive ranges are merged when the second starts exactly one past
/// the end of the first and both carry an equal value.
pub(crate) struct RangeListIteratorCoalescer<I, T> {
    // Underlying source of ranges.
    iter: I,
    // A range already pulled from `iter` that could not be merged into the
    // previously returned item; it seeds the next call to `next`.
    peek: Option<CodePointMapRange<T>>,
}
impl<I, T: Eq> RangeListIteratorCoalescer<I, T>
where
I: Iterator<Item = CodePointMapRange<T>>,
{
pub fn new(iter: I) -> Self {
Self { iter, peek: None }
}
}
impl<I, T: Eq> Iterator for RangeListIteratorCoalescer<I, T>
where
    I: Iterator<Item = CodePointMapRange<T>>,
{
    type Item = CodePointMapRange<T>;

    /// Returns the next coalesced range, or `None` once both the held-over
    /// range and the underlying iterator are exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // Seed with the range left over from the previous call if there is
        // one, otherwise with a fresh range; bail out when neither exists.
        let mut current = self.peek.take().or_else(|| self.iter.next())?;

        // Keep pulling ranges
        #[expect(clippy::while_let_on_iterator)]
        // can't move the iterator, also we want it to be explicit that we're not draining the iterator
        while let Some(candidate) = self.iter.next() {
            let adjacent = *candidate.range.start() == current.range.end() + 1;
            if !(adjacent && candidate.value == current.value) {
                // Gap or different value: stash the candidate for the next
                // call and hand back what has been accumulated.
                self.peek = Some(candidate);
                return Some(current);
            }
            // Contiguous with an equal value: extend the accumulated range.
            current.range = *current.range.start()..=*candidate.range.end();
        }

        // Underlying iterator ran dry; emit the accumulated range.
        Some(current)
    }
}
#[cfg(test)]
mod tests {
    use core::fmt::Debug;
    use icu::collections::codepointinvlist::CodePointInversionListBuilder;
    use icu::properties::props::{BinaryProperty, EnumeratedProperty};
    use icu::properties::{CodePointMapData, CodePointSetData};

    /// Asserts that complementing the set built from `iter_ranges` gives the
    /// same set as the one built from `iter_ranges_complemented`.
    fn test_set<P: BinaryProperty>(name: &str) {
        // Built from the plain ranges, then complemented by the builder.
        let set1 = {
            let mut direct = CodePointInversionListBuilder::new();
            for range in CodePointSetData::new::<P>().iter_ranges() {
                direct.add_range32(range);
            }
            direct.complement();
            direct.build()
        };
        // Built straight from the already-complemented ranges.
        let set2 = {
            let mut complemented = CodePointInversionListBuilder::new();
            for range in CodePointSetData::new::<P>().iter_ranges_complemented() {
                complemented.add_range32(range);
            }
            complemented.build()
        };
        assert_eq!(set1, set2, "Set {name} failed to complement correctly");
    }

    /// Same check as `test_set`, for the ranges of a map that carry `value`.
    fn test_map<T: EnumeratedProperty + Debug>(value: T, name: &str) {
        // Built from the ranges for `value`, then complemented by the builder.
        let set1 = {
            let mut direct = CodePointInversionListBuilder::new();
            for range in CodePointMapData::<T>::new().iter_ranges_for_value(value) {
                direct.add_range32(range);
            }
            direct.complement();
            direct.build()
        };
        // Built straight from the already-complemented ranges for `value`.
        let set2 = {
            let mut complemented = CodePointInversionListBuilder::new();
            for range in CodePointMapData::<T>::new().iter_ranges_for_value_complemented(value) {
                complemented.add_range32(range);
            }
            complemented.build()
        };
        assert_eq!(
            set1, set2,
            "Map {name} failed to complement correctly with value {value:?}"
        );
    }

    #[test]
    fn test_complement_sets() {
        use icu::properties::props::*;
        // Exercise the complemented range iteration across a wide spread of
        // binary properties.
        test_set::<AsciiHexDigit>("ASCII_Hex_Digit");
        test_set::<Alnum>("Alnum");
        test_set::<Alphabetic>("Alphabetic");
        test_set::<BidiControl>("Bidi_Control");
        test_set::<BidiMirrored>("Bidi_Mirrored");
        test_set::<Blank>("Blank");
        test_set::<Cased>("Cased");
        test_set::<CaseIgnorable>("Case_Ignorable");
        test_set::<FullCompositionExclusion>("Full_Composition_Exclusion");
        test_set::<ChangesWhenCasefolded>("Changes_When_Casefolded");
        test_set::<ChangesWhenCasemapped>("Changes_When_Casemapped");
        test_set::<ChangesWhenNfkcCasefolded>("Changes_When_NFKC_Casefolded");
        test_set::<ChangesWhenLowercased>("Changes_When_Lowercased");
        test_set::<ChangesWhenTitlecased>("Changes_When_Titlecased");
        test_set::<ChangesWhenUppercased>("Changes_When_Uppercased");
        test_set::<Dash>("Dash");
        test_set::<Deprecated>("Deprecated");
        test_set::<DefaultIgnorableCodePoint>("Default_Ignorable_Code_Point");
        test_set::<Diacritic>("Diacritic");
        test_set::<EmojiModifierBase>("Emoji_Modifier_Base");
        test_set::<EmojiComponent>("Emoji_Component");
        test_set::<EmojiModifier>("Emoji_Modifier");
        test_set::<Emoji>("Emoji");
        test_set::<EmojiPresentation>("Emoji_Presentation");
        test_set::<Extender>("Extender");
        test_set::<ExtendedPictographic>("Extended_Pictographic");
        test_set::<Graph>("Graph");
        test_set::<GraphemeBase>("Grapheme_Base");
        test_set::<GraphemeExtend>("Grapheme_Extend");
        test_set::<GraphemeLink>("Grapheme_Link");
        test_set::<HexDigit>("Hex_Digit");
        test_set::<Hyphen>("Hyphen");
        test_set::<IdContinue>("Id_Continue");
        test_set::<Ideographic>("Ideographic");
        test_set::<IdStart>("Id_Start");
        test_set::<IdsBinaryOperator>("Ids_Binary_Operator");
        test_set::<IdsTrinaryOperator>("Ids_Trinary_Operator");
        test_set::<JoinControl>("Join_Control");
        test_set::<LogicalOrderException>("Logical_Order_Exception");
        test_set::<Lowercase>("Lowercase");
        test_set::<Math>("Math");
        test_set::<NoncharacterCodePoint>("Noncharacter_Code_Point");
        test_set::<NfcInert>("NFC_Inert");
        test_set::<NfdInert>("NFD_Inert");
        test_set::<NfkcInert>("NFKC_Inert");
        test_set::<NfkdInert>("NFKD_Inert");
        test_set::<PatternSyntax>("Pattern_Syntax");
        test_set::<PatternWhiteSpace>("Pattern_White_Space");
        test_set::<PrependedConcatenationMark>("Prepended_Concatenation_Mark");
        test_set::<Print>("Print");
        test_set::<QuotationMark>("Quotation_Mark");
        test_set::<Radical>("Radical");
        test_set::<RegionalIndicator>("Regional_Indicator");
        test_set::<SoftDotted>("Soft_Dotted");
        test_set::<SegmentStarter>("Segment_Starter");
        test_set::<CaseSensitive>("Case_Sensitive");
        test_set::<SentenceTerminal>("Sentence_Terminal");
        test_set::<TerminalPunctuation>("Terminal_Punctuation");
        test_set::<UnifiedIdeograph>("Unified_Ideograph");
        test_set::<Uppercase>("Uppercase");
        test_set::<VariationSelector>("Variation_Selector");
        test_set::<WhiteSpace>("White_Space");
        test_set::<Xdigit>("Xdigit");
        test_set::<XidContinue>("XID_Continue");
        test_set::<XidStart>("XID_Start");
    }

    #[test]
    fn test_complement_maps() {
        use icu::properties::props::{GeneralCategory, Script};
        test_map(GeneralCategory::UppercaseLetter, "gc");
        test_map(GeneralCategory::OtherPunctuation, "gc");
        test_map(Script::Devanagari, "script");
        test_map(Script::Latin, "script");
        test_map(Script::Common, "script");
    }
}

44
vendor/icu_collections/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,44 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Efficient collections for Unicode data.
//!
//! This module is published as its own crate ([`icu_collections`](https://docs.rs/icu_collections/latest/icu_collections/))
//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
//!
//! ICU4X [`CodePointTrie`](crate::codepointtrie::CodePointTrie) provides a read-only view of `CodePointTrie` data that is exported
//! from ICU4C. Detailed information about the design of the data structure can be found in the documentation
//! for the [`CodePointTrie`](crate::codepointtrie::CodePointTrie) struct.
//!
//! ICU4X [`CodePointInversionList`](`crate::codepointinvlist::CodePointInversionList`) provides necessary functionality for highly efficient querying of sets of Unicode characters.
//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
//!
//! ICU4X [`Char16Trie`](`crate::char16trie::Char16Trie`) provides a data structure for a space-efficient and time-efficient lookup of
//! sequences of 16-bit units (commonly but not necessarily UTF-16 code units)
//! which map to integer values.
//! It is an implementation of the existing [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
//! / [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
pub mod char16trie;
pub mod codepointinvlist;
pub mod codepointinvliststringlist;
pub mod codepointtrie;
pub(crate) mod iterator_utils;