chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,29 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Intermediate bookkeeping record for one branch node while the trie is
/// being assembled.
#[derive(Debug, Clone, Copy)]
pub(crate) struct BranchMeta {
    /// The lead byte for this branch. Formerly it was required to be an ASCII byte, but now
    /// it can be any byte.
    pub ascii: u8,
    /// The size in bytes of the trie data reachable from this branch.
    pub local_length: usize,
    /// The size in bytes of this and all later sibling branches.
    pub cumulative_length: usize,
    /// The number of later sibling branches, including this.
    pub count: usize,
}
impl BranchMeta {
    /// Creates a new empty [`BranchMeta`] with every field zeroed.
    pub const fn default() -> Self {
        Self {
            ascii: 0,
            local_length: 0,
            cumulative_length: 0,
            count: 0,
        }
    }
}

128
vendor/zerotrie/src/builder/bytestr.rs vendored Normal file
View File

@@ -0,0 +1,128 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
#[cfg(feature = "serde")]
use alloc::boxed::Box;
/// A struct transparent over `[u8]` with convenient helper functions.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct ByteStr([u8]);

impl ByteStr {
    /// Reinterprets `(&[u8], usize)` tuples as `(&ByteStr, usize)` tuples (zero-cost).
    pub const fn from_byte_slice_with_value<'a, 'l>(
        input: &'l [(&'a [u8], usize)],
    ) -> &'l [(&'a ByteStr, usize)] {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }
    /// Reinterprets `(&str, usize)` tuples as `(&ByteStr, usize)` tuples (zero-cost).
    pub const fn from_str_slice_with_value<'a, 'l>(
        input: &'l [(&'a str, usize)],
    ) -> &'l [(&'a ByteStr, usize)] {
        // Safety: str and ByteStr have the same layout, and ByteStr is less restrictive
        unsafe { core::mem::transmute(input) }
    }
    /// Wraps a byte slice as a [`ByteStr`] (zero-cost).
    pub fn from_bytes(input: &[u8]) -> &Self {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }
    /// Wraps a boxed byte slice as a boxed [`ByteStr`] (zero-cost).
    #[cfg(feature = "serde")]
    pub fn from_boxed_bytes(input: Box<[u8]>) -> Box<Self> {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }
    /// Wraps a `str`'s UTF-8 bytes as a [`ByteStr`].
    #[allow(dead_code)] // may want this in the future
    pub fn from_str(input: &str) -> &Self {
        Self::from_bytes(input.as_bytes())
    }
    /// Returns the empty [`ByteStr`].
    #[allow(dead_code)] // may want this in the future
    pub fn empty() -> &'static Self {
        Self::from_bytes(b"")
    }
    /// Borrows the underlying bytes.
    #[allow(dead_code)] // not used in all features
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }
    /// Returns the length in bytes.
    pub const fn len(&self) -> usize {
        self.0.len()
    }
    /// Returns whether every byte is ASCII (less than 0x80).
    #[allow(dead_code)] // not used in all features
    pub fn is_all_ascii(&self) -> bool {
        self.0.iter().all(u8::is_ascii)
    }
    /// Returns the byte at `index`, or `None` if out of range.
    #[allow(dead_code)] // may want this in the future
    pub(crate) fn byte_at(&self, index: usize) -> Option<u8> {
        self.0.get(index).map(|b| *b)
    }
    /// Returns the byte at the given index, panicking if out of bounds.
    #[allow(clippy::indexing_slicing)] // "panic" is in method name
    pub(crate) const fn byte_at_or_panic(&self, index: usize) -> u8 {
        self.0[index]
    }
    /// Const function to evaluate `self < other` in lexicographic byte order.
    #[allow(clippy::indexing_slicing)] // in-range loop conditions
    pub(crate) const fn is_less_then(&self, other: &Self) -> bool {
        // Walk the shared prefix; the first differing byte decides.
        let mut idx = 0;
        while idx < self.len() && idx < other.len() {
            let (a, b) = (self.0[idx], other.0[idx]);
            if a != b {
                return a < b;
            }
            idx += 1;
        }
        // One string is a prefix of the other: the shorter one sorts first.
        self.len() < other.len()
    }
    /// Const function to evaluate `self[..prefix_len] == other[..prefix_len]`
    ///
    /// # Panics
    ///
    /// Panics if `prefix_len` is longer than either this string or the other string
    #[allow(clippy::indexing_slicing)] // in-range loop conditions
    pub(crate) const fn prefix_eq(&self, other: &ByteStr, prefix_len: usize) -> bool {
        assert!(prefix_len <= self.len());
        assert!(prefix_len <= other.len());
        let mut idx = 0;
        while idx < prefix_len {
            if self.0[idx] != other.0[idx] {
                return false;
            }
            idx += 1;
        }
        true
    }
}
impl Borrow<[u8]> for ByteStr {
    /// Exposes the transparent inner byte slice, e.g. for keyed lookups on `[u8]`.
    fn borrow(&self) -> &[u8] {
        &self.0
    }
}
#[cfg(feature = "alloc")]
impl Borrow<[u8]> for alloc::boxed::Box<ByteStr> {
    /// Allows a boxed `ByteStr` to be borrowed as a plain byte slice,
    /// delegating to [`ByteStr::as_bytes`].
    fn borrow(&self) -> &[u8] {
        self.as_bytes()
    }
}

View File

@@ -0,0 +1,338 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::super::branch_meta::BranchMeta;
use super::super::bytestr::ByteStr;
use super::store::const_for_each;
use super::store::ConstArrayBuilder;
use super::store::ConstLengthsStack;
use super::store::ConstSlice;
use crate::error::ZeroTrieBuildError;
use crate::varint;
/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
///
/// All methods that grow the trie will panic if the capacity N is not enough.
pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
    /// Backing byte buffer. The trie is built back-to-front by prepending,
    /// so this fills from the end of the array toward the front.
    data: ConstArrayBuilder<N, u8>,
}
impl<const N: usize> ZeroTrieBuilderConst<N> {
    /// Non-const function that returns the current trie data as a slice.
    #[cfg(feature = "litemap")]
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_const_slice().as_slice()
    }
    /// Returns the trie data, panicking if the buffer is the wrong size.
    pub const fn build_or_panic(self) -> [u8; N] {
        self.data.const_build_or_panic()
    }
    /// Creates a new empty builder.
    pub const fn new() -> Self {
        Self {
            // Cursor at N: the trie is built back-to-front by prepending.
            data: ConstArrayBuilder::new_empty([0; N], N),
        }
    }
    /// Prepends an ASCII node to the front of the builder. Returns the new builder
    /// and the delta in length, which is always 1.
    #[must_use]
    const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
        // This builder variant only supports the ASCII subset.
        if ascii >= 128 {
            panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
        }
        let data = self.data.const_push_front_or_panic(ascii);
        (Self { data }, 1)
    }
    /// Prepends a value node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    #[must_use]
    const fn prepend_value(self, value: usize) -> (Self, usize) {
        let mut data = self.data;
        let varint_array = varint::write_varint_meta3(value);
        // Can panic (as documented in class docs):
        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
        // (0b10000000 tags the lead byte as a value node).
        data = data.const_bitor_assign_or_panic(0, 0b10000000);
        (Self { data }, varint_array.len())
    }
    /// Prepends a branch node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    #[must_use]
    const fn prepend_branch(self, value: usize) -> (Self, usize) {
        let mut data = self.data;
        let varint_array = varint::write_varint_meta2(value);
        // Can panic (as documented in type-level docs):
        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
        // (0b11000000 tags the lead byte as a branch node).
        data = data.const_bitor_assign_or_panic(0, 0b11000000);
        (Self { data }, varint_array.len())
    }
    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
    /// and the delta in length, which is the length of the slice.
    #[must_use]
    const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
        let mut data = self.data;
        // Push back-to-front so the slice's order is preserved in the buffer.
        let mut i = s.len();
        while i > 0 {
            // Can panic (as documented in type-level docs):
            data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
            i -= 1;
        }
        (Self { data }, s.len())
    }
    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
    #[must_use]
    const fn prepend_n_zeros(self, n: usize) -> Self {
        let mut data = self.data;
        let mut i = 0;
        while i < n {
            // Can panic (as documented in type-level docs):
            data = data.const_push_front_or_panic(0);
            i += 1;
        }
        Self { data }
    }
    /// Performs the operation `self[index] |= bits`
    const fn bitor_assign_at_or_panic(self, index: usize, bits: u8) -> Self {
        let mut data = self.data;
        data = data.const_bitor_assign_or_panic(index, bits);
        Self { data }
    }
    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
    ///
    /// # Panics
    ///
    /// Panics if the items are not sorted
    pub const fn from_tuple_slice<'a, const K: usize>(
        items: &[(&'a ByteStr, usize)],
    ) -> Result<Self, ZeroTrieBuildError> {
        let items = ConstSlice::from_slice(items);
        // Verify strict lexicographic ordering before delegating to the
        // sorted-input constructor, which relies on it.
        let mut prev: Option<&'a ByteStr> = None;
        const_for_each!(items, (ascii_str, _), {
            match prev {
                None => (),
                Some(prev) => {
                    if !prev.is_less_then(ascii_str) {
                        panic!("Strings in ByteStr constructor are not sorted");
                    }
                }
            };
            prev = Some(ascii_str)
        });
        Self::from_sorted_const_tuple_slice::<K>(items)
    }
    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
    pub const fn from_sorted_const_tuple_slice<const K: usize>(
        items: ConstSlice<(&ByteStr, usize)>,
    ) -> Result<Self, ZeroTrieBuildError> {
        let mut result = Self::new();
        let total_size;
        (result, total_size) = result.create_or_panic::<K>(items);
        // The reported size must agree with the bytes actually written.
        debug_assert!(total_size == result.data.len());
        Ok(result)
    }
    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
    #[must_use]
    const fn create_or_panic<const K: usize>(
        mut self,
        all_items: ConstSlice<(&ByteStr, usize)>,
    ) -> (Self, usize) {
        let mut prefix_len = match all_items.last() {
            Some(x) => x.0.len(),
            // Empty slice:
            None => return (Self::new(), 0),
        };
        // Initialize the main loop to point at the last string.
        let mut lengths_stack = ConstLengthsStack::<K>::new();
        // [i, j) is the window of strings sharing the current prefix.
        let mut i = all_items.len() - 1;
        let mut j = all_items.len();
        // Length in bytes of the current self-contained subtrie.
        let mut current_len = 0;
        // Start the main loop.
        loop {
            let item_i = all_items.get_or_panic(i);
            let item_j = all_items.get_or_panic(j - 1);
            debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
            // Check if we need to add a value node here.
            if item_i.0.len() == prefix_len {
                let len;
                (self, len) = self.prepend_value(item_i.1);
                current_len += len;
            }
            if prefix_len == 0 {
                // All done! Leave the main loop.
                break;
            }
            // Reduce the prefix length by 1 and recalculate i and j.
            prefix_len -= 1;
            let mut new_i = i;
            let mut new_j = j;
            let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
            let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
            debug_assert!(ascii_i == ascii_j);
            let key_ascii = ascii_i;
            // Scan backward, extending the window over earlier strings that
            // share the shortened prefix; `ascii_i` tracks the byte at
            // `prefix_len` of the candidates examined.
            loop {
                if new_i == 0 {
                    break;
                }
                let candidate = all_items.get_or_panic(new_i - 1).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if item_i.0.prefix_eq(candidate, prefix_len) {
                    new_i -= 1;
                } else {
                    break;
                }
                if candidate.len() == prefix_len {
                    // A string that equals the prefix does not take part in the branch node.
                    break;
                }
                let candidate = candidate.byte_at_or_panic(prefix_len);
                if candidate != ascii_i {
                    ascii_i = candidate;
                }
            }
            // Scan forward, extending the window over later strings that
            // share the shortened prefix; `ascii_j` tracks the byte at
            // `prefix_len` of the candidates examined.
            loop {
                if new_j == all_items.len() {
                    break;
                }
                let candidate = all_items.get_or_panic(new_j).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if item_j.0.prefix_eq(candidate, prefix_len) {
                    new_j += 1;
                } else {
                    break;
                }
                if candidate.len() == prefix_len {
                    panic!("A shorter string should be earlier in the sequence");
                }
                let candidate = candidate.byte_at_or_panic(prefix_len);
                if candidate != ascii_j {
                    ascii_j = candidate;
                }
            }
            // If there are no different bytes at this prefix level, we can add an ASCII or Span
            // node and then continue to the next iteration of the main loop.
            if ascii_i == key_ascii && ascii_j == key_ascii {
                let len;
                (self, len) = self.prepend_ascii(ascii_i);
                current_len += len;
                debug_assert!(i == new_i || i == new_i + 1);
                i = new_i;
                debug_assert!(j == new_j);
                continue;
            }
            // If i and j changed, we are a target of a branch node.
            if ascii_j == key_ascii {
                // We are the _last_ target of a branch node.
                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: current_len,
                    local_length: current_len,
                    count: 1,
                });
            } else {
                // We are the _not the last_ target of a branch node.
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: cumulative_length + current_len,
                    local_length: current_len,
                    count: count + 1,
                });
            }
            if ascii_i != key_ascii {
                // We are _not the first_ target of a branch node.
                // Set the cursor to the previous string and continue the loop.
                j = i;
                i -= 1;
                prefix_len = all_items.get_or_panic(i).0.len();
                current_len = 0;
                continue;
            }
            // Branch (first): all sibling subtries have been built, so emit the
            // branch metadata (offset table, lookup table, branch head node).
            let (total_length, total_count) = {
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                (cumulative_length, count)
            };
            let branch_metas;
            (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
            let original_keys = branch_metas.map_to_ascii_bytes();
            // Write out the offset table
            current_len = total_length;
            const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
            // `w` is the number of extra bytes (beyond the first) needed per
            // offset entry, derived from the bit length of `total_length`.
            let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
            if w > 3 {
                panic!("ZeroTrie capacity exceeded");
            }
            // One pass per offset byte; pass `k` writes bits `8k..8k+8` of
            // each cumulative offset into a freshly prepended row of zeros.
            let mut k = 0;
            while k <= w {
                self = self.prepend_n_zeros(total_count - 1);
                current_len += total_count - 1;
                let mut l = 0;
                let mut length_to_write = 0;
                while l < total_count {
                    let BranchMeta { local_length, .. } = *branch_metas
                        .as_const_slice()
                        .get_or_panic(total_count - l - 1);
                    let mut adjusted_length = length_to_write;
                    let mut m = 0;
                    while m < k {
                        adjusted_length >>= 8;
                        m += 1;
                    }
                    if l > 0 {
                        // The first target needs no offset; entries start at l-1.
                        self = self.bitor_assign_at_or_panic(l - 1, adjusted_length as u8);
                    }
                    l += 1;
                    length_to_write += local_length;
                }
                k += 1;
            }
            // Write out the lookup table
            assert!(0 < total_count && total_count <= 256);
            // Branch head varint packs the offset width `w` and the target
            // count (256 encoded as 0).
            let branch_value = (w << 8) + (total_count & 0xff);
            let slice_len;
            (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
            let branch_len;
            (self, branch_len) = self.prepend_branch(branch_value);
            current_len += slice_len + branch_len;
            i = new_i;
            j = new_j;
        }
        // Every pushed branch must have been consumed.
        assert!(lengths_stack.is_empty());
        (self, current_len)
    }
}

View File

@@ -0,0 +1,9 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
// The const builder algorithm itself.
mod builder;
// Const-friendly collections (slices, array builders, lengths stack) backing it.
mod store;
pub(crate) use builder::*;
pub(crate) use store::ConstArrayBuilder;

View File

@@ -0,0 +1,352 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains internal collections for the const builder.
use super::super::branch_meta::BranchMeta;
/// A const-friendly slice type. It is backed by a full slice but is primarily intended
/// to represent subslices of the full slice. We need this only because we can't take
/// subslices in const Rust.
#[derive(Debug, Copy, Clone)]
pub(crate) struct ConstSlice<'a, T> {
    /// The full slice.
    full_slice: &'a [T],
    /// The start index of the slice represented by this [`ConstSlice`].
    start: usize,
    /// The non-inclusive end index of the slice represented by this [`ConstSlice`].
    limit: usize,
}

impl<'a, T> ConstSlice<'a, T> {
    /// Creates a [`ConstSlice`] representing an entire slice.
    pub const fn from_slice(other: &'a [T]) -> Self {
        Self::from_manual_slice(other, 0, other.len())
    }
    /// Creates a [`ConstSlice`] with the given start and limit.
    pub const fn from_manual_slice(full_slice: &'a [T], start: usize, limit: usize) -> Self {
        ConstSlice {
            full_slice,
            start,
            limit,
        }
    }
    /// Returns the length of the [`ConstSlice`].
    pub const fn len(&self) -> usize {
        self.limit - self.start
    }
    /// Gets the element at `index`, panicking if not present.
    pub const fn get_or_panic(&self, index: usize) -> &T {
        #[allow(clippy::indexing_slicing)] // documented
        &self.full_slice[self.start + index]
    }
    /// Gets the first element or `None` if empty.
    #[cfg(test)]
    pub const fn first(&self) -> Option<&T> {
        match self.len() {
            0 => None,
            // Won't panic: the slice is nonempty in this arm
            _ => Some(self.get_or_panic(0)),
        }
    }
    /// Gets the last element or `None` if empty.
    pub const fn last(&self) -> Option<&T> {
        match self.len() {
            0 => None,
            // Won't panic: the slice is nonempty in this arm
            n => Some(self.get_or_panic(n - 1)),
        }
    }
    /// Gets a subslice of this slice.
    #[cfg(test)]
    pub const fn get_subslice_or_panic(
        &self,
        new_start: usize,
        new_limit: usize,
    ) -> ConstSlice<'a, T> {
        assert!(new_start <= new_limit);
        assert!(new_limit <= self.len());
        // Rebase the requested window onto the backing slice.
        ConstSlice {
            full_slice: self.full_slice,
            start: self.start + new_start,
            limit: self.start + new_limit,
        }
    }
    /// Non-const function that returns this [`ConstSlice`] as a regular slice.
    #[cfg(any(test, feature = "alloc"))]
    #[allow(clippy::indexing_slicing)] // indices in range by struct invariant
    pub fn as_slice(&self) -> &'a [T] {
        &self.full_slice[self.start..self.limit]
    }
}

impl<'a, T> From<&'a [T]> for ConstSlice<'a, T> {
    fn from(other: &'a [T]) -> Self {
        Self::from_slice(other)
    }
}
/// A const-friendly mutable data structure backed by an array.
///
/// The initialized region is `full_array[start..limit]`; elements are usually
/// prepended, so `start` moves toward 0 as content is added.
#[derive(Debug, Copy, Clone)]
pub(crate) struct ConstArrayBuilder<const N: usize, T> {
    /// Backing storage; only `start..limit` is considered initialized.
    full_array: [T; N],
    /// Inclusive start of the initialized region.
    start: usize,
    /// Exclusive end of the initialized region.
    limit: usize,
}
impl<const N: usize, T: Default> Default for ConstArrayBuilder<N, T> {
    fn default() -> Self {
        // An empty builder with the cursor at 0. Note: a prepend on this
        // value would panic (no room in front); see const_push_front_or_panic.
        Self::new_empty([(); N].map(|_| Default::default()), 0)
    }
}
impl<const N: usize, T> ConstArrayBuilder<N, T> {
    /// Creates a new, empty builder of the given size. `cursor` indicates where in the
    /// array new elements will be inserted first. Since we use a lot of prepend operations,
    /// it is common to set `cursor` to `N`.
    pub const fn new_empty(full_array: [T; N], cursor: usize) -> Self {
        assert!(cursor <= N);
        Self {
            full_array,
            // Empty region: start == limit == cursor.
            start: cursor,
            limit: cursor,
        }
    }
    /// Creates a new builder with some initial content in `[start, limit)`.
    pub const fn from_manual_slice(full_array: [T; N], start: usize, limit: usize) -> Self {
        assert!(start <= limit);
        assert!(limit <= N);
        Self {
            full_array,
            start,
            limit,
        }
    }
    /// Returns the number of initialized elements in the builder.
    pub const fn len(&self) -> usize {
        self.limit - self.start
    }
    /// Whether there are no initialized elements in the builder.
    #[allow(dead_code)]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }
    /// Returns the initialized elements as a [`ConstSlice`].
    pub const fn as_const_slice(&self) -> ConstSlice<'_, T> {
        ConstSlice::from_manual_slice(&self.full_array, self.start, self.limit)
    }
    /// Non-const function that returns a slice of the initialized elements.
    #[cfg(any(test, feature = "alloc"))]
    pub fn as_slice(&self) -> &[T] {
        &self.full_array[self.start..self.limit]
    }
}
// Certain functions that involve dropping `T` require that it be `Copy`
impl<const N: usize, T: Copy> ConstArrayBuilder<N, T> {
    /// Takes a fully initialized builder as an array. Panics if the builder is not
    /// fully initialized.
    pub const fn const_build_or_panic(self) -> [T; N] {
        if self.start != 0 || self.limit != N {
            // Build a readable panic message in const context: append the
            // decimal byte count actually needed to a fixed prefix.
            let actual_len = self.limit - self.start;
            const PREFIX: &[u8; 31] = b"Buffer too large. Size needed: ";
            let len_bytes: [u8; PREFIX.len() + crate::helpers::MAX_USIZE_LEN_AS_DIGITS] =
                crate::helpers::const_fmt_int(*PREFIX, actual_len);
            // Won't fail: the prefix and the formatted digits are ASCII.
            let Ok(len_str) = core::str::from_utf8(&len_bytes) else {
                unreachable!()
            };
            panic!("{}", len_str);
        }
        self.full_array
    }
    /// Prepends an element to the front of the builder, panicking if there is no room.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn const_push_front_or_panic(mut self, value: T) -> Self {
        if self.start == 0 {
            panic!("Buffer too small");
        }
        self.start -= 1;
        self.full_array[self.start] = value;
        self
    }
    /// Prepends multiple elements to the front of the builder, panicking if there is no room.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn const_extend_front_or_panic(mut self, other: ConstSlice<T>) -> Self {
        if self.start < other.len() {
            panic!("Buffer too small");
        }
        // Reserve room up front, then copy `other` in order.
        self.start -= other.len();
        let mut i = self.start;
        const_for_each!(other, byte, {
            self.full_array[i] = *byte;
            i += 1;
        });
        self
    }
}
impl<const N: usize> ConstArrayBuilder<N, u8> {
    /// Specialized function that performs `self[index] |= bits`
    ///
    /// `index` is relative to the start of the initialized region. Panics if
    /// `self.start + index` is out of bounds of the backing array; note that
    /// it is NOT checked against `limit`.
    #[allow(clippy::indexing_slicing)] // documented
    pub(crate) const fn const_bitor_assign_or_panic(mut self, index: usize, bits: u8) -> Self {
        self.full_array[self.start + index] |= bits;
        self
    }
}
impl<const N: usize, T: Copy> ConstArrayBuilder<N, T> {
    /// Swaps the elements at positions `i` and `j`.
    ///
    /// Positions are relative to the start of the initialized region; panics
    /// if either translated index is out of bounds of the backing array.
    #[cfg(feature = "alloc")]
    pub fn swap_or_panic(mut self, i: usize, j: usize) -> Self {
        self.full_array.swap(self.start + i, self.start + j);
        self
    }
}
/// Evaluates a block over each element of a const slice. Takes three arguments:
///
/// 1. Expression that resolves to the [`ConstSlice`].
/// 2. Token that will be assigned the value of the element.
/// 3. Block to evaluate for each element.
///
/// This is a macro rather than a function taking a closure because closures
/// cannot be invoked in const contexts.
macro_rules! const_for_each {
    ($safe_const_slice:expr, $item:tt, $inner:expr) => {{
        let mut i = 0;
        while i < $safe_const_slice.len() {
            // Won't panic: in-range loop condition
            let $item = $safe_const_slice.get_or_panic(i);
            $inner;
            i += 1;
        }
    }};
}
// Re-export so sibling modules can import the macro by path.
pub(crate) use const_for_each;
/// A data structure that holds up to K [`BranchMeta`] items.
///
/// Note: It should be possible to store the required data in the builder buffer itself,
/// which would eliminate the need for this helper struct and the limit it imposes.
pub(crate) struct ConstLengthsStack<const K: usize> {
    /// Fixed-capacity storage. Slots below `idx` are logically live; slots at
    /// or above `idx` may hold stale values (pops only decrement `idx`).
    data: [Option<BranchMeta>; K],
    /// Number of logically live entries; the stack grows upward from 0.
    idx: usize,
}
impl<const K: usize> core::fmt::Debug for ConstLengthsStack<K> {
    /// Formats only the live portion (`data[..idx]`) of the stack.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        self.as_slice().fmt(f)
    }
}
impl<const K: usize> ConstLengthsStack<K> {
    /// Creates a new empty [`ConstLengthsStack`].
    pub const fn new() -> Self {
        Self {
            data: [None; K],
            idx: 0,
        }
    }
    /// Returns whether the stack is empty.
    pub const fn is_empty(&self) -> bool {
        self.idx == 0
    }
    /// Adds a [`BranchMeta`] to the stack, panicking if there is no room.
    ///
    /// NOTE(review): the panic message renders a literal "K" rather than the
    /// numeric capacity, since `stringify!` on a const parameter does not
    /// expand its value.
    #[must_use]
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn push_or_panic(mut self, meta: BranchMeta) -> Self {
        if self.idx >= K {
            panic!(concat!(
                "AsciiTrie Builder: Need more stack (max ",
                stringify!(K),
                ")"
            ));
        }
        self.data[self.idx] = Some(meta);
        self.idx += 1;
        self
    }
    /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if
    /// the stack is empty.
    pub const fn peek_or_panic(&self) -> BranchMeta {
        if self.idx == 0 {
            panic!("AsciiTrie Builder: Attempted to peek from an empty stack");
        }
        // Depth 0 == top of stack.
        self.get_or_panic(0)
    }
    /// Returns a copy of the [`BranchMeta`] at the specified index.
    ///
    /// `index` counts down from the top: 0 is the most recently pushed entry.
    #[allow(clippy::indexing_slicing)] // documented
    const fn get_or_panic(&self, index: usize) -> BranchMeta {
        if self.idx <= index {
            panic!("AsciiTrie Builder: Attempted to get too deep in a stack");
        }
        match self.data[self.idx - index - 1] {
            Some(x) => x,
            // Invariant: every slot below `idx` was filled by push_or_panic.
            None => unreachable!(),
        }
    }
    /// Removes many [`BranchMeta`]s from the stack, returning them in a [`ConstArrayBuilder`].
    ///
    /// Entries are read top-down and prepended, so the deepest of the `len`
    /// popped entries ends up at the front of the returned array.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn pop_many_or_panic(
        mut self,
        len: usize,
    ) -> (Self, ConstArrayBuilder<256, BranchMeta>) {
        debug_assert!(len <= 256);
        let mut result = ConstArrayBuilder::new_empty([BranchMeta::default(); 256], 256);
        let mut ix = 0;
        loop {
            if ix == len {
                break;
            }
            // Walk downward from the top of the stack.
            let i = self.idx - ix - 1;
            result = result.const_push_front_or_panic(match self.data[i] {
                Some(x) => x,
                None => panic!("Not enough items in the ConstLengthsStack"),
            });
            ix += 1;
        }
        // Logically discard the popped entries (slots are left as-is).
        self.idx -= len;
        (self, result)
    }
    /// Non-const function that returns the initialized elements as a slice.
    fn as_slice(&self) -> &[Option<BranchMeta>] {
        &self.data[0..self.idx]
    }
}
impl<const K: usize> ConstArrayBuilder<K, BranchMeta> {
    /// Converts this builder-array of [`BranchMeta`] to one of the `ascii` fields.
    ///
    /// Note: elements are visited front-to-back but prepended to the result,
    /// so the output bytes appear in the REVERSE order of the input entries.
    pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder<K, u8> {
        let mut result = ConstArrayBuilder::new_empty([0; K], K);
        let self_as_slice = self.as_const_slice();
        const_for_each!(self_as_slice, value, {
            result = result.const_push_front_or_panic(value.ascii);
        });
        result
    }
}

51
vendor/zerotrie/src/builder/litemap.rs vendored Normal file
View File

@@ -0,0 +1,51 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Impls for functions gated on the "litemap" feature.
use super::konst::*;
use crate::builder::bytestr::ByteStr;
use crate::error::ZeroTrieBuildError;
use crate::zerotrie::ZeroTrieSimpleAscii;
use crate::ZeroTrie;
use alloc::borrow::Borrow;
use alloc::vec::Vec;
use litemap::LiteMap;
impl ZeroTrieSimpleAscii<Vec<u8>> {
    /// Builds a `ZeroTrieSimpleAscii` from a pre-sorted [`LiteMap`] using the
    /// const builder, with a fixed 10000-byte scratch buffer and a
    /// lengths-stack depth of 100.
    ///
    /// # Errors
    ///
    /// Returns a [`ZeroTrieBuildError`] if the const builder reports failure.
    #[doc(hidden)]
    pub fn try_from_litemap_with_const_builder<'a, S>(
        items: &LiteMap<&'a [u8], usize, S>,
    ) -> Result<Self, ZeroTrieBuildError>
    where
        S: litemap::store::StoreSlice<&'a [u8], usize, Slice = [(&'a [u8], usize)]>,
    {
        let tuples = items.as_slice();
        // Zero-cost reinterpretation of `&[u8]` keys as `&ByteStr` keys.
        let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
        ZeroTrieBuilderConst::<10000>::from_sorted_const_tuple_slice::<100>(byte_str_slice.into())
            .map(|s| Self {
                store: s.as_bytes().to_vec(),
            })
    }
}
impl<K, S> TryFrom<&LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
where
    // Borrow, not AsRef, because we rely on Ord being the same. Unfortunately
    // this means `LiteMap<&str, usize>` does not work.
    K: Borrow<[u8]>,
    S: litemap::store::StoreSlice<K, usize, Slice = [(K, usize)]>,
{
    type Error = ZeroTrieBuildError;
    /// Builds a [`ZeroTrie`] from the key/value pairs of a [`LiteMap`].
    fn try_from(items: &LiteMap<K, usize, S>) -> Result<Self, ZeroTrieBuildError> {
        // Reborrow the keys as `[u8]` so they can be viewed as `ByteStr`s.
        let byte_litemap = items.to_borrowed_keys::<[u8], Vec<_>>();
        let byte_slice = byte_litemap.as_slice();
        let byte_str_slice = ByteStr::from_byte_slice_with_value(byte_slice);
        Self::try_from_tuple_slice(byte_str_slice)
    }
}
// TODO(#7084): Make this more infallible by calculating the required length,
// heap-allocating the required capacity, and pointing ConstAsciiTrieBuilderStore
// to the heap buffer.

303
vendor/zerotrie/src/builder/mod.rs vendored Normal file
View File

@@ -0,0 +1,303 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! # ZeroTrie Builder
//!
//! There are two implementations of the ZeroTrie Builder:
//!
//! - [konst::ZeroTrieBuilderConst] allows for human-readable const construction
//! - [nonconst::ZeroTrieBuilder] has the full feature set but requires `alloc`
//!
//! The two builders follow the same algorithm but have different capabilities.
//!
//! ## Builder Algorithm Overview
//!
//! The tries are built backwards, from the last node to the first node. The key step of the
//! algorithm is **determining what is the next node to prepend.**
//!
//! In the simple case of [`ZeroTrieSimpleAscii`], all nodes are binary-search, so if the input
//! strings are provided in lexicographic order, there is a simple, deterministic method for
//! identifying the next node. This insight is what enables us to make the const builder.
//!
//! The builder works with the following intermediate state variables:
//!
//! - `prefix_len` indicates the byte index we are currently processing.
//! - `i` and `j` bracket a window of strings in the input that share the same prefix.
//! - `current_len` is the length in bytes of the current self-contained trie.
//! - `lengths_stack` contains metadata for branch nodes.
//!
//! Consider a trie containing the following strings and values:
//!
//! - "" → 11
//! - "ad" → 22
//! - "adef" → 33
//! - "adghk" → 44
//!
//! Suppose `prefix_len = 2`, `i = 1`, and `j = 4`. This would indicate that we
//! are evaluating the strings with the "ad" prefix, which extend from
//! index 1 (inclusive) to index 4 (exclusive).
//!
//! What follows is a verbal explanation of the build steps for the above trie.
//! When a node is prepended, it is shown in **boldface**.
//!
//! 1. Initialize the builder by setting `i=3`, `j=4`, `prefix_len=5` (the last string),
//! `current_len=0`, and `lengths_stack` empty. Start the main loop.
//! 2. Top of loop. The string at `i` is equal in length to `prefix_len`, so we prepend
//! our first node: a **value node 44**, which requires a 2-byte varint. Increase
//! `current_len` to 2.
//! 3. Reduce `prefix_len` to 4, read our `key_ascii="k"`, and recalculate `i` and `j`
//! _(this calculation is a long chunk of code in the builder impls)_. Since there is no
//! other string with the prefix "adgh", `i` and `j` stay the same, we prepend an
//! **ASCII node "k"**, increase `current_len` to 3, and continue the main loop.
//! 4. Top of loop. The string at `i` is of length 5, but `prefix_len` is 4, so there is
//! no value node to prepend.
//! 5. Reduce `prefix_len` to 3, read our `key_ascii="h"`, and recalculate `i` and `j`.
//!     There are no other strings sharing the prefix "adg", so we prepend an
//! **ASCII node "h"**, increase `current_len` to 4, and continue the main loop.
//! 6. Top of loop. There is still no value node to prepend.
//! 7. Reduce `prefix_len` to 2, read our `key_ascii="g"`, and recalculate `i` and `j`.
//! We find that `i=1` and `j=4`, the range of strings sharing the prefix "ad". Since
//! `i` or `j` changed, proceed to evaluate the branch node.
//! 8. The last branch byte `ascii_j` for this prefix is "g", which is the same as `key_ascii`,
//! so we are the _last_ target of a branch node. Push an entry onto `lengths_stack`:
//! `BranchMeta { ascii: "g", cumulative_length: 4, local_length: 4, count: 1 }`.
//! 9. The first branch byte `ascii_i` for this prefix is "e", which is NOT equal to `key_ascii`,
//! so we are _not the first_ target of a branch node. We therefore start evaluating the
//! string preceding where we were at the top of the current loop. We set `i=2`, `j=3`,
//! `prefix_len=4` (length of the string at `i`), and continue the main loop.
//! 10. Top of loop. Since the string at `i` is equal in length to `prefix_len`, we prepend a
//! **value node 33** (which requires a 2-byte varint) and increase `current_len` to 2.
//! 11. Reduce `prefix_len` to 3, read our `key_ascii="f"`, and recalculate `i` and `j`.
//! They stay the same, so we prepend an **ASCII node "f"**, increase `current_len` to 3,
//! and continue the main loop.
//! 12. Top of loop. No value node this time.
//! 13. Reduce `prefix_len` to 2, read our `key_ascii="e"`, and recalculate `i` and `j`.
//! They go back to `i=1` and `j=4`.
//! 14. The last branch byte `ascii_j` for this prefix is "g", which is NOT equal to `key_ascii`,
//! so we are _not the last_ target of a branch node. We peek at the entry at the front of
//! the lengths stack and use it to push another entry onto the stack:
//! `BranchMeta { ascii: "e", cumulative_length: 7, local_length: 3, count: 2 }`
//! 15. The first branch byte `ascii_i` for this prefix is "e", which is the same as `key_ascii`,
//!     so we are the _first_ target of a branch node. We can therefore proceed to prepend the
//! metadata for the branch node. We peek at the top of the stack and find that there are 2
//! tries reachable from this branch and they have a total byte length of 5. We then pull off
//! 2 entries from the stack into a local variable `branch_metas`. From here, we write out
//! the **offset table**, **lookup table**, and **branch head node**, which are determined
//! from the metadata entries. We set `current_len` to the length of the two tries plus the
//! metadata, which happens to be 11. Then we return to the top of the main loop.
//! 16. Top of loop. The string at `i` is length 2, which is the same as `prefix_len`, so we
//! prepend a **value node 22** (2-byte varint) and increase `current_len` to 13.
//! 17. Reduce `prefix_len` to 1, read our `key_ascii="d"`, and recalculate `i` and `j`.
//! They stay the same, so we prepend an **ASCII node "d"**, increase `current_len` to 14,
//! and continue the main loop.
//! 18. Top of loop. No value node this time.
//! 19. Reduce `prefix_len` to 0, read our `key_ascii="a"`, and recalculate `i` and `j`.
//! They change to `i=0` and `j=4`, since all strings have the empty string as a prefix.
//! However, `ascii_i` and `ascii_j` both equal `key_ascii`, so we prepend **ASCII node "a"**,
//! increase `current_len` to 15, and continue the main loop.
//! 20. Top of loop. The string at `i` is length 0, which is the same as `prefix_len`, so we
//!     prepend a **value node 11** and increase `current_len` to 16.
//! 21. We can no longer reduce `prefix_len`, so our trie is complete.
//!
//! ## Perfect Hash Reordering
//!
//! When the PHF is added to the mix, the main change is that the strings are no longer in sorted
//! order when they are in the trie. To resolve this issue, when adding a branch node, the target
//! tries are rearranged in-place in the buffer to be in the correct order for the PHF.
//!
//! ## Example
//!
//! Here is the output of the trie described above.
//!
//! ```
//! use zerotrie::ZeroTrieSimpleAscii;
//!
//! const DATA: [(&str, usize); 4] =
//! [("", 11), ("ad", 22), ("adef", 33), ("adghk", 44)];
//!
//! // As demonstrated above, the required capacity for this trie is 16 bytes
//! const TRIE: ZeroTrieSimpleAscii<[u8; 16]> =
//! ZeroTrieSimpleAscii::from_sorted_str_tuples(&DATA);
//!
//! assert_eq!(
//! TRIE.as_bytes(),
//! &[
//! 0x8B, // value node 11
//! b'a', // ASCII node 'a'
//! b'd', // ASCII node 'd'
//! 0x90, // value node 22 lead byte
//! 0x06, // value node 22 trail byte
//! 0xC2, // branch node 2
//! b'e', // first target of branch
//! b'g', // second target of branch
//! 3, // offset
//! b'f', // ASCII node 'f'
//! 0x90, // value node 33 lead byte
//! 0x11, // value node 33 trail byte
//! b'h', // ASCII node 'h'
//! b'k', // ASCII node 'k'
//! 0x90, // value node 44 lead byte
//! 0x1C, // value node 44 trail byte
//! ]
//! );
//!
//! assert_eq!(TRIE.get(b""), Some(11));
//! assert_eq!(TRIE.get(b"ad"), Some(22));
//! assert_eq!(TRIE.get(b"adef"), Some(33));
//! assert_eq!(TRIE.get(b"adghk"), Some(44));
//! assert_eq!(TRIE.get(b"unknown"), None);
//! ```
mod branch_meta;
pub(crate) mod bytestr;
pub(crate) mod konst;
#[cfg(feature = "litemap")]
mod litemap;
#[cfg(feature = "alloc")]
pub(crate) mod nonconst;
use bytestr::ByteStr;
use super::ZeroTrieSimpleAscii;
impl<const N: usize> ZeroTrieSimpleAscii<[u8; N]> {
    /// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values.
    ///
    /// This function needs to know the exact length of the resulting trie at compile time. To
    /// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting
    /// compile error which will tell you how to set `N`, like this:
    ///
    /// > the evaluated program panicked at 'Buffer too large. Size needed: 17'
    ///
    /// That error message says you need to set `N` to 17.
    ///
    /// Also see [`Self::from_sorted_str_tuples`].
    ///
    /// # Panics
    ///
    /// Panics if `items` is not sorted or if `N` is not correct.
    ///
    /// # Examples
    ///
    /// Create a `const` ZeroTrieSimpleAscii at compile time:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // The required capacity for this trie happens to be 17 bytes
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> =
    ///     ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
    ///         (b"bar", 2),
    ///         (b"bazzoo", 3),
    ///         (b"foo", 1),
    ///     ]);
    ///
    /// assert_eq!(TRIE.get(b"foo"), Some(1));
    /// assert_eq!(TRIE.get(b"bar"), Some(2));
    /// assert_eq!(TRIE.get(b"bazzoo"), Some(3));
    /// assert_eq!(TRIE.get(b"unknown"), None);
    /// ```
    ///
    /// Panics if strings are not sorted:
    ///
    /// ```compile_fail
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
    ///     (b"foo", 1),
    ///     (b"bar", 2),
    ///     (b"bazzoo", 3),
    /// ]);
    /// ```
    ///
    /// Panics if capacity is too small:
    ///
    /// ```compile_fail
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 15]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
    ///     (b"bar", 2),
    ///     (b"bazzoo", 3),
    ///     (b"foo", 1),
    /// ]);
    /// ```
    ///
    /// Panics if capacity is too large:
    ///
    /// ```compile_fail
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 20]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
    ///     (b"bar", 2),
    ///     (b"bazzoo", 3),
    ///     (b"foo", 1),
    /// ]);
    /// ```
    pub const fn from_sorted_u8_tuples(tuples: &[(&[u8], usize)]) -> Self {
        use konst::*;
        let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
        // 100 is the value of `K`, the size of the lengths stack. If compile errors are
        // encountered, this number may need to be increased.
        let result = ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice);
        match result {
            Ok(s) => Self::from_store(s.build_or_panic()),
            Err(_) => panic!("Failed to build ZeroTrie"),
        }
    }
    /// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values.
    ///
    /// This function needs to know the exact length of the resulting trie at compile time. To
    /// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting
    /// compile error which will tell you how to set `N`, like this:
    ///
    /// > the evaluated program panicked at 'Buffer too large. Size needed: 17'
    ///
    /// That error message says you need to set `N` to 17.
    ///
    /// Also see [`Self::from_sorted_u8_tuples`].
    ///
    /// # Panics
    ///
    /// Panics if `items` is not sorted, if `N` is not correct, or if any of the strings contain
    /// non-ASCII characters.
    ///
    /// # Examples
    ///
    /// Create a `const` ZeroTrieSimpleAscii at compile time:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // The required capacity for this trie happens to be 17 bytes
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> =
    ///     ZeroTrieSimpleAscii::from_sorted_str_tuples(&[
    ///         ("bar", 2),
    ///         ("bazzoo", 3),
    ///         ("foo", 1),
    ///     ]);
    ///
    /// assert_eq!(TRIE.get(b"foo"), Some(1));
    /// assert_eq!(TRIE.get(b"bar"), Some(2));
    /// assert_eq!(TRIE.get(b"bazzoo"), Some(3));
    /// assert_eq!(TRIE.get(b"unknown"), None);
    /// ```
    ///
    /// Panics if the strings are not ASCII:
    ///
    /// ```compile_fail
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// const TRIE: ZeroTrieSimpleAscii<[u8; 100]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[
    ///     ("bár", 2),
    ///     ("båzzöo", 3),
    ///     ("foo", 1),
    /// ]);
    /// ```
    pub const fn from_sorted_str_tuples(tuples: &[(&str, usize)]) -> Self {
        use konst::*;
        let byte_str_slice = ByteStr::from_str_slice_with_value(tuples);
        // 100 is the value of `K`, the size of the lengths stack. If compile errors are
        // encountered, this number may need to be increased.
        let result = ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice);
        match result {
            Ok(s) => Self::from_store(s.build_or_panic()),
            Err(_) => panic!("Failed to build ZeroTrie"),
        }
    }
}

View File

@@ -0,0 +1,420 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
use super::super::branch_meta::BranchMeta;
use super::store::NonConstLengthsStack;
use super::store::TrieBuilderStore;
use crate::builder::bytestr::ByteStr;
use crate::byte_phf::PerfectByteHashMapCacheOwned;
use crate::error::ZeroTrieBuildError;
use crate::options::*;
use crate::varint;
use alloc::borrow::Cow;
use alloc::vec::Vec;
/// A low-level builder for ZeroTrie. Supports all options.
pub(crate) struct ZeroTrieBuilder<S> {
    // The trie bytes under construction; nodes are prepended to the front.
    data: S,
    // Cache of previously solved perfect hash maps, keyed by the byte set.
    phf_cache: PerfectByteHashMapCacheOwned,
    // Build options (ASCII mode, PHF mode, case sensitivity, capacity mode).
    options: ZeroTrieBuilderOptions,
}
impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
/// Returns the trie data as a `Vec<u8>`.
pub fn to_bytes(&self) -> Vec<u8> {
self.data.atbs_to_bytes()
}
/// Prepends a byte value to the front of the builder. If it is ASCII, an ASCII
/// node is prepended. If it is non-ASCII, if there is already a span node at
/// the front, we modify the span node to add the new byte; otherwise, we create
/// a new span node. Returns the delta in length, which is either 1 or 2.
fn prepend_ascii(&mut self, ascii: u8) -> Result<usize, ZeroTrieBuildError> {
    if ascii <= 127 {
        // ASCII byte: the byte itself is the node.
        self.data.atbs_push_front(ascii);
        Ok(1)
    } else if matches!(self.options.ascii_mode, AsciiMode::BinarySpans) {
        if let Some(old_front) = self.data.atbs_pop_front() {
            // Length before this operation (we already popped one byte off).
            let old_byte_len = self.data.atbs_len() + 1;
            // Span node lead bytes carry the tag bits 0b101xxxxx.
            if old_front & 0b11100000 == 0b10100000 {
                // Extend an existing span
                // Unwrap OK: there is a varint at this location in the buffer
                #[expect(clippy::unwrap_used)]
                let old_span_size =
                    varint::try_read_varint_meta3_from_tstore(old_front, &mut self.data)
                        .unwrap();
                self.data.atbs_push_front(ascii);
                // Re-encode the span length varint with the length increased by 1.
                let varint_array = varint::write_varint_meta3(old_span_size + 1);
                self.data.atbs_extend_front(varint_array.as_slice());
                self.data.atbs_bitor_assign(0, 0b10100000);
                let new_byte_len = self.data.atbs_len();
                return Ok(new_byte_len - old_byte_len);
            } else {
                // Not a span node: restore the byte we popped off.
                self.data.atbs_push_front(old_front);
            }
        }
        // Create a new span
        self.data.atbs_push_front(ascii);
        // Span lead byte: tag 0b101 with span length 1.
        self.data.atbs_push_front(0b10100001);
        Ok(2)
    } else {
        // Without binary spans, non-ASCII bytes cannot be represented.
        Err(ZeroTrieBuildError::NonAsciiError)
    }
}
/// Prepends a value node to the front of the builder. Returns the
/// delta in length, which depends on the size of the varint.
#[must_use]
fn prepend_value(&mut self, value: usize) -> usize {
    // Encode the value as a meta3 varint and place it at the front.
    let encoded = varint::write_varint_meta3(value);
    let bytes = encoded.as_slice();
    self.data.atbs_extend_front(bytes);
    // Mark the lead byte with the value-node tag bit (0b1000_0000).
    self.data.atbs_bitor_assign(0, 0b10000000);
    bytes.len()
}
/// Prepends a branch node to the front of the builder. Returns the
/// delta in length, which depends on the size of the varint.
#[must_use]
fn prepend_branch(&mut self, value: usize) -> usize {
    // Encode the branch value as a meta2 varint and place it at the front.
    let encoded = varint::write_varint_meta2(value);
    let bytes = encoded.as_slice();
    self.data.atbs_extend_front(bytes);
    // Mark the lead byte with the branch-node tag bits (0b1100_0000).
    self.data.atbs_bitor_assign(0, 0b11000000);
    bytes.len()
}
/// Prepends multiple arbitrary bytes to the front of the builder. Returns the
/// delta in length, which is the length of the slice.
#[must_use]
fn prepend_slice(&mut self, s: &[u8]) -> usize {
    let delta = s.len();
    self.data.atbs_extend_front(s);
    delta
}
/// Builds a ZeroTrie from an iterator of bytes. It first collects and sorts the iterator.
pub fn from_bytes_iter<K: AsRef<[u8]>, I: IntoIterator<Item = (K, usize)>>(
    iter: I,
    options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
    // Keep the owned keys alive while we sort borrowed views of them.
    let owned: Vec<(K, usize)> = iter.into_iter().collect();
    let mut borrowed: Vec<(&[u8], usize)> = Vec::with_capacity(owned.len());
    for (key, value) in owned.iter() {
        borrowed.push((key.as_ref(), *value));
    }
    // Sort according to the builder options (e.g. case sensitivity).
    borrowed.sort_by(|a, b| cmp_keys_values(options, *a, *b));
    let byte_str_slice = ByteStr::from_byte_slice_with_value(borrowed.as_slice());
    Self::from_sorted_tuple_slice_impl(byte_str_slice, options)
}
/// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted,
/// except for a case-insensitive trie where the items are re-sorted.
///
/// # Panics
///
/// May panic if the items are not sorted.
pub fn from_sorted_tuple_slice(
    items: &[(&ByteStr, usize)],
    options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
    let mut items = Cow::Borrowed(items);
    // Case-insensitive tries need a case-folded ordering, which may differ
    // from the byte ordering the caller used.
    let needs_resort = matches!(options.case_sensitivity, CaseSensitivity::IgnoreCase);
    if needs_resort {
        let owned = items.to_mut();
        owned.sort_by(|a, b| {
            cmp_keys_values(options, (a.0.as_bytes(), a.1), (b.0.as_bytes(), b.1))
        });
    }
    Self::from_sorted_tuple_slice_impl(&items, options)
}
/// Internal constructor that does not re-sort the items.
fn from_sorted_tuple_slice_impl(
    items: &[(&ByteStr, usize)],
    options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
    // Debug-only check that the caller upheld the sort precondition.
    for pair in items.windows(2) {
        if let [(key_a, val_a), (key_b, val_b)] = pair {
            debug_assert!(cmp_keys_values(
                options,
                (key_a.as_bytes(), *val_a),
                (key_b.as_bytes(), *val_b)
            )
            .is_lt());
        }
    }
    let mut result = Self {
        data: S::atbs_new_empty(),
        phf_cache: PerfectByteHashMapCacheOwned::new_empty(),
        options,
    };
    // `create` returns the total size, which must match the store length.
    let total_size = result.create(items)?;
    debug_assert!(total_size == result.data.atbs_len());
    Ok(result)
}
/// The actual builder algorithm. For an explanation, see [`crate::builder`].
#[expect(clippy::unwrap_used)] // lots of indexing, but all indexes should be in range
fn create(&mut self, all_items: &[(&ByteStr, usize)]) -> Result<usize, ZeroTrieBuildError> {
    let mut prefix_len = match all_items.last() {
        Some(x) => x.0.len(),
        // Empty slice:
        None => return Ok(0),
    };
    // Initialize the main loop to point at the last string.
    let mut lengths_stack = NonConstLengthsStack::new();
    let mut i = all_items.len() - 1;
    let mut j = all_items.len();
    let mut current_len = 0;
    // Start the main loop.
    loop {
        // Invariant: all strings in [i, j) share the first `prefix_len` bytes.
        let item_i = all_items.get(i).unwrap();
        let item_j = all_items.get(j - 1).unwrap();
        debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
        // Check if we need to add a value node here.
        if item_i.0.len() == prefix_len {
            let len = self.prepend_value(item_i.1);
            current_len += len;
        }
        if prefix_len == 0 {
            // All done! Leave the main loop.
            break;
        }
        // Reduce the prefix length by 1 and recalculate i and j.
        prefix_len -= 1;
        let mut new_i = i;
        let mut new_j = j;
        let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
        let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
        debug_assert_eq!(ascii_i, ascii_j);
        let key_ascii = ascii_i;
        // Scan backwards: extend the range to preceding strings that share the
        // shortened prefix, tracking the first differing byte in `ascii_i`.
        loop {
            if new_i == 0 {
                break;
            }
            let candidate = all_items.get(new_i - 1).unwrap().0;
            if candidate.len() < prefix_len {
                // Too short
                break;
            }
            if item_i.0.prefix_eq(candidate, prefix_len) {
                new_i -= 1;
            } else {
                break;
            }
            if candidate.len() == prefix_len {
                // A string that equals the prefix does not take part in the branch node.
                break;
            }
            let candidate = candidate.byte_at_or_panic(prefix_len);
            if candidate != ascii_i {
                ascii_i = candidate;
            }
        }
        // Scan forwards: extend the range to following strings that share the
        // shortened prefix, tracking the last differing byte in `ascii_j`.
        loop {
            if new_j == all_items.len() {
                break;
            }
            let candidate = all_items.get(new_j).unwrap().0;
            if candidate.len() < prefix_len {
                // Too short
                break;
            }
            if item_j.0.prefix_eq(candidate, prefix_len) {
                new_j += 1;
            } else {
                break;
            }
            if candidate.len() == prefix_len {
                unreachable!("A shorter string should be earlier in the sequence");
            }
            let candidate = candidate.byte_at_or_panic(prefix_len);
            if candidate != ascii_j {
                ascii_j = candidate;
            }
        }
        // If there are no different bytes at this prefix level, we can add an ASCII or Span
        // node and then continue to the next iteration of the main loop.
        if ascii_i == key_ascii && ascii_j == key_ascii {
            let len = self.prepend_ascii(key_ascii)?;
            current_len += len;
            if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase)
                && i == new_i + 2
            {
                // This can happen if two strings were picked up, each with a different case
                return Err(ZeroTrieBuildError::MixedCase);
            }
            debug_assert!(
                i == new_i || i == new_i + 1,
                "only the exact prefix string can be picked up at this level: {key_ascii}"
            );
            i = new_i;
            debug_assert_eq!(j, new_j);
            continue;
        }
        // If i and j changed, we are a target of a branch node.
        if ascii_j == key_ascii {
            // We are the _last_ target of a branch node.
            lengths_stack.push(BranchMeta {
                ascii: key_ascii,
                cumulative_length: current_len,
                local_length: current_len,
                count: 1,
            });
        } else {
            // We are the _not the last_ target of a branch node.
            let BranchMeta {
                cumulative_length,
                count,
                ..
            } = lengths_stack.peek_or_panic();
            lengths_stack.push(BranchMeta {
                ascii: key_ascii,
                cumulative_length: cumulative_length + current_len,
                local_length: current_len,
                count: count + 1,
            });
        }
        if ascii_i != key_ascii {
            // We are _not the first_ target of a branch node.
            // Set the cursor to the previous string and continue the loop.
            j = i;
            i -= 1;
            prefix_len = all_items.get(i).unwrap().0.len();
            current_len = 0;
            continue;
        }
        // Branch (first)
        // std::println!("lengths_stack: {lengths_stack:?}");
        let (total_length, total_count) = {
            let BranchMeta {
                cumulative_length,
                count,
                ..
            } = lengths_stack.peek_or_panic();
            (cumulative_length, count)
        };
        let mut branch_metas = lengths_stack.pop_many_or_panic(total_count);
        let original_keys = branch_metas.map_to_ascii_bytes();
        if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase) {
            // Check to see if we have the same letter in two different cases
            let mut seen_ascii_alpha = [false; 26];
            for c in original_keys.as_const_slice().as_slice() {
                if c.is_ascii_alphabetic() {
                    let i = (c.to_ascii_lowercase() - b'a') as usize;
                    #[allow(clippy::indexing_slicing)] // 26 letters
                    if seen_ascii_alpha[i] {
                        return Err(ZeroTrieBuildError::MixedCase);
                    } else {
                        seen_ascii_alpha[i] = true;
                    }
                }
            }
        }
        let use_phf = matches!(self.options.phf_mode, PhfMode::UsePhf);
        let opt_phf_vec = if total_count > 15 && use_phf {
            let phf_vec = self
                .phf_cache
                .try_get_or_insert(original_keys.as_const_slice().as_slice().to_vec())?;
            // Put everything in order via bubble sort
            // Note: branch_metas is stored in reverse order (0 = last element)
            loop {
                let mut l = total_count - 1;
                let mut changes = 0;
                let mut start = 0;
                while l > 0 {
                    let a = *branch_metas.as_const_slice().get_or_panic(l);
                    let b = *branch_metas.as_const_slice().get_or_panic(l - 1);
                    let a_idx = phf_vec.keys().iter().position(|x| x == &a.ascii).unwrap();
                    let b_idx = phf_vec.keys().iter().position(|x| x == &b.ascii).unwrap();
                    if a_idx > b_idx {
                        // std::println!("{a:?} <=> {b:?} ({phf_vec:?})");
                        // This method call won't panic because the ranges are valid.
                        self.data.atbs_swap_ranges(
                            start,
                            start + a.local_length,
                            start + a.local_length + b.local_length,
                        );
                        branch_metas = branch_metas.swap_or_panic(l - 1, l);
                        start += b.local_length;
                        changes += 1;
                        // FIXME: fix the `length` field
                    } else {
                        start += a.local_length;
                    }
                    l -= 1;
                }
                if changes == 0 {
                    break;
                }
            }
            Some(phf_vec)
        } else {
            None
        };
        // Write out the offset table
        current_len = total_length;
        const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
        // `w` is the number of extra bytes needed per offset (offset width - 1).
        let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
        if w > 3 && matches!(self.options.capacity_mode, CapacityMode::Normal) {
            return Err(ZeroTrieBuildError::CapacityExceeded);
        }
        let mut k = 0;
        while k <= w {
            // One row of the offset table per byte of offset width.
            self.data.atbs_prepend_n_zeros(total_count - 1);
            current_len += total_count - 1;
            let mut l = 0;
            let mut length_to_write = 0;
            while l < total_count {
                let BranchMeta { local_length, .. } = *branch_metas
                    .as_const_slice()
                    .get_or_panic(total_count - l - 1);
                let mut adjusted_length = length_to_write;
                let mut m = 0;
                while m < k {
                    adjusted_length >>= 8;
                    m += 1;
                }
                if l > 0 {
                    self.data.atbs_bitor_assign(l - 1, adjusted_length as u8);
                }
                l += 1;
                length_to_write += local_length;
            }
            k += 1;
        }
        // Write out the lookup table
        assert!(0 < total_count && total_count <= 256);
        // Branch head value packs the offset width `w` with the count (256 -> 0).
        let branch_value = (w << 8) + (total_count & 0xff);
        if let Some(phf_vec) = opt_phf_vec {
            self.data.atbs_extend_front(phf_vec.as_bytes());
            let phf_len = phf_vec.as_bytes().len();
            let branch_len = self.prepend_branch(branch_value);
            current_len += phf_len + branch_len;
        } else {
            let search_len = self.prepend_slice(original_keys.as_slice());
            let branch_len = self.prepend_branch(branch_value);
            current_len += search_len + branch_len;
        }
        i = new_i;
        j = new_j;
    }
    assert!(lengths_stack.is_empty());
    Ok(current_len)
}
}
/// Comparator used to order `(key, value)` pairs before building the trie.
///
/// Keys are compared byte-wise (case-folded first when the builder is not
/// case-sensitive); ties are broken by comparing the values.
fn cmp_keys_values(
    options: ZeroTrieBuilderOptions,
    a: (&[u8], usize),
    b: (&[u8], usize),
) -> Ordering {
    let key_order = if matches!(options.case_sensitivity, CaseSensitivity::Sensitive) {
        a.0.cmp(b.0)
    } else {
        // Compare as if both keys were lowercased, without allocating.
        let lowered_a = a.0.iter().map(u8::to_ascii_lowercase);
        let lowered_b = b.0.iter().map(u8::to_ascii_lowercase);
        lowered_a.cmp(lowered_b)
    };
    key_order.then_with(|| a.1.cmp(&b.1))
}

View File

@@ -0,0 +1,9 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod builder;
mod store;
pub(crate) use builder::*;
pub(crate) use store::TrieBuilderStore;

View File

@@ -0,0 +1,192 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains internal collections for the non-const builder.
use super::super::branch_meta::BranchMeta;
use super::super::konst::ConstArrayBuilder;
use alloc::collections::VecDeque;
use alloc::vec::Vec;
/// A trait applied to a data structure for building a ZeroTrie.
pub(crate) trait TrieBuilderStore {
    /// Create a new empty store.
    fn atbs_new_empty() -> Self;
    /// Return the length in bytes of the store.
    fn atbs_len(&self) -> usize;
    /// Push a byte to the front of the store.
    fn atbs_push_front(&mut self, byte: u8);
    /// Push multiple bytes to the front of the store.
    fn atbs_extend_front(&mut self, other: &[u8]);
    /// Read the store into a `Vec<u8>`.
    fn atbs_to_bytes(&self) -> Vec<u8>;
    /// Perform the operation `self[index] |= bits`
    fn atbs_bitor_assign(&mut self, index: usize, bits: u8);
    /// Swap the adjacent ranges `self[start..mid]` and `self[mid..limit]`.
    ///
    /// # Panics
    ///
    /// Panics if the specified ranges are invalid.
    fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize);
    /// Remove and return the first element in the store, or `None` if empty.
    fn atbs_pop_front(&mut self) -> Option<u8>;
    /// Prepend `n` zeros to the front of the store.
    fn atbs_prepend_n_zeros(&mut self, n: usize) {
        // Default implementation expressed in terms of `atbs_push_front`.
        for _ in 0..n {
            self.atbs_push_front(0);
        }
    }
}
impl TrieBuilderStore for VecDeque<u8> {
    fn atbs_new_empty() -> Self {
        VecDeque::new()
    }
    fn atbs_len(&self) -> usize {
        self.len()
    }
    fn atbs_push_front(&mut self, byte: u8) {
        self.push_front(byte);
    }
    fn atbs_extend_front(&mut self, other: &[u8]) {
        // Push in reverse so `other` ends up at the front in its original order.
        self.reserve(other.len());
        for b in other.iter().rev() {
            self.push_front(*b);
        }
    }
    fn atbs_to_bytes(&self) -> Vec<u8> {
        // A VecDeque's ring buffer may wrap around; copy both contiguous halves.
        let mut v = Vec::with_capacity(self.len());
        let (a, b) = self.as_slices();
        v.extend(a);
        v.extend(b);
        v
    }
    fn atbs_bitor_assign(&mut self, index: usize, bits: u8) {
        self[index] |= bits;
    }
    /// # Panics
    /// Panics if the specified ranges are invalid.
    #[allow(clippy::panic)] // documented
    fn atbs_swap_ranges(&mut self, mut start: usize, mut mid: usize, mut limit: usize) {
        if start > mid || mid > limit {
            panic!("Invalid args to atbs_swap_ranges(): start > mid || mid > limit");
        }
        if limit > self.len() {
            panic!(
                "Invalid args to atbs_swap_ranges(): limit out of range: {limit} > {}",
                self.len()
            );
        }
        // The following algorithm is an in-place swap of two adjacent ranges of potentially
        // different lengths. Would make a good coding interview question.
        loop {
            if start == mid || mid == limit {
                return;
            }
            let len0 = mid - start;
            let len1 = limit - mid;
            // Swap the shorter range with the equally sized region at the tail of
            // the window, then shrink the window to the still-unswapped portion.
            let mut i = start;
            let mut j = limit - core::cmp::min(len0, len1);
            while j < limit {
                self.swap(i, j);
                i += 1;
                j += 1;
            }
            if len0 < len1 {
                mid = start + len0;
                limit -= len0;
            } else {
                start += len1;
                mid = limit - len1;
            }
        }
    }
    fn atbs_pop_front(&mut self) -> Option<u8> {
        self.pop_front()
    }
}
/// A data structure that holds any number of [`BranchMeta`] items.
pub(crate) struct NonConstLengthsStack {
    // Stack storage; the top of the stack is the end of the Vec.
    data: Vec<BranchMeta>,
}
impl core::fmt::Debug for NonConstLengthsStack {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Delegate to the Debug impl of the underlying metadata slice.
        core::fmt::Debug::fmt(self.as_slice(), f)
    }
}
impl NonConstLengthsStack {
    /// Creates a new empty [`NonConstLengthsStack`].
    pub const fn new() -> Self {
        Self { data: Vec::new() }
    }
    /// Returns whether the stack is empty.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
    /// Adds a [`BranchMeta`] to the stack.
    pub fn push(&mut self, meta: BranchMeta) {
        self.data.push(meta);
    }
    /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if
    /// the stack is empty.
    #[allow(clippy::unwrap_used)] // "panic" is in the method name
    pub fn peek_or_panic(&self) -> BranchMeta {
        self.data.last().copied().unwrap()
    }
    /// Removes many [`BranchMeta`]s from the stack, returning them in a [`ConstArrayBuilder`].
    pub fn pop_many_or_panic(&mut self, len: usize) -> ConstArrayBuilder<256, BranchMeta> {
        debug_assert!(len <= 256);
        let mut result = ConstArrayBuilder::new_empty([BranchMeta::default(); 256], 256);
        // Walk from the top of the stack downward, prepending each element so
        // that the deepest popped item ends up at the front of the array.
        for ix in 0..len {
            let i = self.data.len() - ix - 1;
            let meta = match self.data.get(i) {
                Some(x) => *x,
                None => unreachable!("Not enough items in the ConstLengthsStack"),
            };
            // Won't panic because len <= 256
            result = result.const_push_front_or_panic(meta);
        }
        self.data.truncate(self.data.len() - len);
        result
    }
    /// Non-const function that returns the initialized elements as a slice.
    fn as_slice(&self) -> &[BranchMeta] {
        &self.data
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_swap_ranges() {
        // Swap the adjacent ranges "abcde" ([2..7]) and "fghijkl" ([7..14]) in place.
        let mut deque: VecDeque<u8> = b"..abcdefghijkl=".iter().copied().collect();
        deque.atbs_swap_ranges(2, 7, 14);
        assert_eq!(deque.atbs_to_bytes(), b"..fghijklabcde=");
    }
}

214
vendor/zerotrie/src/byte_phf/builder.rs vendored Normal file
View File

@@ -0,0 +1,214 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::*;
use crate::error::ZeroTrieBuildError;
use alloc::vec;
use alloc::vec::Vec;
/// To speed up the search algorithm, we limit the number of times the level-2 parameter (q)
/// can hit its max value (initially Q_FAST_MAX) before we try the next level-1 parameter (p).
/// In practice, this has a small impact on the resulting perfect hash, resulting in about
/// 1 in 10000 hash maps that fall back to the slow path.
const MAX_L2_SEARCH_MISSES: usize = 24;
/// Directly compute the perfect hash function.
///
/// Returns `(p, [q_0, q_1, ..., q_(N-1)])`, or an error if the PHF could not be computed.
#[allow(unused_labels)] // for readability
#[allow(clippy::indexing_slicing)] // carefully reviewed to not panic
pub fn find(bytes: &[u8]) -> Result<(u8, Vec<u8>), ZeroTrieBuildError> {
    let n_usize = bytes.len();
    let mut p = 0u8;
    // `qq`: final q parameters indexed by bucket; `bqs`: scratch q values indexed
    // by position in the sorted bucket order; `seen`: occupancy of level-2 slots.
    let mut qq = vec![0u8; n_usize];
    let mut bqs = vec![0u8; n_usize];
    let mut seen = vec![false; n_usize];
    let max_allowable_p = P_FAST_MAX;
    let mut max_allowable_q = Q_FAST_MAX;
    #[allow(non_snake_case)]
    let N = if n_usize > 0 && n_usize < 256 {
        n_usize as u8
    } else {
        // Degenerate sizes need no search; all-zero parameters suffice.
        debug_assert!(n_usize == 0 || n_usize == 256);
        return Ok((p, qq));
    };
    'p_loop: loop {
        // Vec of tuples: (index, bucket count)
        let mut buckets: Vec<(usize, Vec<u8>)> = (0..n_usize).map(|i| (i, vec![])).collect();
        for byte in bytes {
            let l1 = f1(*byte, p, N) as usize;
            buckets[l1].1.push(*byte);
        }
        // Process the fullest buckets first: they are the hardest to place.
        buckets.sort_by_key(|(_, v)| -(v.len() as isize));
        // println!("New P: p={p:?}, buckets={buckets:?}");
        let mut i = 0;
        let mut num_max_q = 0;
        bqs.fill(0);
        seen.fill(false);
        'q_loop: loop {
            // Loop condition: exit when i is beyond the buckets length
            if i == buckets.len() {
                // Success: translate the positional q values back to bucket order.
                for (local_j, real_j) in buckets.iter().map(|(j, _)| *j).enumerate() {
                    debug_assert!(local_j < n_usize); // comes from .enumerate()
                    debug_assert!(real_j < n_usize); // first item of bucket tuple is an index
                    qq[real_j] = bqs[local_j];
                }
                // println!("Success: p={p:?}, num_max_q={num_max_q:?}, bqs={bqs:?}, qq={qq:?}");
                // if num_max_q > 0 {
                //     println!("num_max_q={num_max_q:?}");
                // }
                return Ok((p, qq));
            }
            let mut bucket = buckets[i].1.as_slice();
            'byte_loop: for (j, byte) in bucket.iter().enumerate() {
                let l2 = f2(*byte, bqs[i], N) as usize;
                if seen[l2] {
                    // Collision: un-mark the slots taken by this bucket so far...
                    // println!("Skipping Q: p={p:?}, i={i:?}, byte={byte:}, q={i:?}, l2={:?}", f2(*byte, bqs[i], N));
                    for k_byte in &bucket[0..j] {
                        let l2 = f2(*k_byte, bqs[i], N) as usize;
                        assert!(seen[l2]);
                        seen[l2] = false;
                    }
                    // ...then advance this bucket's q, or backtrack to earlier buckets.
                    'reset_loop: loop {
                        if bqs[i] < max_allowable_q {
                            bqs[i] += 1;
                            continue 'q_loop;
                        }
                        num_max_q += 1;
                        bqs[i] = 0;
                        if i == 0 || num_max_q > MAX_L2_SEARCH_MISSES {
                            if p == max_allowable_p && max_allowable_q != Q_REAL_MAX {
                                // println!("Could not solve fast function: trying again: {bytes:?}");
                                max_allowable_q = Q_REAL_MAX;
                                p = 0;
                                continue 'p_loop;
                            } else if p == max_allowable_p {
                                // If a fallback algorithm for `p` is added, relax this assertion
                                // and re-run the loop with a higher `max_allowable_p`.
                                debug_assert_eq!(max_allowable_p, P_REAL_MAX);
                                // println!("Could not solve PHF function");
                                return Err(ZeroTrieBuildError::CouldNotSolvePerfectHash);
                            } else {
                                p += 1;
                                continue 'p_loop;
                            }
                        }
                        // Backtrack: free the previous bucket's slots and retry it.
                        i -= 1;
                        bucket = buckets[i].1.as_slice();
                        for byte in bucket {
                            let l2 = f2(*byte, bqs[i], N) as usize;
                            assert!(seen[l2]);
                            seen[l2] = false;
                        }
                    }
                } else {
                    // println!("Marking as seen: i={i:?}, byte={byte:}, l2={:?}", f2(*byte, bqs[i], N));
                    let l2 = f2(*byte, bqs[i], N) as usize;
                    seen[l2] = true;
                }
            }
            // println!("Found Q: i={i:?}, q={:?}", bqs[i]);
            i += 1;
        }
    }
}
impl PerfectByteHashMap<Vec<u8>> {
    /// Computes a new [`PerfectByteHashMap`].
    ///
    /// (this is a doc-hidden API)
    #[allow(clippy::indexing_slicing)] // carefully reviewed to not panic
    pub fn try_new(keys: &[u8]) -> Result<Self, ZeroTrieBuildError> {
        let n_usize = keys.len();
        let n = n_usize as u8;
        // Solve for the hash parameters; bail out if no solution exists.
        let (p, mut qq) = find(keys)?;
        // Place each key at the slot the two-level hash assigns to it.
        let mut keys_permuted = vec![0; n_usize];
        for &key in keys {
            let bucket = f1(key, p, n) as usize;
            let q = qq[bucket];
            let slot = f2(key, q, n) as usize;
            keys_permuted[slot] = key;
        }
        // Serialized layout: [p, q_0..q_(n-1), permuted keys].
        let mut result = Vec::with_capacity(n_usize * 2 + 1);
        result.push(p);
        result.append(&mut qq);
        result.append(&mut keys_permuted);
        Ok(Self(result))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    extern crate std;
    use std::print;
    use std::println;
    // Prints a byte as a quoted character if alphanumeric, else as hex.
    fn print_byte_to_stdout(byte: u8) {
        let c = char::from(byte);
        if c.is_ascii_alphanumeric() {
            print!("'{c}'");
        } else {
            print!("0x{byte:X}");
        }
    }
    // Returns `len` distinct ASCII alphanumerics chosen deterministically from `seed`.
    fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        let mut bytes: Vec<u8> =
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789".into();
        let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
        bytes.partial_shuffle(&mut rng, len).0.into()
    }
    #[test]
    fn test_random_distributions() {
        let mut p_distr = vec![0; 256];
        let mut q_distr = vec![0; 256];
        for len in 0..50 {
            for seed in 0..50 {
                let bytes = random_alphanums(seed, len);
                let (p, qq) = find(bytes.as_slice()).unwrap();
                p_distr[p as usize] += 1;
                for q in qq {
                    q_distr[q as usize] += 1;
                }
            }
        }
        println!("p_distr: {p_distr:?}");
        println!("q_distr: {q_distr:?}");
        // Expected counts for this fixed seed set: every `p` lands in the fast
        // range, and only a handful of `q` values fall into the slow range.
        let fast_p = p_distr[0..=P_FAST_MAX as usize].iter().sum::<usize>();
        let slow_p = p_distr[(P_FAST_MAX + 1) as usize..].iter().sum::<usize>();
        let fast_q = q_distr[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let slow_q = q_distr[(Q_FAST_MAX + 1) as usize..].iter().sum::<usize>();
        assert_eq!(2500, fast_p);
        assert_eq!(0, slow_p);
        assert_eq!(61243, fast_q);
        assert_eq!(7, slow_q);
        let bytes = random_alphanums(0, 16);
        #[allow(non_snake_case)]
        let N = u8::try_from(bytes.len()).unwrap();
        let (p, qq) = find(bytes.as_slice()).unwrap();
        println!("Results:");
        for byte in bytes.iter() {
            print_byte_to_stdout(*byte);
            let l1 = f1(*byte, p, N) as usize;
            let q = qq[l1];
            let l2 = f2(*byte, q, N) as usize;
            println!(" => l1 {l1} => q {q} => l2 {l2}");
        }
    }
}

View File

@@ -0,0 +1,39 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::*;
use crate::error::ZeroTrieBuildError;
use alloc::collections::btree_map::Entry;
use alloc::collections::BTreeMap;
use alloc::vec::Vec;
/// Helper class for caching the results of multiple [`PerfectByteHashMap`] calculations.
pub struct PerfectByteHashMapCacheOwned {
    // Note: This should probably be a HashMap but that isn't in `alloc`
    // Maps each distinct key set to its previously computed PHF.
    data: BTreeMap<Vec<u8>, PerfectByteHashMap<Vec<u8>>>,
}
impl PerfectByteHashMapCacheOwned {
    /// Creates a new empty instance.
    pub fn new_empty() -> Self {
        Self {
            data: BTreeMap::new(),
        }
    }
    /// Gets the [`PerfectByteHashMap`] for the given bytes, calculating it if necessary.
    pub fn try_get_or_insert(
        &mut self,
        keys: Vec<u8>,
    ) -> Result<&PerfectByteHashMap<[u8]>, ZeroTrieBuildError> {
        // Use the entry API so `keys` is moved into the map only on a miss,
        // and the PHF is computed at most once per distinct key set.
        let mut_phf = match self.data.entry(keys) {
            Entry::Vacant(entry) => {
                // `entry.key()` borrows the keys now owned by the vacant entry.
                let value = PerfectByteHashMap::try_new(entry.key())?;
                entry.insert(value)
            }
            Entry::Occupied(entry) => entry.into_mut(),
        };
        Ok(mut_phf.as_borrowed())
    }
}

485
vendor/zerotrie/src/byte_phf/mod.rs vendored Normal file
View File

@@ -0,0 +1,485 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![allow(rustdoc::private_intra_doc_links)] // doc(hidden) module
//! # Byte Perfect Hash Function Internals
//!
//! This module contains a perfect hash function (PHF) designed for a fast, compact perfect
//! hash over 1 to 256 nodes (bytes).
//!
//! The PHF uses the following variables:
//!
//! 1. A single parameter `p`, which is 0 in about 98% of cases.
//! 2. A list of `N` parameters `q_t`, one per _bucket_
//! 3. The `N` keys in an arbitrary order determined by the PHF
//!
//! Reading a `key` from the PHF uses the following algorithm:
//!
//! 1. Let `t`, the bucket index, be `f1(key, p)`.
//! 2. Let `i`, the key index, be `f2(key, q_t)`.
//! 3. If `key == k_i`, return `Some(i)`; else return `None`.
//!
//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across
//! serialization versions of `ZeroTrie`. They are very fast, constant-time operations as long
//! as `p` <= [`P_FAST_MAX`] and `q` <= [`Q_FAST_MAX`]. In practice, nearly 100% of parameter
//! values are in the fast range.
//!
//! ```
//! use zerotrie::_internal::PerfectByteHashMap;
//!
//! let phf_example_bytes = [
//! // `p` parameter
//! 1,
//! // `q` parameters, one for each of the N buckets
//! 0, 0, 1, 1,
//! // Exact keys to be compared with the input
//! b'e', b'a', b'c', b'g',
//! ];
//!
//! let phf = PerfectByteHashMap::from_bytes(&phf_example_bytes);
//!
//! // The PHF returns the index of the key or `None` if not found.
//! assert_eq!(phf.get(b'a'), Some(1));
//! assert_eq!(phf.get(b'b'), None);
//! assert_eq!(phf.get(b'c'), Some(2));
//! assert_eq!(phf.get(b'd'), None);
//! assert_eq!(phf.get(b'e'), Some(0));
//! assert_eq!(phf.get(b'f'), None);
//! assert_eq!(phf.get(b'g'), Some(3));
//! ```
use crate::helpers::*;
#[cfg(feature = "alloc")]
mod builder;
#[cfg(feature = "alloc")]
mod cached_owned;
#[cfg(feature = "alloc")]
pub use cached_owned::PerfectByteHashMapCacheOwned;
/// The cutoff for the fast version of [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_FAST_MAX: u8 = 95;
/// The cutoff for the fast version of [`f2`].
const Q_FAST_MAX: u8 = 95;
/// The largest `p` the builder will attempt. This could be raised if found to be
/// necessary; values above [`P_FAST_MAX`] would need an additional `p` mode in [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_REAL_MAX: u8 = P_FAST_MAX;
/// The largest `q` the builder will attempt. This could be raised if found to be necessary.
#[cfg(feature = "alloc")] // used in the builder code
const Q_REAL_MAX: u8 = 127;
/// The first-level hash function of the PHF. For the exact formula, please read the code.
///
/// With `p == 0`, this degenerates to a plain modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f1;
/// const N: u8 = 10;
///
/// // With p = 0:
/// assert_eq!(0, f1(0, 0, N));
/// assert_eq!(1, f1(1, 0, N));
/// assert_eq!(2, f1(2, 0, N));
/// assert_eq!(9, f1(9, 0, N));
/// assert_eq!(0, f1(10, 0, N));
/// assert_eq!(1, f1(11, 0, N));
/// assert_eq!(2, f1(12, 0, N));
/// assert_eq!(9, f1(19, 0, N));
///
/// // With p = 1:
/// assert_eq!(1, f1(0, 1, N));
/// assert_eq!(0, f1(1, 1, N));
/// assert_eq!(2, f1(2, 1, N));
/// assert_eq!(2, f1(9, 1, N));
/// assert_eq!(4, f1(10, 1, N));
/// assert_eq!(5, f1(11, 1, N));
/// assert_eq!(1, f1(12, 1, N));
/// assert_eq!(7, f1(19, 1, N));
/// ```
#[inline]
pub fn f1(byte: u8, p: u8, n: u8) -> u8 {
    match (n, p) {
        // Degenerate map size: nothing to reduce into, pass the byte through.
        (0, _) => byte,
        // The common case (~98% of tries): a plain modulus.
        (_, 0) => byte % n,
        _ => {
            // Constant-time mixing step. If needed, another mode could be
            // added here for `p > P_FAST_MAX` to crack difficult key sets.
            let mixed = byte ^ p ^ byte.wrapping_shr(u32::from(p));
            mixed % n
        }
    }
}
/// The second-level hash function of the PHF. For the exact formula, please read the code.
///
/// With `q == 0`, this degenerates to a plain modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f2;
/// const N: u8 = 10;
///
/// // With q = 0:
/// assert_eq!(0, f2(0, 0, N));
/// assert_eq!(1, f2(1, 0, N));
/// assert_eq!(2, f2(2, 0, N));
/// assert_eq!(9, f2(9, 0, N));
/// assert_eq!(0, f2(10, 0, N));
/// assert_eq!(1, f2(11, 0, N));
/// assert_eq!(2, f2(12, 0, N));
/// assert_eq!(9, f2(19, 0, N));
///
/// // With q = 1:
/// assert_eq!(1, f2(0, 1, N));
/// assert_eq!(0, f2(1, 1, N));
/// assert_eq!(3, f2(2, 1, N));
/// assert_eq!(8, f2(9, 1, N));
/// assert_eq!(1, f2(10, 1, N));
/// assert_eq!(0, f2(11, 1, N));
/// assert_eq!(3, f2(12, 1, N));
/// assert_eq!(8, f2(19, 1, N));
/// ```
#[inline]
pub fn f2(byte: u8, q: u8, n: u8) -> u8 {
    if n == 0 {
        return byte;
    }
    // The single constant-time XOR handles almost every key set; the fold
    // below runs only for the rare difficult cases where `q > Q_FAST_MAX`,
    // applying one extra linear-time mixing round per excess step.
    let mixed = (Q_FAST_MAX..q).fold(byte ^ q, |acc, _| acc ^ (acc << 1) ^ (acc >> 1));
    mixed % n
}
/// A constant-time map from bytes to unique indices.
///
/// Uses a perfect hash function (see module-level documentation). Does not support mutation.
///
/// Standard layout: P, N bytes of Q, N bytes of expected keys
///
/// (For N keys, the store is therefore `1 + 2N` bytes long.)
#[derive(Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct PerfectByteHashMap<Store: ?Sized>(Store);
impl<Store> PerfectByteHashMap<Store> {
    /// Creates an instance from a pre-existing store. See [`Self::as_bytes`].
    ///
    /// The store is wrapped as-is; no validation of the layout is performed.
    #[inline]
    pub fn from_store(store: Store) -> Self {
        Self(store)
    }
}
impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets the usize for the given byte, or `None` if it is not in the map.
    pub fn get(&self, key: u8) -> Option<usize> {
        // Layout: [p, q_0 .. q_{n-1}, key_0 .. key_{n-1}]
        let (p, buffer) = self.0.as_ref().split_first()?;
        // Note: there are N buckets followed by N keys
        let n_usize = buffer.len() / 2;
        if n_usize == 0 {
            return None;
        }
        let n = n_usize as u8;
        let (qq, eks) = buffer.debug_split_at(n_usize);
        debug_assert_eq!(qq.len(), eks.len());
        // First-level hash picks the bucket; the bucket's q parameter feeds
        // the second-level hash, which picks the candidate slot.
        let l1 = f1(key, *p, n) as usize;
        let q = debug_unwrap!(qq.get(l1), return None);
        let l2 = f2(key, *q, n) as usize;
        let ek = debug_unwrap!(eks.get(l2), return None);
        // Confirm the candidate slot actually holds the queried byte.
        if *ek == key {
            Some(l2)
        } else {
            None
        }
    }
    /// This is called `num_items` because `len` is ambiguous: it could refer
    /// to the number of items or the number of bytes.
    pub fn num_items(&self) -> usize {
        // The store is 1 + 2N bytes, so integer division by 2 yields N.
        self.0.as_ref().len() / 2
    }
    /// Get an iterator over the keys in the order in which they are stored in the map.
    pub fn keys(&self) -> &[u8] {
        let n = self.num_items();
        // Skip the p byte and the N q parameters; the remainder is the keys.
        self.0.as_ref().debug_split_at(1 + n).1
    }
    /// Diagnostic function that returns `p` and the maximum value of `q`
    #[cfg(test)]
    pub fn p_qmax(&self) -> Option<(u8, u8)> {
        let (p, buffer) = self.0.as_ref().split_first()?;
        let n = buffer.len() / 2;
        if n == 0 {
            return None;
        }
        let (qq, _) = buffer.debug_split_at(n);
        Some((*p, *qq.iter().max().unwrap()))
    }
    /// Returns the map as bytes. The map can be recovered with [`Self::from_store`]
    /// or [`Self::from_bytes`].
    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_ref()
    }
    // Exhaustive self-test: every byte 0..=255 must resolve to a distinct
    // slot if and only if it is one of the stored keys.
    #[cfg(all(feature = "alloc", test))]
    pub(crate) fn check(&self) -> Result<(), (&'static str, u8)> {
        use alloc::vec;
        let len = self.num_items();
        let mut seen = vec![false; len];
        for b in 0..=255u8 {
            let get_result = self.get(b);
            if self.keys().contains(&b) {
                let i = get_result.ok_or(("expected to find", b))?;
                if seen[i] {
                    return Err(("seen", b));
                }
                seen[i] = true;
            } else if get_result.is_some() {
                return Err(("did not expect to find", b));
            }
        }
        Ok(())
    }
}
impl PerfectByteHashMap<[u8]> {
    /// Creates an instance from pre-existing bytes. See [`Self::as_bytes`].
    ///
    /// This is a zero-cost reference cast; the lifetime of the input is kept.
    #[inline]
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        // Safety: Self is repr(transparent) over [u8]
        unsafe { core::mem::transmute(bytes) }
    }
}
impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Converts from `PerfectByteHashMap<AsRef<[u8]>>` to `&PerfectByteHashMap<[u8]>`
    ///
    /// Useful for erasing the concrete store type behind a borrowed view.
    #[inline]
    pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> {
        PerfectByteHashMap::from_bytes(self.0.as_ref())
    }
}
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;
    extern crate std;
    // Returns `len` distinct ASCII alphanumerics, chosen deterministically
    // from `seed` so the tests are reproducible.
    fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        let mut bytes: Vec<u8> =
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789".into();
        let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
        bytes.partial_shuffle(&mut rng, len).0.into()
    }
    #[test]
    fn test_smaller() {
        // Histograms of the solved parameters, indexed by parameter value.
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 1..16 {
            for seed in 0..150 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (smaller): {count_by_p:?}");
        std::println!("count_by_qmax (smaller): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }
    #[test]
    fn test_larger() {
        // Same as test_smaller, but with larger key sets (16..60 keys).
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 16..60 {
            for seed in 0..75 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (larger): {count_by_p:?}");
        std::println!("count_by_qmax (larger): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }
    #[test]
    fn test_hard_cases() {
        // A known-difficult key set: all bytes 0..=126 plus 195 and 196.
        let keys = [
            0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
            46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
            90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
            126, 195, 196,
        ];
        let computed = PerfectByteHashMap::try_new(&keys).unwrap();
        let (p, qmax) = computed.p_qmax().unwrap();
        assert_eq!(p, 69);
        assert_eq!(qmax, 67);
    }
    #[test]
    fn test_build_read_small() {
        #[derive(Debug)]
        struct TestCase<'a> {
            // Input keys to build the map from.
            keys: &'a str,
            // Expected serialized bytes: p, N q values, N reordered keys.
            expected: &'a [u8],
            // The keys in the order the PHF stores them.
            reordered_keys: &'a str,
        }
        let cases = [
            TestCase {
                keys: "ab",
                expected: &[0, 0, 0, b'b', b'a'],
                reordered_keys: "ba",
            },
            TestCase {
                keys: "abc",
                expected: &[0, 0, 0, 0, b'c', b'a', b'b'],
                reordered_keys: "cab",
            },
            TestCase {
                // Note: splitting "a" and "c" into different buckets requires the heavier hash
                // function because the difference between "a" and "c" is the period (2).
                keys: "ac",
                expected: &[1, 0, 1, b'c', b'a'],
                reordered_keys: "ca",
            },
            TestCase {
                keys: "aceg",
                expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'],
                reordered_keys: "eacg",
            },
            TestCase {
                keys: "abd",
                expected: &[0, 0, 1, 3, b'a', b'b', b'd'],
                reordered_keys: "abd",
            },
            TestCase {
                keys: "def",
                expected: &[0, 0, 0, 0, b'f', b'd', b'e'],
                reordered_keys: "fde",
            },
            TestCase {
                keys: "fi",
                expected: &[0, 0, 0, b'f', b'i'],
                reordered_keys: "fi",
            },
            TestCase {
                keys: "gh",
                expected: &[0, 0, 0, b'h', b'g'],
                reordered_keys: "hg",
            },
            TestCase {
                keys: "lm",
                expected: &[0, 0, 0, b'l', b'm'],
                reordered_keys: "lm",
            },
            TestCase {
                // Note: "a" and "q" (0x61 and 0x71) are very hard to split; only a handful of
                // hash function crates can get them into separate buckets.
                keys: "aq",
                expected: &[4, 0, 1, b'a', b'q'],
                reordered_keys: "aq",
            },
            TestCase {
                keys: "xy",
                expected: &[0, 0, 0, b'x', b'y'],
                reordered_keys: "xy",
            },
            TestCase {
                keys: "xyz",
                expected: &[0, 0, 0, 0, b'x', b'y', b'z'],
                reordered_keys: "xyz",
            },
            TestCase {
                keys: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 16,
                    16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                    2, 0, 7, 104, 105, 106, 107, 108, 109, 110, 111, 112, 117, 118, 119, 68, 69,
                    70, 113, 114, 65, 66, 67, 120, 121, 122, 115, 72, 73, 74, 71, 80, 81, 82, 83,
                    84, 85, 86, 87, 88, 89, 90, 75, 76, 77, 78, 79, 103, 97, 98, 99, 116, 100, 102,
                    101,
                ],
                reordered_keys: "hijklmnopuvwDEFqrABCxyzsHIJGPQRSTUVWXYZKLMNOgabctdfe",
            },
            TestCase {
                keys: "abcdefghij",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 97, 98, 99,
                ],
                reordered_keys: "defghijabc",
            },
            TestCase {
                // This is a small case that resolves to the slow hasher
                keys: "Jbej",
                expected: &[2, 0, 0, 102, 0, b'j', b'e', b'b', b'J'],
                reordered_keys: "jebJ",
            },
            TestCase {
                // This is another small case that resolves to the slow hasher
                keys: "JFNv",
                expected: &[1, 98, 0, 2, 0, b'J', b'F', b'N', b'v'],
                reordered_keys: "JFNv",
            },
        ];
        for cas in cases {
            let computed = PerfectByteHashMap::try_new(cas.keys.as_bytes()).expect(cas.keys);
            assert_eq!(computed.as_bytes(), cas.expected, "{cas:?}");
            assert_eq!(computed.keys(), cas.reordered_keys.as_bytes(), "{cas:?}");
            computed.check().expect(cas.keys);
        }
    }
}

491
vendor/zerotrie/src/cursor.rs vendored Normal file
View File

@@ -0,0 +1,491 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Types for walking stepwise through a trie.
//!
//! For examples, see the `.cursor()` functions
//! and the `Cursor` types in this module.
use crate::reader;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrieSimpleAscii;
use core::fmt;
impl<Store> ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Returns a cursor positioned at the root of this trie.
    ///
    /// A cursor makes it possible to query the trie with data that is not
    /// available as a single contiguous slice, one byte at a time.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Look up a value by [writing](fmt::Write) the key to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Get out the value for "abc"
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// Longest-prefix matching can be implemented by interleaving
    /// [`ZeroTrieSimpleAsciiCursor::step`] and
    /// [`ZeroTrieSimpleAsciiCursor::take_value`] calls over the query bytes,
    /// remembering the last position at which a value was present.
    #[inline]
    pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor<'_> {
        let trie = self.as_borrowed_slice();
        ZeroTrieSimpleAsciiCursor { trie }
    }
}
impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Returns a cursor positioned at the root of this trie.
    ///
    /// A cursor makes it possible to query the trie with data that is not
    /// available as a single contiguous slice, one byte at a time.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Look up a value by [writing](fmt::Write) the key to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" (case-insensitive!)
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAscii::cursor`].
    #[inline]
    pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor<'_> {
        let trie = self.as_borrowed_slice();
        ZeroAsciiIgnoreCaseTrieCursor { trie }
    }
}
impl<'a> ZeroTrieSimpleAscii<&'a [u8]> {
    /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    ///
    /// The returned cursor borrows the trie's underlying data for `'a`.
    #[inline]
    pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> {
        ZeroTrieSimpleAsciiCursor { trie: self }
    }
}
impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> {
    /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    ///
    /// The returned cursor borrows the trie's underlying data for `'a`.
    #[inline]
    pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> {
        ZeroAsciiIgnoreCaseTrieCursor { trie: self }
    }
}
/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup.
///
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroTrieSimpleAsciiCursor<'a> {
    // The not-yet-consumed portion of the trie; `step` mutates this in place.
    trie: ZeroTrieSimpleAscii<&'a [u8]>,
}
/// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup.
///
/// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
    // The not-yet-consumed portion of the trie; `step` mutates this in place.
    trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
}
/// Information about a probed edge.
///
/// Returned by the `probe` methods on the cursor types in this module.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive] // no need to destructure or construct this in userland
pub struct AsciiProbeResult {
    /// The character's byte value between this node and its parent.
    pub byte: u8,
    /// The number of siblings of this node, _including itself_.
    pub total_siblings: u8,
}
impl ZeroTrieSimpleAsciiCursor<'_> {
    /// Advances the cursor by one character, selected by its byte value.
    ///
    /// Stepping past a byte for which no edge exists — including any
    /// non-ASCII byte — leaves the cursor empty, and it stays empty for
    /// all further steps.
    ///
    /// # Examples
    ///
    /// Unrolled lookup, checking for a value at every step:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'c');
    /// assert_eq!(cursor.take_value(), Some(0)); // "abc"
    /// cursor.step(b'd');
    /// assert!(!cursor.is_empty());
    /// cursor.step(b'x'); // no strings have the prefix "abcdx"
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None); // "abcdx"
    /// ```
    #[inline]
    pub fn step(&mut self, byte: u8) {
        reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte);
    }
    /// Removes and returns the value at the current position, if there is one.
    ///
    /// On a fresh cursor this is equivalent to calling `.get()` with the
    /// empty string, except that the value can be taken only once.
    ///
    /// # Examples
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "" and "abc"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81");
    ///
    /// assert_eq!(Some(0), trie.get(""));
    /// let mut cursor = trie.cursor();
    /// assert_eq!(Some(0), cursor.take_value());
    /// assert_eq!(None, cursor.take_value());
    /// ```
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }
    /// Advances the cursor down the child edge with the given index,
    /// returning the byte labeling that edge.
    ///
    /// This is similar to [`Self::step()`], but it selects the edge by
    /// position rather than by byte value, which enables stepwise iteration
    /// over the contents of the trie: since this consumes a step, clone the
    /// cursor (a cheap operation) to visit more than one child of a node.
    ///
    /// Returns `None` when `index` is out of range for the current node.
    ///
    /// # Examples
    ///
    /// Repeatedly probing index 0 walks to the first entry of the trie:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// let data: &[(String, usize)] = &[
    ///     ("ab".to_string(), 111),
    ///     ("abcxyz".to_string(), 22),
    ///     ("abde".to_string(), 333),
    ///     ("afg".to_string(), 44),
    /// ];
    ///
    /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
    ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
    ///
    /// let mut cursor = trie.cursor();
    /// let mut key = String::new();
    /// let value = loop {
    ///     if let Some(value) = cursor.take_value() {
    ///         break value;
    ///     }
    ///     let probe_result = cursor.probe(0).unwrap();
    ///     key.push(char::from(probe_result.byte));
    /// };
    ///
    /// assert_eq!(key, "ab");
    /// assert_eq!(value, 111);
    /// ```
    ///
    /// A full depth-first traversal of all entries can be built on top of
    /// this by keeping a stack of `(cursor clone, next child index)` pairs
    /// and using [`AsciiProbeResult::total_siblings`] to know when a node's
    /// children are exhausted.
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
    }
    /// Reports whether the cursor points to an empty trie.
    ///
    /// Use this to determine when to stop iterating.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}
impl ZeroAsciiIgnoreCaseTrieCursor<'_> {
    /// Advances the cursor by one byte, matching ASCII case-insensitively.
    ///
    /// On a match, returns the byte as stored in the trie, which may differ
    /// in case from the input byte. A `None` result means nothing matched,
    /// so any lookup loop can be terminated.
    ///
    /// # Examples
    ///
    /// Normalize the case of a key by stepping through an ignore-case trie:
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" and normalize the key string
    /// let mut cursor = trie.cursor();
    /// let mut key_str = Cow::Borrowed("abc".as_bytes());
    /// let mut i = 0;
    /// let value = loop {
    ///     let Some(&input_byte) = key_str.get(i) else {
    ///         break cursor.take_value();
    ///     };
    ///     let Some(matched_byte) = cursor.step(input_byte) else {
    ///         break None;
    ///     };
    ///     if matched_byte != input_byte {
    ///         key_str.to_mut()[i] = matched_byte;
    ///     }
    ///     i += 1;
    /// };
    ///
    /// assert_eq!(value, Some(0));
    /// assert_eq!(&*key_str, "aBc".as_bytes());
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
    #[inline]
    pub fn step(&mut self, byte: u8) -> Option<u8> {
        reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
    }
    /// Removes and returns the value at the current position, if there is one.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }
    /// Advances the cursor down the child edge with the given index.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
    }
    /// Reports whether the cursor points to an empty trie.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}
impl fmt::Write for ZeroTrieSimpleAsciiCursor<'_> {
/// Steps the cursor through each ASCII byte of the string.
///
/// If the string contains non-ASCII chars, an error is returned.
///
/// # Examples
///
/// ```
/// use core::fmt::Write;
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// let mut cursor = trie.cursor();
/// cursor.write_str("abcdxy").expect("all ASCII");
/// cursor.write_str("🚂").expect_err("non-ASCII");
/// ```
fn write_str(&mut self, s: &str) -> fmt::Result {
for b in s.bytes() {
if !b.is_ascii() {
return Err(fmt::Error);
}
self.step(b);
}
Ok(())
}
/// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns
/// an error if the char is non-ASCII.
///
/// # Examples
///
/// ```
/// use core::fmt::Write;
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// let mut cursor = trie.cursor();
/// cursor.write_char('a').expect("ASCII");
/// cursor.write_char('x').expect("ASCII");
/// cursor.write_char('🚂').expect_err("non-ASCII");
/// ```
fn write_char(&mut self, c: char) -> fmt::Result {
if !c.is_ascii() {
return Err(fmt::Error);
}
self.step(c as u8);
Ok(())
}
}
impl fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'_> {
/// Steps the cursor through each ASCII byte of the string.
///
/// If the string contains non-ASCII chars, an error is returned.
fn write_str(&mut self, s: &str) -> fmt::Result {
for b in s.bytes() {
if !b.is_ascii() {
return Err(fmt::Error);
}
self.step(b);
}
Ok(())
}
/// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns
/// an error if the char is non-ASCII.
fn write_char(&mut self, c: char) -> fmt::Result {
if !c.is_ascii() {
return Err(fmt::Error);
}
self.step(c as u8);
Ok(())
}
}

25
vendor/zerotrie/src/error.rs vendored Normal file
View File

@@ -0,0 +1,25 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use displaydoc::Display;
/// Error types for the `zerotrie` crate.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Display)]
#[non_exhaustive]
pub enum ZeroTrieBuildError {
    /// Non-ASCII data was added to an ASCII-only trie.
    #[displaydoc("Non-ASCII cannot be added to an ASCII-only trie")]
    NonAsciiError,
    /// The trie reached its maximum supported capacity.
    #[displaydoc("Reached maximum capacity of trie")]
    CapacityExceeded,
    /// The builder could not solve the perfect hash function.
    #[displaydoc("Failed to solve the perfect hash function. This is rare! Please report your case to the ICU4X team.")]
    CouldNotSolvePerfectHash,
    /// Mixed-case data was added to a case-insensitive trie.
    #[displaydoc("Mixed-case data added to case-insensitive trie")]
    MixedCase,
}
// Marker impl using the default `Error` methods; these are leaf errors with
// no underlying source.
impl core::error::Error for ZeroTrieBuildError {}

122
vendor/zerotrie/src/helpers.rs vendored Normal file
View File

@@ -0,0 +1,122 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub(crate) trait MaybeSplitAt<T> {
    /// Like `slice::split_at`, but instead of panicking on an out-of-range
    /// index it debug-asserts and yields the whole slice plus an empty tail.
    fn debug_split_at(&self, mid: usize) -> (&Self, &Self);
}
impl<T> MaybeSplitAt<T> for [T] {
    #[inline]
    fn debug_split_at(&self, mid: usize) -> (&Self, &Self) {
        match self.split_at_checked(mid) {
            Some(halves) => halves,
            None => {
                // GIGO fallback for release builds: keep everything in the head.
                debug_assert!(false, "debug_split_at: {mid} expected to be in range");
                (self, &[])
            }
        }
    }
}
pub(crate) trait DebugUnwrapOr<T> {
    /// Unwraps the option, or panics in debug mode and returns `gigo_value`
    /// in release mode.
    fn debug_unwrap_or(self, gigo_value: T) -> T;
}
impl<T> DebugUnwrapOr<T> for Option<T> {
    #[inline]
    fn debug_unwrap_or(self, gigo_value: T) -> T {
        if let Some(inner) = self {
            return inner;
        }
        // Loud failure in debug builds; GIGO fallback in release builds.
        debug_assert!(false, "debug_unwrap_or called on a None value");
        gigo_value
    }
}
// Unwraps an `Option`, debug-asserting (with an optional custom message) on
// `None` and then either `return`ing a given value, `break`ing, or returning
// `()`, depending on the rule matched. The default message is "invalid trie".
macro_rules! debug_unwrap {
    // Full form: custom return value and custom panic message.
    ($expr:expr, return $retval:expr, $($arg:tt)+) => {
        match $expr {
            Some(x) => x,
            None => {
                debug_assert!(false, $($arg)*);
                return $retval;
            }
        }
    };
    // Custom return value, default message.
    ($expr:expr, return $retval:expr) => {
        debug_unwrap!($expr, return $retval, "invalid trie")
    };
    // Break out of the enclosing loop instead of returning, custom message.
    ($expr:expr, break, $($arg:tt)+) => {
        match $expr {
            Some(x) => x,
            None => {
                debug_assert!(false, $($arg)*);
                break;
            }
        }
    };
    // Break with the default message.
    ($expr:expr, break) => {
        debug_unwrap!($expr, break, "invalid trie")
    };
    // Return `()` with a custom message.
    ($expr:expr, $($arg:tt)+) => {
        debug_unwrap!($expr, return (), $($arg)*)
    };
    // Return `()` with the default message.
    ($expr:expr) => {
        debug_unwrap!($expr, return ())
    };
}
pub(crate) use debug_unwrap;
/// The maximum number of base-10 digits required for rendering a usize.
/// Note: 24/10 is an approximation of 8*log10(2)
pub(crate) const MAX_USIZE_LEN_AS_DIGITS: usize = core::mem::size_of::<usize>() * 24 / 10 + 1;
/// Formats a usize as a string of length N, padded with spaces,
/// with the given prefix.
///
/// # Panics
///
/// If the string is too short, the function may panic. To prevent
/// this, N should be MAX_USIZE_LEN_AS_DIGITS larger than M.
#[allow(clippy::indexing_slicing)] // documented, and based on const parameters
pub(crate) const fn const_fmt_int<const M: usize, const N: usize>(
    prefix: [u8; M],
    value: usize,
) -> [u8; N] {
    let mut buffer = [b' '; N];
    // Copy the prefix into the head of the output buffer.
    let mut idx = 0;
    while idx < M {
        buffer[idx] = prefix[idx];
        idx += 1;
    }
    // Render the decimal digits right-aligned into a scratch array,
    // least-significant digit first.
    let mut digits = [b' '; MAX_USIZE_LEN_AS_DIGITS];
    let mut remaining = value;
    let mut cursor = MAX_USIZE_LEN_AS_DIGITS;
    loop {
        cursor -= 1;
        digits[cursor] = (remaining % 10) as u8 + b'0';
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }
    // Append the rendered digits immediately after the prefix; any unused
    // tail of the buffer keeps its space padding.
    let mut out_idx = M;
    while cursor < MAX_USIZE_LEN_AS_DIGITS {
        buffer[out_idx] = digits[cursor];
        out_idx += 1;
        cursor += 1;
    }
    buffer
}
#[test]
fn test_const_fmt_int() {
    // No prefix, digits exactly fill the width
    assert_eq!(*b"123", const_fmt_int::<0, 3>(*b"", 123));
    // No prefix, right-padded with spaces
    assert_eq!(*b"123   ", const_fmt_int::<0, 6>(*b"", 123));
    // Prefix plus digits exactly fill the width
    assert_eq!(*b"abc123", const_fmt_int::<3, 6>(*b"abc", 123));
}

87
vendor/zerotrie/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,87 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A data structure offering zero-copy storage and retrieval of byte strings, with a focus
//! on the efficient storage of ASCII strings. Strings are mapped to `usize` values.
//!
//! [`ZeroTrie`] does not support mutation because doing so would require recomputing the entire
//! data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`].
//!
//! There are multiple variants of [`ZeroTrie`] optimized for different use cases.
//!
//! # Examples
//!
//! ```
//! use zerotrie::ZeroTrie;
//!
//! let data: &[(&str, usize)] = &[("abc", 11), ("xyz", 22), ("axyb", 33)];
//!
//! let trie: ZeroTrie<Vec<u8>> = data.iter().copied().collect();
//!
//! assert_eq!(trie.get("axyb"), Some(33));
//! assert_eq!(trie.byte_len(), 18);
//! ```
//!
//! # Internal Structure
//!
//! To read about the internal structure of [`ZeroTrie`], build the docs with private modules:
//!
//! ```bash
//! cargo doc --document-private-items --all-features --no-deps --open
//! ```
//!
//! [`LiteMap`]: litemap::LiteMap
//! [`BTreeMap`]: alloc::collections::BTreeMap
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
mod builder;
mod byte_phf;
pub mod cursor;
mod error;
#[macro_use]
mod helpers;
mod options;
mod reader;
#[cfg(feature = "serde")]
mod serde;
mod varint;
mod zerotrie;
pub use crate::zerotrie::ZeroAsciiIgnoreCaseTrie;
pub use crate::zerotrie::ZeroTrie;
pub use crate::zerotrie::ZeroTrieExtendedCapacity;
pub use crate::zerotrie::ZeroTriePerfectHash;
pub use crate::zerotrie::ZeroTrieSimpleAscii;
pub use error::ZeroTrieBuildError;
#[cfg(feature = "alloc")]
pub use crate::zerotrie::ZeroTrieStringIterator;
#[cfg(feature = "alloc")]
pub use reader::ZeroTrieIterator;
// Unstable internal module re-exporting the perfect-hash helpers.
// Hidden from rustdoc; not part of the supported public API.
// NOTE(review): presumably consumed by other ICU4X crates — confirm.
#[doc(hidden)]
pub mod _internal {
    pub use crate::byte_phf::f1;
    pub use crate::byte_phf::f2;
    pub use crate::byte_phf::PerfectByteHashMap;
}

153
vendor/zerotrie/src/options.rs vendored Normal file
View File

@@ -0,0 +1,153 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Options for building and reading from a ZeroTrie.
//!
//! These options are internal to the crate. A small selection of options
//! are exported by way of the different public types on this crate.
/// Whether branch nodes in the ZeroTrie may use the perfect hash function.
#[derive(Copy, Clone)]
pub(crate) enum PhfMode {
    /// Use binary search for all branch nodes.
    BinaryOnly,
    /// Use the perfect hash function for large branch nodes.
    UsePhf,
}
impl PhfMode {
    /// Contribution of this option to the serialized flags byte (bit 0x1).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        if matches!(self, Self::UsePhf) {
            0x1
        } else {
            0
        }
    }
}
/// Whether non-ASCII data is supported in the ZeroTrie.
#[derive(Copy, Clone)]
pub(crate) enum AsciiMode {
    /// Support only ASCII, returning an error if non-ASCII is found.
    AsciiOnly,
    /// Support all data, creating span nodes for non-ASCII bytes.
    BinarySpans,
}
impl AsciiMode {
    /// Contribution of this option to the serialized flags byte (bit 0x2).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        if matches!(self, Self::BinarySpans) {
            0x2
        } else {
            0
        }
    }
}
/// Whether a limit on the capacity of the ZeroTrie is enforced.
#[derive(Copy, Clone)]
pub(crate) enum CapacityMode {
    /// Return an error if the trie requires a branch of more than 2^32 bytes.
    Normal,
    /// Construct the trie without returning an error.
    Extended,
}
impl CapacityMode {
    /// Contribution of this option to the serialized flags byte (bit 0x4).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        if matches!(self, Self::Extended) {
            0x4
        } else {
            0
        }
    }
}
/// How strings with mixed ASCII case at a node, such as "abc" and "Abc", are handled.
#[derive(Copy, Clone)]
pub(crate) enum CaseSensitivity {
    /// Allow all strings and sort them by byte value.
    Sensitive,
    /// Reject strings with different case and sort them as if `to_ascii_lowercase` is called.
    IgnoreCase,
}
impl CaseSensitivity {
    /// Contribution of this option to the serialized flags byte (bit 0x8).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        if matches!(self, Self::IgnoreCase) {
            0x8
        } else {
            0
        }
    }
}
/// The full set of options used when building or reading a ZeroTrie.
#[derive(Copy, Clone)]
pub(crate) struct ZeroTrieBuilderOptions {
    /// Branch strategy: binary search everywhere, or PHF for large branches.
    pub phf_mode: PhfMode,
    /// Whether non-ASCII bytes are supported via span nodes.
    pub ascii_mode: AsciiMode,
    /// Whether the capacity limit is enforced.
    pub capacity_mode: CapacityMode,
    /// Whether lookups are ASCII-case-sensitive.
    pub case_sensitivity: CaseSensitivity,
}
impl ZeroTrieBuilderOptions {
    /// Packs all options into a single byte of bit flags, used as a
    /// serialization tag.
    #[cfg(feature = "serde")]
    pub(crate) const fn to_u8_flags(self) -> u8 {
        self.phf_mode.to_u8_flag()
            | self.ascii_mode.to_u8_flag()
            | self.capacity_mode.to_u8_flag()
            | self.case_sensitivity.to_u8_flag()
    }
}
/// Associates a compile-time set of builder/reader options with a trie type.
pub(crate) trait ZeroTrieWithOptions {
    /// The options baked into this trie flavor.
    const OPTIONS: ZeroTrieBuilderOptions;
}
/// All branch nodes are binary search
/// and there are no span nodes.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii<S> {
    // Flags byte: 0x0 (no PHF, ASCII-only, normal capacity, case-sensitive).
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::BinaryOnly,
        ascii_mode: AsciiMode::AsciiOnly,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}
impl<S: ?Sized> crate::ZeroTrieSimpleAscii<S> {
    /// The options byte used to tag serialized `ZeroTrieSimpleAscii` tries.
    #[cfg(feature = "serde")]
    pub(crate) const FLAGS: u8 = Self::OPTIONS.to_u8_flags();
}
/// All branch nodes are binary search
/// and nodes use case-insensitive matching.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie<S> {
    // Flags byte: 0x8 (case-insensitive; otherwise like ZeroTrieSimpleAscii).
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::BinaryOnly,
        ascii_mode: AsciiMode::AsciiOnly,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::IgnoreCase,
    };
}
/// Branch nodes could be either binary search or PHF.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTriePerfectHash<S> {
    // Flags byte: 0x3 (PHF enabled | binary spans).
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::UsePhf,
        ascii_mode: AsciiMode::BinarySpans,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}
/// No limited capacity assertion.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity<S> {
    // Flags byte: 0x7 (PHF enabled | binary spans | extended capacity).
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::UsePhf,
        ascii_mode: AsciiMode::BinarySpans,
        capacity_mode: CapacityMode::Extended,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}

731
vendor/zerotrie/src/reader.rs vendored Normal file
View File

@@ -0,0 +1,731 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! # Internal layout of ZeroTrie
//!
//! A ZeroTrie is composed of a series of nodes stored in sequence in a byte slice.
//!
//! There are 4 types of nodes:
//!
//! 1. ASCII (`0xxxxxxx`): matches a literal ASCII byte.
//! 2. Span (`101xxxxx`): matches a span of non-ASCII bytes.
//! 3. Value (`100xxxxx`): associates a value with a string
//! 4. Branch (`11xxxxxx`): matches one of a set of bytes.
//!
//! Span, Value, and Branch nodes contain a varint, which has different semantics for each:
//!
//! - Span varint: length of the span
//! - Value varint: value associated with the string
//! - Branch varint: number of edges in the branch and width of the offset table
//!
//! If reading an ASCII, Span, or Branch node, one or more bytes are consumed from the input
//! string. If the next byte(s) in the input string do not match the node, we return `None`.
//! If reading a Value node, if the string is empty, return `Some(value)`; otherwise, we skip
//! the Value node and continue on to the next node.
//!
//! When a node is consumed, a shorter, well-formed ZeroTrie remains.
//!
//! ### Basic Example
//!
//! Here is an example ZeroTrie without branch nodes:
//!
//! ```
//! use zerotrie::ZeroTriePerfectHash;
//!
//! let bytes = [
//! b'a', // ASCII literal
//! 0b10001010, // value 10
//! b'b', // ASCII literal
//! 0b10100011, // span of 3
//! 0x81, // first byte in span
//! 0x91, // second byte in span
//! 0xA1, // third and final byte in span
//! 0b10000100, // value 4
//! ];
//!
//! let trie = ZeroTriePerfectHash::from_bytes(&bytes);
//!
//! // First value: "a" → 10
//! assert_eq!(trie.get(b"a"), Some(10));
//!
//! // Second value: "ab\x81\x91\xA1" → 4
//! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4));
//!
//! // A few examples of strings that do NOT have values in the trie:
//! assert_eq!(trie.get(b"ab"), None);
//! assert_eq!(trie.get(b"b"), None);
//! assert_eq!(trie.get(b"b\x81\x91\xA1"), None);
//! ```
//!
//! ## Branch Nodes
//!
//! There are two types of branch nodes: binary search and perfect hash. `ZeroTrieSimpleAscii`
//! contains only binary search nodes, whereas `ZeroTriePerfectHash` can contain either.
//!
//! The head node of the branch has a varint that encodes two things:
//!
//! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256
//! - Bits 9 and 10: width of the offset table (`W`)
//!
//! Note that N is always in the range [1, 256]. There can't be more than 256 edges because
//! there are only 256 unique u8 values.
//!
//! A few examples of the head node of the branch:
//!
//! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0
//! - `0b11000110`: varint bits `110`: N = 6; W = 0
//! - `0b11100000 0b00000101`: varint bits `1000101`: N = 69; W = 0
//! - `0b11100010 0b00000000`: varint bits `101000000`: N = 64; W = 1
//!
//! In `ZeroTriePerfectHash`, if N <= 15, the branch is assumed to be a binary search, and if
//! N > 15, the branch is assumed to be a perfect hash.
//!
//! ### Binary Search Branch Nodes
//!
//! A binary search branch node is used when:
//!
//! 1. The trie is a `ZeroTrieSimpleAscii`, OR
//! 2. There are 15 or fewer items in the branch.
//!
//! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte
//! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search),
//! the index `i` of the byte within the list is used to index into the offset table (described
//! below). If the byte is not in the list, the string is not in the trie, so return `None`.
//!
//! ### Perfect Hash Branch Nodes
//!
//! A perfect hash branch node is used when:
//!
//! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND
//! 2. There are 16 or more items in the branch.
//!
//! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing
//! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an
//! index within the hash table `i` is resolved and used as input to index into the offset
//! table (described below), or the value is determined to not be present and `None` is
//! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`].
//!
//! ### Offset Tables
//!
//! The _offset table_ encodes the range of the remaining buffer containing the trie reachable
//! from the byte matched in the branch node. Both types of branch nodes include an offset
//! table following the key lookup. Given the index `i` from the first step, the range
//! `[s_i, s_(i+1))` brackets the next step in the trie.
//!
//! Offset tables utilize the `W` parameter stored in the branch head node. The special case
//! when `W == 0`, with `N - 1` bytes, is easiest to understand:
//!
//! **Offset table, W = 0:** `[s_1, s_2, ..., s_(N-1)]`
//!
//! Note that `s_0` is always 0 and `s_N` is always the length of the remaining slice, so those
//! values are not explicitly included in the offset table.
//!
//! When W > 0, the high and low bits of the offsets are in separate bytes, arranged as follows:
//!
//! **Generalized offset table:** `[a_1, a_2, ..., a_(N-1), b_1, b_2, ..., b_(N-1), c_1, ...]`
//!
//! where `s_i = (a_i << 8 + b_i) << 8 + c_i ...` (high bits first, low bits last)
//!
//! ### Advanced Example
//!
//! The following trie encodes the following map. It has multiple varints and branch nodes, which
//! are all binary search with W = 0. Note that there is a value for the empty string.
//!
//! - "" → 0
//! - "axb" → 100
//! - "ayc" → 2
//! - "azd" → 3
//! - "bxe" → 4
//! - "bxefg" → 500
//! - "bxefh" → 6
//! - "bxei" → 7
//! - "bxeikl" → 8
//!
//! ```
//! use zerotrie::ZeroTrieSimpleAscii;
//!
//! let bytes = [
//! 0b10000000, // value 0
//! 0b11000010, // branch of 2
//! b'a', //
//! b'b', //
//! 13, //
//! 0b11000011, // start of 'a' subtree: branch of 3
//! b'x', //
//! b'y', //
//! b'z', //
//! 3, //
//! 5, //
//! b'b', //
//! 0b10010000, // value 100 (lead)
//! 0x54, // value 100 (trail)
//! b'c', //
//! 0b10000010, // value 2
//! b'd', //
//! 0b10000011, // value 3
//! b'x', // start of 'b' subtree
//! b'e', //
//! 0b10000100, // value 4
//! 0b11000010, // branch of 2
//! b'f', //
//! b'i', //
//! 7, //
//! 0b11000010, // branch of 2
//! b'g', //
//! b'h', //
//! 2, //
//! 0b10010011, // value 500 (lead)
//! 0x64, // value 500 (trail)
//! 0b10000110, // value 6
//! 0b10000111, // value 7
//! b'k', //
//! b'l', //
//! 0b10001000, // value 8
//! ];
//!
//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes);
//!
//! // Assert that the specified items are in the map
//! assert_eq!(trie.get(b""), Some(0));
//! assert_eq!(trie.get(b"axb"), Some(100));
//! assert_eq!(trie.get(b"ayc"), Some(2));
//! assert_eq!(trie.get(b"azd"), Some(3));
//! assert_eq!(trie.get(b"bxe"), Some(4));
//! assert_eq!(trie.get(b"bxefg"), Some(500));
//! assert_eq!(trie.get(b"bxefh"), Some(6));
//! assert_eq!(trie.get(b"bxei"), Some(7));
//! assert_eq!(trie.get(b"bxeikl"), Some(8));
//!
//! // Assert that some other items are not in the map
//! assert_eq!(trie.get(b"a"), None);
//! assert_eq!(trie.get(b"bx"), None);
//! assert_eq!(trie.get(b"xba"), None);
//! ```
use crate::byte_phf::PerfectByteHashMap;
use crate::cursor::AsciiProbeResult;
use crate::helpers::*;
use crate::options::*;
use crate::varint::read_varint_meta2;
use crate::varint::read_varint_meta3;
#[cfg(feature = "alloc")]
use alloc::string::String;
/// Given a slice starting with an offset table, returns the trie for the given index.
///
/// Arguments:
/// - `trie` = a trie pointing at an offset table (after the branch node and search table)
/// - `i` = the desired index within the offset table
/// - `n` = the number of items in the offset table
/// - `w` = the width of the offset table items minus one
#[inline]
fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> &[u8] {
    // Accumulate the start offset (p) and end offset (q) one byte plane at a
    // time, high bits first; there are w+1 planes of n-1 bytes each.
    let mut p = 0usize;
    let mut q = 0usize;
    loop {
        let indices;
        (indices, trie) = trie.debug_split_at(n - 1);
        // s_0 = 0 is implicit and not stored in the table.
        p = (p << 8)
            + if i == 0 {
                0
            } else {
                *indices.get(i - 1).debug_unwrap_or(&0) as usize
            };
        // For the last child, s_n is implicitly the remaining slice length.
        q = match indices.get(i) {
            Some(x) => (q << 8) + *x as usize,
            None => trie.len(),
        };
        if w == 0 {
            break;
        }
        w -= 1;
    }
    trie.get(p..q).debug_unwrap_or(&[])
}
/// Version of [`get_branch()`] specialized for the case `w == 0` for performance
#[inline]
fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> &[u8] {
    let indices;
    (indices, trie) = trie.debug_split_at(n - 1);
    // s_0 = 0 is implicit; otherwise read the start offset from the table.
    let p = if i == 0 {
        0
    } else {
        *indices.get(i - 1).debug_unwrap_or(&0) as usize
    };
    // For the last child, the end offset is implicitly the remaining length.
    let q = match indices.get(i) {
        Some(x) => *x as usize,
        None => trie.len(),
    };
    trie.get(p..q).debug_unwrap_or(&[])
}
/// The node type. See the module-level docs for more explanation of the four node types.
enum NodeType {
    /// An ASCII node (`0xxxxxxx`). Contains a single literal ASCII byte and no varint.
    Ascii,
    /// A span node (`101xxxxx`). Contains a varint indicating how big the span is.
    Span,
    /// A value node (`100xxxxx`). Contains a varint representing the value.
    Value,
    /// A branch node (`11xxxxxx`). Contains a varint of the number of output nodes, plus W in the high bits.
    Branch,
}
impl core::fmt::Debug for NodeType {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use NodeType::*;
f.write_str(match *self {
Ascii => "a",
Span => "s",
Value => "v",
Branch => "m",
})
}
}
/// Classifies a lead byte into its node type based on its high bits:
/// `0xxxxxxx` = ASCII, `100xxxxx` = value, `101xxxxx` = span, `11xxxxxx` = branch.
#[inline]
fn byte_type(b: u8) -> NodeType {
    if b & 0b10000000 == 0 {
        NodeType::Ascii
    } else if b & 0b01000000 != 0 {
        NodeType::Branch
    } else if b & 0b00100000 != 0 {
        NodeType::Span
    } else {
        NodeType::Value
    }
}
/// Looks up the byte string `ascii` in `trie`, returning its value if the
/// exact string is present.
///
/// Each loop iteration consumes one node from the trie (and possibly bytes
/// from the input); a value node reached exactly when the input is exhausted
/// yields the result.
#[inline]
pub(crate) fn get_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    mut trie: &[u8],
    mut ascii: &[u8],
) -> Option<usize> {
    loop {
        let (b, x, i, search);
        (b, trie) = trie.split_first()?;
        let byte_type = byte_type(*b);
        // Read the node's varint payload: span length, value, or branch metadata.
        (x, trie) = match byte_type {
            NodeType::Ascii => (0, trie),
            NodeType::Span => {
                if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) {
                    read_varint_meta3(*b, trie)
                } else {
                    debug_assert!(false, "Span node found in ASCII trie!");
                    return None;
                }
            }
            NodeType::Value => read_varint_meta3(*b, trie),
            NodeType::Branch => read_varint_meta2(*b, trie),
        };
        if let Some((c, temp)) = ascii.split_first() {
            if matches!(byte_type, NodeType::Ascii) {
                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
                {
                    b.eq_ignore_ascii_case(c)
                } else {
                    b == c
                };
                if is_match {
                    // Matched a byte
                    ascii = temp;
                    continue;
                } else {
                    // Byte that doesn't match
                    return None;
                }
            }
            if matches!(byte_type, NodeType::Value) {
                // Value node, but not at end of string
                continue;
            }
            if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans)
                && matches!(byte_type, NodeType::Span)
            {
                let (trie_span, ascii_span);
                (trie_span, trie) = trie.debug_split_at(x);
                (ascii_span, ascii) = ascii.split_at_checked(x)?;
                if trie_span == ascii_span {
                    // Matched a byte span
                    continue;
                } else {
                    // Byte span that doesn't match
                    return None;
                }
            }
            // Branch node
            // Split the varint into edge count (low 8 bits) and offset width.
            let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
            let w = if matches!(T::OPTIONS.capacity_mode, CapacityMode::Extended) {
                w
            } else {
                // See the table below regarding this assertion
                debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
                w & 0x3
            };
            // A branch of 256 edges is encoded as 0.
            let x = if x == 0 { 256 } else { x };
            if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 {
                // binary search
                (search, trie) = trie.debug_split_at(x);
                let bsearch_result =
                    if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
                        search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
                            x.to_ascii_lowercase()
                        })
                    } else {
                        search.binary_search(c)
                    };
                i = bsearch_result.ok()?;
            } else {
                // phf
                (search, trie) = trie.debug_split_at(x * 2 + 1);
                i = PerfectByteHashMap::from_store(search).get(*c)?;
            }
            trie = if w == 0 {
                get_branch_w0(trie, i, x)
            } else {
                get_branch(trie, i, x, w)
            };
            ascii = temp;
            continue;
        } else {
            if matches!(byte_type, NodeType::Value) {
                // Value node at end of string
                return Some(x);
            }
            return None;
        }
    }
}
// DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w.
//
// | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 |
// |---------------|-------------------|--------------------|-------------------|--------------------|
// | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns |
// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs |
// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs |
/// Steps one node into the trie assuming all branch nodes are binary search and that
/// there are no span nodes.
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie reachable by `c`.
///
/// Returns the byte that matched (which may differ from `c` in case when the
/// trie is case-insensitive), or `None` if no edge matches; on a failed match
/// `trie` is set to the empty slice.
#[inline]
pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    trie: &mut &[u8],
    c: u8,
) -> Option<u8> {
    // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`.
    // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
    // If a span node is encountered, `None` is returned later in this function.
    debug_assert!(
        matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly),
        "Spans not yet implemented in step function"
    );
    // PHF can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly),
        "PHF not yet implemented in step function"
    );
    // Extended Capacity can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal),
        "Extended capacity not yet implemented in step function"
    );
    let (mut b, x, search);
    loop {
        (b, *trie) = match trie.split_first() {
            Some(v) => v,
            None => {
                // Empty trie or only a value node
                return None;
            }
        };
        match byte_type(*b) {
            NodeType::Ascii => {
                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
                {
                    b.eq_ignore_ascii_case(&c)
                } else {
                    *b == c
                };
                if is_match {
                    // Matched a byte
                    return Some(*b);
                } else {
                    // Byte that doesn't match
                    *trie = &[];
                    return None;
                }
            }
            NodeType::Branch => {
                // Proceed to the branch node logic below
                (x, *trie) = read_varint_meta2(*b, trie);
                break;
            }
            NodeType::Span => {
                // Question: Should we put the trie back into a valid state?
                // Currently this code is unreachable so let's not worry about it.
                debug_assert!(false, "Span node found in ASCII trie!");
                return None;
            }
            NodeType::Value => {
                // Skip the value node and go to the next node
                (_, *trie) = read_varint_meta3(*b, trie);
                continue;
            }
        };
    }
    // Branch node
    // Split the varint into edge count (low 8 bits) and offset width.
    let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
    // See comment above regarding this assertion
    debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
    let w = w & 0x3;
    // A branch of 256 edges is encoded as 0.
    let x = if x == 0 { 256 } else { x };
    // Always use binary search
    (search, *trie) = trie.debug_split_at(x);
    let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
        search.binary_search_by_key(&c.to_ascii_lowercase(), |x| x.to_ascii_lowercase())
    } else {
        search.binary_search(&c)
    };
    match bsearch_result {
        Ok(i) => {
            // Matched a byte
            *trie = if w == 0 {
                get_branch_w0(trie, i, x)
            } else {
                get_branch(trie, i, x, w)
            };
            #[allow(clippy::indexing_slicing)] // i is from a binary search
            Some(search[i])
        }
        Err(_) => {
            // Byte that doesn't match
            *trie = &[];
            None
        }
    }
}
/// Steps one node into the trie, assuming all branch nodes are binary search and that
/// there are no span nodes, using an index.
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie indexed by `index`.
///
/// Returns the byte taken and the total number of siblings at this position,
/// or `None` (with `trie` emptied) if `index` is out of range.
#[inline]
pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    trie: &mut &[u8],
    index: usize,
) -> Option<AsciiProbeResult> {
    // Currently, the only option `probe_parameterized` supports is `CaseSensitivity::IgnoreCase`.
    // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
    // If a span node is encountered, `None` is returned later in this function.
    debug_assert!(
        matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly),
        "Spans not yet implemented in step function"
    );
    // PHF can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly),
        "PHF not yet implemented in step function"
    );
    // Extended Capacity can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal),
        "Extended capacity not yet implemented in step function"
    );
    let (mut b, x, search);
    loop {
        (b, *trie) = match trie.split_first() {
            Some(v) => v,
            None => {
                // Empty trie or only a value node
                return None;
            }
        };
        match byte_type(*b) {
            NodeType::Ascii => {
                // An ASCII node has exactly one child, so only index 0 is valid.
                if index > 0 {
                    *trie = &[];
                    return None;
                }
                return Some(AsciiProbeResult {
                    byte: *b,
                    total_siblings: 1,
                });
            }
            NodeType::Branch => {
                // Proceed to the branch node logic below
                (x, *trie) = read_varint_meta2(*b, trie);
                break;
            }
            NodeType::Span => {
                // Question: Should we put the trie back into a valid state?
                // Currently this code is unreachable so let's not worry about it.
                debug_assert!(false, "Span node found in ASCII trie!");
                return None;
            }
            NodeType::Value => {
                // Skip the value node and go to the next node
                (_, *trie) = read_varint_meta3(*b, trie);
                continue;
            }
        };
    }
    // Branch node
    // Split the varint into edge count (low 8 bits) and offset width.
    let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
    debug_assert!(u8::try_from(x).is_ok());
    // Note: a branch of 256 edges is reported as 0 here (encoded form).
    let total_siblings = x as u8;
    // See comment above regarding this assertion
    debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
    let w = w & 0x3;
    let x = if x == 0 { 256 } else { x };
    if index >= x {
        *trie = &[];
        return None;
    }
    (search, *trie) = trie.debug_split_at(x);
    *trie = if w == 0 {
        get_branch_w0(trie, index, x)
    } else {
        get_branch(trie, index, x, w)
    };
    Some(AsciiProbeResult {
        #[allow(clippy::indexing_slicing)] // index < x, the length of search
        byte: search[index],
        total_siblings,
    })
}
/// Steps one node into the trie if the head node is a value node, returning the value.
/// If the head node is not a value node, no change is made to `trie`.
///
/// On success, `trie` is advanced past the value node to the next node.
pub(crate) fn take_value(trie: &mut &[u8]) -> Option<usize> {
    let (b, remainder) = trie.split_first()?;
    if matches!(byte_type(*b), NodeType::Value) {
        let (value, rest) = read_varint_meta3(*b, remainder);
        *trie = rest;
        Some(value)
    } else {
        None
    }
}
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
/// Iterator type for walking the byte sequences contained in a ZeroTrie.
///
/// Iteration is a depth-first traversal driven by an explicit stack of
/// pending positions (see `state`).
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub struct ZeroTrieIterator<'a> {
    /// Whether the PHF is enabled on this trie.
    use_phf: bool,
    /// Intermediate state during iteration:
    /// 1. A trie (usually a slice of the original, bigger trie)
    /// 2. The string that leads to the trie
    /// 3. If the trie's lead node is a branch node, the current index being evaluated
    state: Vec<(&'a [u8], Vec<u8>, usize)>,
}
#[cfg(feature = "alloc")]
impl<'a> ZeroTrieIterator<'a> {
    /// Creates an iterator over `store`; `use_phf` selects whether large
    /// branch nodes are interpreted as perfect-hash tables.
    pub(crate) fn new<S: AsRef<[u8]> + ?Sized>(store: &'a S, use_phf: bool) -> Self {
        // Start with one stack frame: the whole trie, an empty prefix
        // string, and branch index 0.
        let initial_state = alloc::vec![(store.as_ref(), alloc::vec![], 0)];
        ZeroTrieIterator {
            use_phf,
            state: initial_state,
        }
    }
}
#[cfg(feature = "alloc")]
impl Iterator for ZeroTrieIterator<'_> {
    type Item = (Vec<u8>, usize);
    fn next(&mut self) -> Option<Self::Item> {
        let (mut trie, mut string, mut branch_idx);
        (trie, string, branch_idx) = self.state.pop()?;
        loop {
            let (b, x, span, search);
            // Remember this node's position so the branch logic below can
            // push it back for the next sibling index.
            let return_trie = trie;
            (b, trie) = match trie.split_first() {
                Some(tpl) => tpl,
                None => {
                    // At end of current branch; step back to the branch node.
                    // If there are no more branches, we are finished.
                    (trie, string, branch_idx) = self.state.pop()?;
                    continue;
                }
            };
            let byte_type = byte_type(*b);
            if matches!(byte_type, NodeType::Ascii) {
                string.push(*b);
                continue;
            }
            // Read the node's varint payload (span length, value, or branch metadata).
            (x, trie) = match byte_type {
                NodeType::Ascii => (0, trie),
                NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie),
                NodeType::Branch => read_varint_meta2(*b, trie),
            };
            if matches!(byte_type, NodeType::Span) {
                (span, trie) = trie.debug_split_at(x);
                string.extend(span);
                continue;
            }
            if matches!(byte_type, NodeType::Value) {
                let retval = string.clone();
                // Return to this position on the next step
                self.state.push((trie, string, 0));
                return Some((retval, x));
            }
            // Match node
            let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
            let x = if x == 0 { 256 } else { x };
            if branch_idx + 1 < x {
                // Return to this branch node at the next index
                self.state
                    .push((return_trie, string.clone(), branch_idx + 1));
            }
            let byte = if x < 16 || !self.use_phf {
                // binary search
                (search, trie) = trie.debug_split_at(x);
                debug_unwrap!(search.get(branch_idx), return None)
            } else {
                // phf
                (search, trie) = trie.debug_split_at(x * 2 + 1);
                debug_unwrap!(search.get(branch_idx + x + 1), return None)
            };
            string.push(*byte);
            trie = if w == 0 {
                get_branch_w0(trie, branch_idx, x)
            } else {
                get_branch(trie, branch_idx, x, w)
            };
            branch_idx = 0;
        }
    }
}
/// Returns an iterator over the trie's contents, treating large branch nodes
/// as perfect-hash tables.
#[cfg(feature = "alloc")]
pub(crate) fn get_iter_phf<S: AsRef<[u8]> + ?Sized>(store: &S) -> ZeroTrieIterator<'_> {
    ZeroTrieIterator::new(store, true)
}
/// Returns an iterator over the trie's contents as `(String, usize)` pairs,
/// assuming all branch nodes are binary search (no PHF).
///
/// # Panics
/// Panics if the trie contains non-ASCII items.
#[cfg(feature = "alloc")]
#[expect(clippy::type_complexity)]
pub(crate) fn get_iter_ascii_or_panic<S: AsRef<[u8]> + ?Sized>(
    store: &S,
) -> core::iter::Map<ZeroTrieIterator<'_>, fn((Vec<u8>, usize)) -> (String, usize)> {
    ZeroTrieIterator::new(store, false).map(|(k, v)| {
        #[expect(clippy::unwrap_used)] // in signature of function
        let ascii_str = String::from_utf8(k).unwrap();
        (ascii_str, v)
    })
}

644
vendor/zerotrie/src/serde.rs vendored Normal file
View File

@@ -0,0 +1,644 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::builder::bytestr::ByteStr;
use crate::options::ZeroTrieWithOptions;
use crate::zerotrie::ZeroTrieFlavor;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrie;
use crate::ZeroTrieExtendedCapacity;
use crate::ZeroTriePerfectHash;
use crate::ZeroTrieSimpleAscii;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::fmt;
use litemap::LiteMap;
use serde_core::de::Error;
use serde_core::de::Visitor;
use serde_core::Deserialize;
use serde_core::Deserializer;
use serde_core::Serialize;
use serde_core::Serializer;
/// Serde visitor producing an owned byte buffer from bytes, a string, or a
/// sequence of integers.
struct ByteStrVisitor;
impl<'de> Visitor<'de> for ByteStrVisitor {
    type Value = Box<[u8]>;
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        write!(formatter, "a slice of borrowed bytes or a string")
    }
    fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E> {
        Ok(Box::from(v))
    }
    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
        Ok(Box::from(v.as_bytes()))
    }
    fn visit_seq<A>(self, mut v: A) -> Result<Self::Value, A::Error>
    where
        A: serde_core::de::SeqAccess<'de>,
    {
        // Collect u8 elements one by one, pre-sizing from the hint if given.
        let mut result = Vec::with_capacity(v.size_hint().unwrap_or(0));
        while let Some(x) = v.next_element::<u8>()? {
            result.push(x);
        }
        Ok(Box::from(result))
    }
}
impl<'data, 'de: 'data> Deserialize<'de> for &'data ByteStr {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Zero-copy path: borrow the byte slice directly from the input.
        let s = <&'data [u8]>::deserialize(deserializer)?;
        Ok(ByteStr::from_bytes(s))
    }
}
impl<'de> Deserialize<'de> for Box<ByteStr> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable formats: accept a string, bytes, or a sequence
            // of integers via the visitor.
            let s = deserializer.deserialize_any(ByteStrVisitor)?;
            Ok(ByteStr::from_boxed_bytes(s))
        } else {
            // Binary formats: expect a plain byte buffer.
            let s = Vec::<u8>::deserialize(deserializer)?;
            Ok(ByteStr::from_boxed_bytes(s.into_boxed_slice()))
        }
    }
}
impl Serialize for &ByteStr {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let bytes = self.as_bytes();
        if serializer.is_human_readable() {
            // Prefer a string representation when the bytes are valid UTF-8.
            match core::str::from_utf8(bytes) {
                Ok(s) => serializer.serialize_str(s),
                Err(_) => serializer.serialize_bytes(bytes),
            }
        } else {
            serializer.serialize_bytes(bytes)
        }
    }
}
impl<'data, 'de: 'data, Store> Deserialize<'de> for ZeroTrieSimpleAscii<Store>
where
    // DISCUSS: There are several possibilities for the bounds here that would
    // get the job done. I could look for Deserialize, but this would require
    // creating a custom Deserializer for the map case. I also considered
    // introducing a new trait instead of relying on From.
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable formats store the trie as a map; rebuild it.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroTrieSimpleAscii::try_from_serde_litemap(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let (flags, trie_bytes) = <(u8, &[u8])>::deserialize(deserializer)?;
            // The flags byte must match this type's build options exactly.
            if Self::OPTIONS.to_u8_flags() != flags {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            };
            Ok(ZeroTrieSimpleAscii::from_store(Store::from(trie_bytes)))
        }
    }
}
impl<Store> Serialize for ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]>,
{
    /// Serializes as a map from strings to integers in human-readable formats,
    /// or as a `(flags, bytes)` tuple in binary formats.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            let lm = self.to_litemap();
            lm.serialize(serializer)
        } else {
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`.
            // Consistency: emit the tag via `Self::OPTIONS.to_u8_flags()`,
            // the same expression this type's Deserialize impl checks and
            // that all sibling trie flavors serialize (was `Self::FLAGS`).
            (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            )
                .serialize(serializer)
        }
    }
}
impl<'de, 'data, Store> Deserialize<'de> for ZeroAsciiIgnoreCaseTrie<Store>
where
    'de: 'data,
    // DISCUSS: There are several possibilities for the bounds here that would
    // get the job done. I could look for Deserialize, but this would require
    // creating a custom Deserializer for the map case. I also considered
    // introducing a new trait instead of relying on From.
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable form: a map from strings to integers.
            let map = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            let trie = ZeroAsciiIgnoreCaseTrie::try_from_serde_litemap(&map)
                .map_err(D::Error::custom)?;
            Ok(trie.convert_store())
        } else {
            // Binary form: (flags, bytes); the `&[u8]` impl borrows via
            // visit_borrowed_bytes.
            let (flags, payload) = <(u8, &[u8])>::deserialize(deserializer)?;
            if flags != Self::OPTIONS.to_u8_flags() {
                Err(D::Error::custom("invalid ZeroTrie tag"))
            } else {
                Ok(ZeroAsciiIgnoreCaseTrie::from_store(Store::from(payload)))
            }
        }
    }
}
impl<Store> Serialize for ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if !serializer.is_human_readable() {
            // Binary form: tag byte followed by the trie bytes.
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            let tagged = (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            );
            return tagged.serialize(serializer);
        }
        // Human-readable form: a map from strings to integers.
        self.to_litemap().serialize(serializer)
    }
}
impl<'de, 'data, Store> Deserialize<'de> for ZeroTriePerfectHash<Store>
where
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable form: a map keyed by (possibly non-UTF-8) byte
            // strings.
            let entries = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            return ZeroTriePerfectHash::try_from_serde_litemap(&entries)
                .map(|trie| trie.convert_store())
                .map_err(D::Error::custom);
        }
        // Binary form: (flags, bytes); the `&[u8]` impl borrows via
        // visit_borrowed_bytes.
        let (flags, payload) = <(u8, &[u8])>::deserialize(deserializer)?;
        if flags != Self::OPTIONS.to_u8_flags() {
            return Err(D::Error::custom("invalid ZeroTrie tag"));
        }
        Ok(ZeroTriePerfectHash::from_store(Store::from(payload)))
    }
}
impl<Store> Serialize for ZeroTriePerfectHash<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Human-readable form: rewrap the owned keys as `&ByteStr` so
            // they serialize as strings where the bytes are valid UTF-8.
            let owned = self.to_litemap();
            let borrowed = owned
                .iter()
                .map(|(key, value)| (ByteStr::from_bytes(key), value))
                .collect::<LiteMap<_, _>>();
            borrowed.serialize(serializer)
        } else {
            // Binary form: tag byte plus the trie bytes.
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            let tagged = (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            );
            tagged.serialize(serializer)
        }
    }
}
impl<'de, 'data, Store> Deserialize<'de> for ZeroTrieExtendedCapacity<Store>
where
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            // Binary form: (flags, bytes); the `&[u8]` impl borrows via
            // visit_borrowed_bytes.
            let (flags, payload) = <(u8, &[u8])>::deserialize(deserializer)?;
            return if flags == Self::OPTIONS.to_u8_flags() {
                Ok(ZeroTrieExtendedCapacity::from_store(Store::from(payload)))
            } else {
                Err(D::Error::custom("invalid ZeroTrie tag"))
            };
        }
        // Human-readable form: a map from byte strings to integers.
        let entries = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
        ZeroTrieExtendedCapacity::try_from_serde_litemap(&entries)
            .map(|trie| trie.convert_store())
            .map_err(D::Error::custom)
    }
}
impl<Store> Serialize for ZeroTrieExtendedCapacity<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if !serializer.is_human_readable() {
            // Binary form: tag byte plus the trie bytes.
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            let tagged = (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            );
            return tagged.serialize(serializer);
        }
        // Human-readable form: rewrap the owned keys as `&ByteStr` so they
        // serialize as strings where possible.
        let owned = self.to_litemap();
        let borrowed = owned
            .iter()
            .map(|(key, value)| (ByteStr::from_bytes(key), value))
            .collect::<LiteMap<_, _>>();
        borrowed.serialize(serializer)
    }
}
impl<'de, 'data, Store> Deserialize<'de> for ZeroTrie<Store>
where
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable form: build from a map, letting the builder pick
            // the most appropriate flavor.
            let entries = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            return ZeroTrie::<Vec<u8>>::try_from(&entries)
                .map(|trie| trie.convert_store())
                .map_err(D::Error::custom);
        }
        // Binary form: one tag byte identifying the flavor, then the trie
        // bytes. Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes.
        let raw = <&[u8]>::deserialize(deserializer)?;
        let (tag, payload) = raw
            .split_first()
            .ok_or_else(|| D::Error::custom("expected at least 1 byte for ZeroTrie"))?;
        let store = Store::from(payload);
        if *tag == ZeroTrieSimpleAscii::<u8>::OPTIONS.to_u8_flags() {
            Ok(ZeroTrieSimpleAscii::from_store(store).into_zerotrie())
        } else if *tag == ZeroTriePerfectHash::<u8>::OPTIONS.to_u8_flags() {
            Ok(ZeroTriePerfectHash::from_store(store).into_zerotrie())
        } else if *tag == ZeroTrieExtendedCapacity::<u8>::OPTIONS.to_u8_flags() {
            Ok(ZeroTrieExtendedCapacity::from_store(store).into_zerotrie())
        } else {
            Err(D::Error::custom("invalid ZeroTrie tag"))
        }
    }
}
impl<Store> Serialize for ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Human-readable form: a map with `&ByteStr` keys so they render
            // as strings where possible.
            let owned = self.to_litemap();
            let borrowed = owned
                .iter()
                .map(|(key, value)| (ByteStr::from_bytes(key), value))
                .collect::<LiteMap<_, _>>();
            borrowed.serialize(serializer)
        } else {
            // Binary form: a single byte sequence of flavor tag + trie bytes.
            let (tag, payload) = match &self.0 {
                ZeroTrieFlavor::SimpleAscii(trie) => (
                    ZeroTrieSimpleAscii::<u8>::OPTIONS.to_u8_flags(),
                    trie.as_bytes(),
                ),
                ZeroTrieFlavor::PerfectHash(trie) => (
                    ZeroTriePerfectHash::<u8>::OPTIONS.to_u8_flags(),
                    trie.as_bytes(),
                ),
                ZeroTrieFlavor::ExtendedCapacity(trie) => (
                    ZeroTrieExtendedCapacity::<u8>::OPTIONS.to_u8_flags(),
                    trie.as_bytes(),
                ),
            };
            let mut framed = Vec::with_capacity(payload.len() + 1);
            framed.push(tag);
            framed.extend_from_slice(payload);
            serializer.serialize_bytes(&framed)
        }
    }
}
/// Shared test fixtures: sample tries and their expected serialized forms.
#[cfg(test)]
mod testdata {
    include!("../tests/data/data.rs");
}
#[cfg(test)]
mod tests {
    //! Round-trip tests for each trie flavor over a `Cow<[u8]>` store,
    //! covering JSON (human-readable) and bincode/msgpack (binary) forms.
    //! Binary deserialization should borrow; JSON should produce owned data.
    use super::*;
    use alloc::borrow::Cow;
    use serde::{Deserialize, Serialize};
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieSimpleAsciiCow<'a> {
        #[serde(borrow)]
        trie: ZeroTrieSimpleAscii<Cow<'a, [u8]>>,
    }
    // SimpleAscii: tag byte 0 in the binary encodings.
    #[test]
    pub fn test_serde_simpleascii_cow() {
        let trie = ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroTrieSimpleAsciiCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        let rmp_bytes = rmp_serde::to_vec(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // bincode prefix: tag (0) + little-endian u64 length (26).
        assert_eq!(&bincode_bytes[0..9], &[0, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        assert_eq!(&rmp_bytes[0..5], &[145, 146, 0, 196, 26]);
        assert_eq!(&rmp_bytes[5..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieSimpleAsciiCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieSimpleAsciiCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        let rmp_recovered: ZeroTrieSimpleAsciiCow = rmp_serde::from_slice(&rmp_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert_eq!(original.trie, rmp_recovered.trie);
        // JSON cannot borrow; bincode can (visit_borrowed_bytes).
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroAsciiIgnoreCaseTrieCow<'a> {
        #[serde(borrow)]
        trie: ZeroAsciiIgnoreCaseTrie<Cow<'a, [u8]>>,
    }
    // IgnoreCase flavor: tag byte 8 in the binary encoding.
    #[test]
    pub fn test_serde_asciiignorecase_cow() {
        let trie = ZeroAsciiIgnoreCaseTrie::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroAsciiIgnoreCaseTrieCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[8, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroAsciiIgnoreCaseTrieCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroAsciiIgnoreCaseTrieCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTriePerfectHashCow<'a> {
        #[serde(borrow)]
        trie: ZeroTriePerfectHash<Cow<'a, [u8]>>,
    }
    // PerfectHash flavor with ASCII-only data: tag byte 3.
    #[test]
    pub fn test_serde_perfecthash_cow() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    // PerfectHash with non-ASCII (Unicode) keys.
    #[test]
    pub fn test_serde_perfecthash_cow_u() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE);
        assert_eq!(&bincode_bytes[0..9], &[3, 39, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_UNICODE);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    // PerfectHash with non-UTF-8 (binary) keys.
    #[test]
    pub fn test_serde_perfecthash_cow_bin() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_BINARY));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_BINARY);
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_BINARY);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieAnyCow<'a> {
        #[serde(borrow)]
        trie: ZeroTrie<Cow<'a, [u8]>>,
    }
    // Tagged `ZeroTrie` wrapper: binary form is one length-prefixed byte
    // sequence whose first payload byte is the flavor tag.
    #[test]
    pub fn test_serde_any_cow() {
        let trie =
            ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII)).into_zerotrie();
        let original = ZeroTrieAnyCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // Length 27 = 1 tag byte + 26 trie bytes; SimpleAscii tag is 0.
        assert_eq!(&bincode_bytes[0..9], &[27, 0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[test]
    pub fn test_serde_any_cow_u() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE))
            .into_zerotrie();
        let original = ZeroTrieAnyCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE);
        // Length 40 = 1 tag byte + 39 trie bytes; PerfectHash tag is 3.
        assert_eq!(&bincode_bytes[0..9], &[40, 0, 0, 0, 0, 0, 0, 0, 3]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_UNICODE);
        let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
}
#[cfg(test)]
#[cfg(feature = "zerovec")]
mod tests_zerovec {
    //! Same round-trip coverage as `tests`, but with a `ZeroVec<u8>` store:
    //! binary deserialization should borrow, JSON should own.
    use super::*;
    use serde::{Deserialize, Serialize};
    use zerovec::ZeroVec;
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieSimpleAsciiZeroVec<'a> {
        #[serde(borrow)]
        trie: ZeroTrieSimpleAscii<ZeroVec<'a, u8>>,
    }
    #[test]
    pub fn test_serde_simpleascii_zerovec() {
        let trie =
            ZeroTrieSimpleAscii::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII));
        let original = ZeroTrieSimpleAsciiZeroVec { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // bincode prefix: tag (0) + little-endian u64 length (26).
        assert_eq!(&bincode_bytes[0..9], &[0, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieSimpleAsciiZeroVec = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieSimpleAsciiZeroVec =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(json_recovered.trie.into_store().is_owned());
        assert!(!bincode_recovered.trie.into_store().is_owned());
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTriePerfectHashZeroVec<'a> {
        #[serde(borrow)]
        trie: ZeroTriePerfectHash<ZeroVec<'a, u8>>,
    }
    #[test]
    pub fn test_serde_perfecthash_zerovec() {
        let trie =
            ZeroTriePerfectHash::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII));
        let original = ZeroTriePerfectHashZeroVec { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTriePerfectHashZeroVec = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashZeroVec =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(json_recovered.trie.into_store().is_owned());
        assert!(!bincode_recovered.trie.into_store().is_owned());
    }
}

520
vendor/zerotrie/src/varint.rs vendored Normal file
View File

@@ -0,0 +1,520 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Varint spec for ZeroTrie:
//!
//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
//! - Trail bytes: top bit is varint extender; rest are low bits of value
//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
//!
//! This is best shown by examples.
//!
//! ```txt
//! xxx0'1010 = 10
//! xxx0'1111 = 15 (largest single-byte value with M=3)
//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
//! xxx1'0000 0000'0001 = 17
//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
//! xxx1'0000 1000'0000 0000'0001 = 2065
//! ```
//!
//! The latent values by number of bytes for M=3 are:
//!
//! - 1 byte: 0
//! - 2 bytes: 16 = 0x10 = 0b10000
//! - 3 bytes: 2064 = 0x810 = 0b100000010000
//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
//! - …
//!
//! For M=2, the latent values are:
//!
//! - 1 byte: 0
//! - 2 bytes: 32 = 0x20 = 0b100000
//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
//! - …
use crate::builder::konst::ConstArrayBuilder;
#[cfg(feature = "alloc")]
use crate::builder::nonconst::TrieBuilderStore;
/// Reads a varint with 2 bits of metadata in the lead byte.
///
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
///
/// If the varint spills off the end of the slice, a debug assertion will fail,
/// and the function will return the value up to that point.
pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
    // Low 5 bits of the lead byte seed the value; bit 5 is the extender flag
    // signalling that trail bytes follow.
    let mut accum = (start & 0b00011111) as usize;
    let mut rest = remainder;
    let mut more = (start & 0b00100000) != 0;
    while more {
        let byte;
        (byte, rest) = debug_unwrap!(rest.split_first(), break, "invalid varint");
        // Each trail byte contributes 7 data bits plus the latent value 32.
        // `accum << 7` may drop high bits if the value exceeds `usize`; the
        // debug assertion above is how truncated input is surfaced.
        accum = (accum << 7) + ((*byte & 0b01111111) as usize) + 32;
        more = (*byte & 0b10000000) != 0;
    }
    (accum, rest)
}
/// Reads a varint with 3 bits of metadata in the lead byte.
///
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
///
/// If the varint spills off the end of the slice, a debug assertion will fail,
/// and the function will return the value up to that point.
pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
    // Low 4 bits of the lead byte seed the value; bit 4 is the extender flag
    // signalling that trail bytes follow.
    let mut accum = (start & 0b00001111) as usize;
    let mut rest = remainder;
    let mut more = (start & 0b00010000) != 0;
    while more {
        let byte;
        (byte, rest) = debug_unwrap!(rest.split_first(), break, "invalid varint");
        // Each trail byte contributes 7 data bits plus the latent value 16.
        // `accum << 7` may drop high bits if the value exceeds `usize`; the
        // debug assertion above is how truncated input is surfaced.
        accum = (accum << 7) + ((*byte & 0b01111111) as usize) + 16;
        more = (*byte & 0b10000000) != 0;
    }
    (accum, rest)
}
/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
///
/// Returns the varint value, or `None` if the store runs out of bytes mid-varint.
#[cfg(feature = "alloc")]
pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
    start: u8,
    remainder: &mut S,
) -> Option<usize> {
    // Low 4 bits of the lead byte seed the value; bit 4 signals trail bytes.
    let mut accum = (start & 0b00001111) as usize;
    let mut more = (start & 0b00010000) != 0;
    while more {
        let byte = remainder.atbs_pop_front()?;
        // 7 data bits per trail byte plus the latent value 16; high bits may
        // be dropped by the shift if the value overflows `usize`.
        accum = (accum << 7) + ((byte & 0b01111111) as usize) + 16;
        more = (byte & 0b10000000) != 0;
    }
    Some(accum)
}
// Largest value exercised by the round-trip tests (all bits set).
#[cfg(test)]
const MAX_VARINT: usize = usize::MAX;
// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
// Add an extra 1 since the lead byte holds only 5 bits of data.
// (Evaluates to 10 bytes on 64-bit targets and 5 bytes on 32-bit targets.)
const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    // Bytes are generated least-significant first, from the end of the
    // scratch buffer backwards; `i` walks towards the lead byte.
    let mut result = [0; MAX_VARINT_LENGTH];
    let mut i = MAX_VARINT_LENGTH - 1;
    let mut value = value;
    // `last` is true only while writing the final trail byte, which must not
    // carry the 0x80 continuation bit.
    let mut last = true;
    loop {
        if value < 32 {
            // Remaining value fits in the lead byte's 5 data bits.
            result[i] = value as u8;
            if !last {
                // Trail bytes follow: set the lead byte's extender bit.
                result[i] |= 0b00100000;
            }
            break;
        }
        // Subtract the per-byte latent value before emitting 7 more data bits.
        value -= 32;
        result[i] = (value as u8) & 0b01111111;
        if !last {
            // Interior trail bytes carry the continuation bit.
            result[i] |= 0b10000000;
        } else {
            last = false;
        }
        value >>= 7;
        i -= 1;
    }
    // The bytes are from i to the end.
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
}
/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    // Same algorithm as write_varint_meta2, but the lead byte has 4 data bits
    // (latent value 16) and its extender bit is 0b00010000.
    let mut result = [0; MAX_VARINT_LENGTH];
    let mut i = MAX_VARINT_LENGTH - 1;
    let mut value = value;
    // `last` is true only while writing the final trail byte, which must not
    // carry the 0x80 continuation bit.
    let mut last = true;
    loop {
        if value < 16 {
            // Remaining value fits in the lead byte's 4 data bits.
            result[i] = value as u8;
            if !last {
                // Trail bytes follow: set the lead byte's extender bit.
                result[i] |= 0b00010000;
            }
            break;
        }
        // Subtract the per-byte latent value before emitting 7 more data bits.
        value -= 16;
        result[i] = (value as u8) & 0b01111111;
        if !last {
            // Interior trail bytes carry the continuation bit.
            result[i] |= 0b10000000;
        } else {
            last = false;
        }
        value >>= 7;
        i -= 1;
    }
    // The bytes are from i to the end.
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
}
/// A secondary implementation that separates the latent value while computing the varint.
/// Used only in tests, as an independent cross-check for `write_varint_meta2`.
#[cfg(test)]
pub(crate) const fn write_varint_reference(
    value: usize,
) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    let mut result = [0; MAX_VARINT_LENGTH];
    if value < 32 {
        // Single-byte varint: value fits in the lead byte's data bits.
        result[0] = value as u8;
        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
    }
    // Multi-byte: start the lead byte with its extender bit (0b00100000 = 32).
    result[0] = 32;
    // Find the largest latent value not exceeding `value`; `steps` counts the
    // total number of bytes. The `next_latent == latent` test stops the loop
    // once the shift no longer grows the latent value (top of usize range).
    let mut latent = 32;
    let mut steps = 2;
    loop {
        let next_latent = (latent << 7) + 32;
        if value < next_latent || next_latent == latent {
            break;
        }
        latent = next_latent;
        steps += 1;
    }
    // Emit (value - latent) in 7-bit groups, least-significant group in the
    // last byte; interior trail bytes receive the continuation bit.
    let mut value = value - latent;
    let mut i = steps;
    while i > 0 {
        i -= 1;
        result[i] |= (value as u8) & 0b01111111;
        value >>= 7;
        if i > 0 && i < steps - 1 {
            result[i] |= 0b10000000;
        }
    }
    // The bytes are from 0 to `steps`.
    ConstArrayBuilder::from_manual_slice(result, 0, steps)
}
#[cfg(test)]
mod tests {
    //! Encode/decode tests for the M=2 and M=3 varint formats, including the
    //! exact byte layouts around each latent-value boundary.
    use super::*;
    /// One varint test vector: encoded bytes, expected leftover bytes after
    /// decoding, and the decoded value (M=2 format).
    #[derive(Debug)]
    struct TestCase<'a> {
        bytes: &'a [u8],
        remainder: &'a [u8],
        value: usize,
    }
    static CASES: &[TestCase] = &[
        TestCase {
            bytes: &[0b00000000],
            remainder: &[],
            value: 0,
        },
        TestCase {
            bytes: &[0b00001010],
            remainder: &[],
            value: 10,
        },
        TestCase {
            bytes: &[0b00011111],
            remainder: &[],
            value: 31,
        },
        // Trailing input beyond the varint must be returned untouched.
        TestCase {
            bytes: &[0b00011111, 0b10101010],
            remainder: &[0b10101010],
            value: 31,
        },
        TestCase {
            bytes: &[0b00100000, 0b00000000],
            remainder: &[],
            value: 32,
        },
        TestCase {
            bytes: &[0b00100000, 0b00000001],
            remainder: &[],
            value: 33,
        },
        TestCase {
            bytes: &[0b00100000, 0b00100000],
            remainder: &[],
            value: 64,
        },
        TestCase {
            bytes: &[0x20, 0x44],
            remainder: &[],
            value: 100,
        },
        TestCase {
            bytes: &[0b00100000, 0b01111111],
            remainder: &[],
            value: 159,
        },
        TestCase {
            bytes: &[0b00100001, 0b00000000],
            remainder: &[],
            value: 160,
        },
        TestCase {
            bytes: &[0b00100001, 0b00000001],
            remainder: &[],
            value: 161,
        },
        TestCase {
            bytes: &[0x23, 0x54],
            remainder: &[],
            value: 500,
        },
        TestCase {
            bytes: &[0b00111111, 0b01111111],
            remainder: &[],
            value: 4127, // 32 + (1 << 12) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 4128, // 32 + (1 << 12)
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b00000001],
            remainder: &[],
            value: 4129, // 32 + (1 << 12) + 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b01111111],
            remainder: &[],
            value: 4255, // 32 + (1 << 12) + 127
        },
        TestCase {
            bytes: &[0b00100000, 0b10000001, 0b00000000],
            remainder: &[],
            value: 4256, // 32 + (1 << 12) + 128
        },
        TestCase {
            bytes: &[0b00100000, 0b10000001, 0b00000001],
            remainder: &[],
            value: 4257, // 32 + (1 << 12) + 129
        },
        TestCase {
            bytes: &[0x20, 0x86, 0x68],
            remainder: &[],
            value: 5000,
        },
        TestCase {
            bytes: &[0b00100000, 0b11111111, 0b01111111],
            remainder: &[],
            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
        },
        TestCase {
            bytes: &[0b00100001, 0b10000000, 0b00000000],
            remainder: &[],
            value: 20512, // 32 + (1 << 12) + (1 << 14)
        },
        TestCase {
            bytes: &[0b00111111, 0b11111111, 0b01111111],
            remainder: &[],
            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 528416, // 32 + (1 << 12) + (1 << 19)
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
            remainder: &[],
            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
        },
        TestCase {
            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
            remainder: &[],
            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
        },
    ];
    // Decoding alone reproduces each vector's value and remainder.
    #[test]
    fn test_read() {
        for cas in CASES {
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
        }
    }
    // Both encoders agree with each other and with the hand-written vectors.
    #[test]
    fn test_read_write() {
        for cas in CASES {
            let reference_bytes = write_varint_reference(cas.value);
            assert_eq!(
                reference_bytes.len(),
                cas.bytes.len() - cas.remainder.len(),
                "{cas:?}"
            );
            assert_eq!(
                reference_bytes.as_slice(),
                &cas.bytes[0..reference_bytes.len()],
                "{cas:?}"
            );
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
            let write_bytes = write_varint_meta2(cas.value);
            assert_eq!(
                reference_bytes.as_slice(),
                write_bytes.as_slice(),
                "{cas:?}"
            );
        }
    }
    // Round-trip values of the form 0b1, 0b11, 0b111, ... up to usize::MAX.
    #[test]
    fn test_roundtrip() {
        let mut i = 0usize;
        while i < MAX_VARINT {
            let bytes = write_varint_meta2(i);
            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
            i <<= 1;
            i += 1;
        }
    }
    // Same round-trip sweep for the 3-bit-metadata format.
    #[test]
    fn test_extended_roundtrip() {
        let mut i = 0usize;
        while i < MAX_VARINT {
            let bytes = write_varint_meta3(i);
            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
            i <<= 1;
            i += 1;
        }
    }
    // usize::MAX occupies exactly MAX_VARINT_LENGTH bytes and round-trips.
    #[test]
    fn test_max() {
        let reference_bytes = write_varint_reference(MAX_VARINT);
        let write_bytes = write_varint_meta2(MAX_VARINT);
        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
        let subarray = write_bytes
            .as_const_slice()
            .get_subslice_or_panic(1, write_bytes.len());
        let (recovered_value, remainder) = read_varint_meta2(
            *write_bytes.as_const_slice().first().unwrap(),
            subarray.as_slice(),
        );
        assert!(remainder.is_empty());
        assert_eq!(recovered_value, MAX_VARINT);
        #[cfg(target_pointer_width = "64")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00100001, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b01011111, //
            ]
        );
        #[cfg(target_pointer_width = "32")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00101111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b01011111, //
            ]
        );
    }
    // NOTE(review): name is likely a typo for `test_extended_max`; it still
    // runs because of the #[test] attribute.
    #[test]
    fn text_extended_max() {
        let write_bytes = write_varint_meta3(MAX_VARINT);
        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
        assert!(remainder.is_empty());
        assert_eq!(recovered_value, MAX_VARINT);
        #[cfg(target_pointer_width = "64")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00010001, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b01101111, //
            ]
        );
        #[cfg(target_pointer_width = "32")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00011111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b01101111, //
            ]
        );
    }
    // The smallest value of each byte length equals the documented latent value.
    #[test]
    fn test_latent_values() {
        // Same values documented in the module docs: M=2
        let m2 = read_varint_meta2;
        assert_eq!(m2(0, &[]).0, 0);
        assert_eq!(m2(0x20, &[0x00]).0, 32);
        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
        // Same values documented in the module docs: M=3
        let m3 = read_varint_meta3;
        assert_eq!(m3(0, &[]).0, 0);
        assert_eq!(m3(0x10, &[0x00]).0, 16);
        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
    }
}

888
vendor/zerotrie/src/zerotrie.rs vendored Normal file
View File

@@ -0,0 +1,888 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::reader;
use core::borrow::Borrow;
#[cfg(feature = "alloc")]
use crate::{
builder::bytestr::ByteStr, builder::nonconst::ZeroTrieBuilder, error::ZeroTrieBuildError,
};
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, collections::BTreeMap, collections::VecDeque, string::String, vec::Vec};
#[cfg(feature = "litemap")]
use litemap::LiteMap;
/// A data structure that compactly maps from byte sequences to integers.
///
/// There are several variants of `ZeroTrie` which are very similar but are optimized
/// for different use cases:
///
/// - [`ZeroTrieSimpleAscii`] is the most compact structure. Very fast for small data.
///   Only stores ASCII-encoded strings. Can be const-constructed!
/// - [`ZeroTriePerfectHash`] is also compact, but it also supports arbitrary binary
///   strings. It also scales better to large data. Cannot be const-constructed.
/// - [`ZeroTrieExtendedCapacity`] can be used if more than 2^32 bytes are required.
///
/// You can create a `ZeroTrie` directly, in which case the most appropriate
/// backing implementation will be chosen.
///
/// # Backing Store
///
/// The data structure has a flexible backing data store. The only requirement for most
/// functionality is that it implement `AsRef<[u8]>`. All of the following are valid
/// ZeroTrie types:
///
/// - `ZeroTrie<[u8]>` (dynamically sized type: must be stored in a reference or Box)
/// - `ZeroTrie<&[u8]>` (borrows its data from a u8 buffer)
/// - `ZeroTrie<Vec<u8>>` (fully owned data)
/// - `ZeroTrie<ZeroVec<u8>>` (the recommended borrowed-or-owned signature)
/// - `Cow<ZeroTrie<[u8]>>` (another borrowed-or-owned signature)
/// - `ZeroTrie<Cow<[u8]>>` (another borrowed-or-owned signature)
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTrie;
///
/// let mut map = LiteMap::<&[u8], usize>::new_vec();
/// map.insert("foo".as_bytes(), 1);
/// map.insert("bar".as_bytes(), 2);
/// map.insert("bazzoo".as_bytes(), 3);
///
/// let trie = ZeroTrie::try_from(&map)?;
///
/// assert_eq!(trie.get("foo"), Some(1));
/// assert_eq!(trie.get("bar"), Some(2));
/// assert_eq!(trie.get("bazzoo"), Some(3));
/// assert_eq!(trie.get("unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Note: Removing the following derive causes no test failures in this crate;
// it is presumably kept for downstream users of `yoke` — verify before removing.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
pub struct ZeroTrie<Store>(pub(crate) ZeroTrieFlavor<Store>);
/// The concrete variant wrapped by a [`ZeroTrie`], dispatched on at runtime
/// by the `impl_dispatch!` macro.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum ZeroTrieFlavor<Store> {
    /// ASCII-only keys; the most compact representation.
    SimpleAscii(ZeroTrieSimpleAscii<Store>),
    /// Arbitrary byte-string keys; scales better to large data.
    PerfectHash(ZeroTriePerfectHash<Store>),
    /// For tries requiring more than 2^32 bytes.
    ExtendedCapacity(ZeroTrieExtendedCapacity<Store>),
}
/// A data structure that compactly maps from ASCII strings to integers.
///
/// For more information, see [`ZeroTrie`].
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"foo"[..], 1);
/// map.insert(b"bar", 2);
/// map.insert(b"bazzoo", 3);
///
/// let trie = ZeroTrieSimpleAscii::try_from(&map)?;
///
/// assert_eq!(trie.get(b"foo"), Some(1));
/// assert_eq!(trie.get(b"bar"), Some(2));
/// assert_eq!(trie.get(b"bazzoo"), Some(3));
/// assert_eq!(trie.get(b"unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
///
/// The trie can only store ASCII bytes; a string with non-ASCII always returns None:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// assert!(trie.get(b"ab\xFF").is_none());
/// ```
#[repr(transparent)] // enables zero-cost ref casting from the store; see `transparent_ref_from_store`
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTrieSimpleAscii<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTrieSimpleAscii<Store> {
    /// Casts a reference to the store into a reference to `Self`.
    ///
    /// Sound because `Self` is `#[repr(transparent)]` over `Store`.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTrieSimpleAscii<Store> {
    /// Wraps this specific variant in the general-purpose [`ZeroTrie`] wrapper,
    /// tagging it with the `SimpleAscii` flavor.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        let flavor = ZeroTrieFlavor::SimpleAscii(self);
        ZeroTrie(flavor)
    }
}
/// A data structure that compactly maps from ASCII strings to integers
/// in a case-insensitive way.
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroAsciiIgnoreCaseTrie;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"foo"[..], 1);
/// map.insert(b"Bar", 2);
/// map.insert(b"Bazzoo", 3);
///
/// let trie = ZeroAsciiIgnoreCaseTrie::try_from(&map)?;
///
/// assert_eq!(trie.get(b"foo"), Some(1));
/// assert_eq!(trie.get(b"bar"), Some(2));
/// assert_eq!(trie.get(b"BAR"), Some(2));
/// assert_eq!(trie.get(b"bazzoo"), Some(3));
/// assert_eq!(trie.get(b"unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
///
/// Strings with different cases of the same character at the same offset are not allowed:
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroAsciiIgnoreCaseTrie;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"bar"[..], 1);
/// // OK: 'r' and 'Z' are different letters
/// map.insert(b"baZ", 2);
/// // Bad: we already inserted 'r' so we cannot also insert 'R' at the same position
/// map.insert(b"baR", 2);
///
/// ZeroAsciiIgnoreCaseTrie::try_from(&map).expect_err("mixed-case strings!");
/// ```
#[repr(transparent)] // enables zero-cost ref casting from the store; see `transparent_ref_from_store`
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroAsciiIgnoreCaseTrie<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroAsciiIgnoreCaseTrie<Store> {
    /// Casts a reference to the store into a reference to `Self`.
    ///
    /// Sound because `Self` is `#[repr(transparent)]` over `Store`.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
// Note: ZeroAsciiIgnoreCaseTrie is not a variant of ZeroTrie so there is no `into_zerotrie`
/// A data structure that compactly maps from byte strings to integers.
///
/// For more information, see [`ZeroTrie`].
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTriePerfectHash;
///
/// let mut map = LiteMap::<&[u8], usize>::new_vec();
/// map.insert("foo".as_bytes(), 1);
/// map.insert("bår".as_bytes(), 2);
/// map.insert("båzzøø".as_bytes(), 3);
///
/// let trie = ZeroTriePerfectHash::try_from(&map)?;
///
/// assert_eq!(trie.get("foo".as_bytes()), Some(1));
/// assert_eq!(trie.get("bår".as_bytes()), Some(2));
/// assert_eq!(trie.get("båzzøø".as_bytes()), Some(3));
/// assert_eq!(trie.get("bazzoo".as_bytes()), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
#[repr(transparent)] // enables zero-cost ref casting from the store; see `transparent_ref_from_store`
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTriePerfectHash<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTriePerfectHash<Store> {
    /// Casts a reference to the store into a reference to `Self`.
    ///
    /// Sound because `Self` is `#[repr(transparent)]` over `Store`.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTriePerfectHash<Store> {
    /// Wraps this specific variant in the general-purpose [`ZeroTrie`] wrapper,
    /// tagging it with the `PerfectHash` flavor.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        let flavor = ZeroTrieFlavor::PerfectHash(self);
        ZeroTrie(flavor)
    }
}
/// A data structure that maps from a large number of byte strings to integers.
///
/// For more information, see [`ZeroTrie`].
#[repr(transparent)] // enables zero-cost ref casting from the store; see `transparent_ref_from_store`
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTrieExtendedCapacity<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTrieExtendedCapacity<Store> {
    /// Casts a reference to the store into a reference to `Self`.
    ///
    /// Sound because `Self` is `#[repr(transparent)]` over `Store`.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTrieExtendedCapacity<Store> {
    /// Wraps this specific variant in the general-purpose [`ZeroTrie`] wrapper,
    /// tagging it with the `ExtendedCapacity` flavor.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        let flavor = ZeroTrieFlavor::ExtendedCapacity(self);
        ZeroTrie(flavor)
    }
}
/// Implements the common API surface for one `ZeroTrie` subtype.
///
/// Macro parameters:
/// - `$name`: the subtype struct (e.g. `ZeroTrieSimpleAscii`)
/// - `$iter_element`: the owned key type yielded by iteration (`String` or `Vec<u8>`)
/// - `$iter_fn`: the reader function that constructs the iterator
/// - `$iter_ty`: the concrete type returned by `.iter()`
/// - `$cnv_fn`: converts a `$iter_element` into `Box<[u8]>` for the `*_bytes` exports
macro_rules! impl_zerotrie_subtype {
    ($name:ident, $iter_element:ty, $iter_fn:path, $iter_ty:ty, $cnv_fn:path) => {
        // Constructors and store accessors, available for any store type.
        impl<Store> $name<Store> {
            /// Create a trie directly from a store.
            ///
            /// If the store does not contain valid bytes, unexpected behavior may occur.
            #[inline]
            pub const fn from_store(store: Store) -> Self {
                Self { store }
            }
            /// Takes the byte store from this trie.
            #[inline]
            pub fn into_store(self) -> Store {
                self.store
            }
            /// Converts this trie's store to a different store implementing the `From` trait.
            ///
            #[doc = concat!("For example, use this to change `", stringify!($name), "<Vec<u8>>` to `", stringify!($name), "<Cow<[u8]>>`.")]
            ///
            /// # Examples
            ///
            /// ```
            /// use std::borrow::Cow;
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: ", stringify!($name), "<Vec<u8>> = ", stringify!($name), "::from_bytes(b\"abc\\x85\").to_owned();")]
            #[doc = concat!("let cow: ", stringify!($name), "<Cow<[u8]>> = trie.convert_store();")]
            ///
            /// assert_eq!(cow.get(b"abc"), Some(5));
            /// ```
            pub fn convert_store<X: From<Store>>(self) -> $name<X> {
                $name::<X>::from_store(X::from(self.store))
            }
        }
        // Read-only queries, available whenever the store can be viewed as bytes.
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Queries the trie for a string.
            // Note: We do not need the Borrow trait's guarantees, so we use
            // the more general AsRef trait.
            pub fn get<K>(&self, key: K) -> Option<usize> where K: AsRef<[u8]> {
                reader::get_parameterized::<Self>(self.store.as_ref(), key.as_ref())
            }
            /// Returns `true` if the trie is empty.
            #[inline]
            pub fn is_empty(&self) -> bool {
                self.store.as_ref().is_empty()
            }
            /// Returns the size of the trie in number of bytes.
            ///
            /// To get the number of keys in the trie, use `.iter().count()`:
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            /// // A trie with two values: "abc" and "abcdef"
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x80def\\x81\");")]
            ///
            /// assert_eq!(8, trie.byte_len());
            /// assert_eq!(2, trie.iter().count());
            /// ```
            #[inline]
            pub fn byte_len(&self) -> usize {
                self.store.as_ref().len()
            }
            /// Returns the bytes contained in the underlying store.
            #[inline]
            pub fn as_bytes(&self) -> &[u8] {
                self.store.as_ref()
            }
            /// Returns this trie as a reference transparent over a byte slice.
            #[inline]
            pub fn as_borrowed(&self) -> &$name<[u8]> {
                $name::from_bytes(self.store.as_ref())
            }
            /// Returns a trie with a store borrowing from this trie.
            #[inline]
            pub fn as_borrowed_slice(&self) -> $name<&[u8]> {
                $name::from_store(self.store.as_ref())
            }
        }
        impl<Store> AsRef<$name<[u8]>> for $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn as_ref(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        // Owned conversion and iteration (require the global allocator).
        #[cfg(feature = "alloc")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Converts a possibly-borrowed $name to an owned one.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x85\");")]
            #[doc = concat!("let owned: ", stringify!($name), "<Vec<u8>> = trie.to_owned();")]
            ///
            /// assert_eq!(trie.get(b"abc"), Some(5));
            /// assert_eq!(owned.get(b"abc"), Some(5));
            /// ```
            #[inline]
            pub fn to_owned(&self) -> $name<Vec<u8>> {
                $name::from_store(
                    Vec::from(self.store.as_ref()),
                )
            }
            /// Returns an iterator over the key/value pairs in this trie.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            /// // A trie with two values: "abc" and "abcdef"
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x80def\\x81\");")]
            ///
            /// let mut it = trie.iter();
            /// assert_eq!(it.next(), Some(("abc".into(), 0)));
            /// assert_eq!(it.next(), Some(("abcdef".into(), 1)));
            /// assert_eq!(it.next(), None);
            /// ```
            #[inline]
            pub fn iter(&self) -> $iter_ty {
                $iter_fn(self.as_bytes())
            }
        }
        // Unsized (slice-backed) tries can only be created by reference.
        impl $name<[u8]> {
            /// Casts from a byte slice to a reference to a trie with the same lifetime.
            ///
            /// If the bytes are not a valid trie, unexpected behavior may occur.
            #[inline]
            pub fn from_bytes(trie: &[u8]) -> &Self {
                Self::transparent_ref_from_store(trie)
            }
        }
        // Internal builder entry point; the tuples must already be sorted.
        #[cfg(feature = "alloc")]
        impl $name<Vec<u8>> {
            pub(crate) fn try_from_tuple_slice(items: &[(&ByteStr, usize)]) -> Result<Self, ZeroTrieBuildError> {
                use crate::options::ZeroTrieWithOptions;
                ZeroTrieBuilder::<VecDeque<u8>>::from_sorted_tuple_slice(
                    items,
                    Self::OPTIONS,
                )
                .map(|s| Self {
                    store: s.to_bytes(),
                })
            }
        }
        // NOTE(review): the lifetime parameter `'a` below is unused; this is
        // legal (FromIterator has no associated types) but it could be removed.
        #[cfg(feature = "alloc")]
        impl<'a, K> FromIterator<(K, usize)> for $name<Vec<u8>>
        where
            K: AsRef<[u8]>
        {
            fn from_iter<T: IntoIterator<Item = (K, usize)>>(iter: T) -> Self {
                use crate::options::ZeroTrieWithOptions;
                use crate::builder::nonconst::ZeroTrieBuilder;
                // Panics (unwrap) if the builder reports an error.
                ZeroTrieBuilder::<VecDeque<u8>>::from_bytes_iter(
                    iter,
                    Self::OPTIONS
                )
                .map(|s| Self {
                    store: s.to_bytes(),
                })
                .unwrap()
            }
        }
        #[cfg(feature = "alloc")]
        impl<'a, K> TryFrom<&'a BTreeMap<K, usize>> for $name<Vec<u8>>
        where
            K: Borrow<[u8]>
        {
            type Error = crate::error::ZeroTrieBuildError;
            fn try_from(map: &'a BTreeMap<K, usize>) -> Result<Self, Self::Error> {
                // BTreeMap iterates in sorted key order, as the builder requires.
                let tuples: Vec<(&[u8], usize)> = map
                    .iter()
                    .map(|(k, v)| (k.borrow(), *v))
                    .collect();
                let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples);
                Self::try_from_tuple_slice(byte_str_slice)
            }
        }
        #[cfg(feature = "alloc")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized
        {
            /// Exports the data from this ZeroTrie type into a BTreeMap.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            /// use std::collections::BTreeMap;
            ///
            #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")]
            /// let items = trie.to_btreemap();
            ///
            /// assert_eq!(items.len(), 2);
            ///
            #[doc = concat!("let recovered_trie: ", stringify!($name), "<Vec<u8>> = items")]
            ///     .into_iter()
            ///     .collect();
            /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes());
            /// ```
            pub fn to_btreemap(&self) -> BTreeMap<$iter_element, usize> {
                self.iter().collect()
            }
            #[allow(dead_code)] // not needed for ZeroAsciiIgnoreCaseTrie
            pub(crate) fn to_btreemap_bytes(&self) -> BTreeMap<Box<[u8]>, usize> {
                self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect()
            }
        }
        #[cfg(feature = "alloc")]
        impl<Store> From<&$name<Store>> for BTreeMap<$iter_element, usize>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn from(other: &$name<Store>) -> Self {
                other.to_btreemap()
            }
        }
        #[cfg(feature = "litemap")]
        impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for $name<Vec<u8>>
        where
            K: Borrow<[u8]>,
            S: litemap::store::StoreIterable<'a, K, usize>,
        {
            type Error = crate::error::ZeroTrieBuildError;
            fn try_from(map: &'a LiteMap<K, usize, S>) -> Result<Self, Self::Error> {
                // LiteMap iterates in sorted key order, as the builder requires.
                let tuples: Vec<(&[u8], usize)> = map
                    .iter()
                    .map(|(k, v)| (k.borrow(), *v))
                    .collect();
                let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples);
                Self::try_from_tuple_slice(byte_str_slice)
            }
        }
        #[cfg(feature = "litemap")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Exports the data from this ZeroTrie type into a LiteMap.
            ///
            /// ✨ *Enabled with the `litemap` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            /// use litemap::LiteMap;
            ///
            #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")]
            ///
            /// let items = trie.to_litemap();
            /// assert_eq!(items.len(), 2);
            ///
            #[doc = concat!("let recovered_trie: ", stringify!($name), "<Vec<u8>> = items")]
            ///     .iter()
            ///     .map(|(k, v)| (k, *v))
            ///     .collect();
            /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes());
            /// ```
            pub fn to_litemap(&self) -> LiteMap<$iter_element, usize> {
                self.iter().collect()
            }
            #[allow(dead_code)] // not needed for ZeroAsciiIgnoreCaseTrie
            pub(crate) fn to_litemap_bytes(&self) -> LiteMap<Box<[u8]>, usize> {
                self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect()
            }
        }
        #[cfg(feature = "litemap")]
        impl<Store> From<&$name<Store>> for LiteMap<$iter_element, usize>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn from(other: &$name<Store>) -> Self {
                other.to_litemap()
            }
        }
        #[cfg(feature = "litemap")]
        impl $name<Vec<u8>>
        {
            // NOTE(review): only compiled with `serde`; presumably called from
            // the serde impls elsewhere in the crate — not visible in this file.
            #[cfg(feature = "serde")]
            pub(crate) fn try_from_serde_litemap(items: &LiteMap<Box<ByteStr>, usize>) -> Result<Self, ZeroTrieBuildError> {
                let lm_borrowed: LiteMap<&ByteStr, usize> = items.to_borrowed_keys();
                Self::try_from_tuple_slice(lm_borrowed.as_slice())
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        impl Borrow<$name<[u8]>> for $name<&[u8]> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        #[cfg(feature = "alloc")]
        impl Borrow<$name<[u8]>> for $name<Box<[u8]>> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        #[cfg(feature = "alloc")]
        impl Borrow<$name<[u8]>> for $name<Vec<u8>> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        #[cfg(feature = "alloc")]
        impl alloc::borrow::ToOwned for $name<[u8]> {
            type Owned = $name<Box<[u8]>>;
            #[doc = concat!("This impl allows [`", stringify!($name), "`] to be used inside of a [`Cow`](alloc::borrow::Cow).")]
            ///
            #[doc = concat!("Note that it is also possible to use `", stringify!($name), "<ZeroVec<u8>>` for a similar result.")]
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            /// use std::borrow::Cow;
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: Cow<", stringify!($name), "<[u8]>> = Cow::Borrowed(", stringify!($name), "::from_bytes(b\"abc\\x85\"));")]
            /// assert_eq!(trie.get(b"abc"), Some(5));
            /// ```
            fn to_owned(&self) -> Self::Owned {
                let bytes: &[u8] = self.store.as_ref();
                $name::from_store(
                    Vec::from(bytes).into_boxed_slice(),
                )
            }
        }
        // TODO(#2778): Auto-derive these impls based on the repr(transparent).
        //
        // Safety (based on the safety checklist on the VarULE trait):
        // 1. `$name` does not include any uninitialized or padding bytes as it is `repr(transparent)`
        // over a `VarULE` type, `Store`, as evidenced by the existence of `transparent_ref_from_store()`
        // 2. `$name` is aligned to 1 byte for the same reason
        // 3. The impl of `validate_bytes()` returns an error if any byte is not valid (passed down to `VarULE` impl of `Store`)
        // 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (passed down to `VarULE` impl of `Store`)
        // 5. The impl of `from_bytes_unchecked()` returns a reference to the same data.
        // 6. `parse_bytes()` is left to its default impl
        // 7. byte equality is semantic equality
        #[cfg(feature = "zerovec")]
        unsafe impl<Store> zerovec::ule::VarULE for $name<Store>
        where
            Store: zerovec::ule::VarULE,
        {
            #[inline]
            fn validate_bytes(bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
                Store::validate_bytes(bytes)
            }
            #[inline]
            unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
                // Safety: we can pass down the validity invariant to Store
                Self::transparent_ref_from_store(Store::from_bytes_unchecked(bytes))
            }
        }
        #[cfg(feature = "zerofrom")]
        impl<'zf, Store1, Store2> zerofrom::ZeroFrom<'zf, $name<Store1>> for $name<Store2>
        where
            Store2: zerofrom::ZeroFrom<'zf, Store1>,
        {
            #[inline]
            fn zero_from(other: &'zf $name<Store1>) -> Self {
                $name::from_store(zerofrom::ZeroFrom::zero_from(&other.store))
            }
        }
    };
}
/// Converts an owned `String` into its UTF-8 bytes as a boxed slice.
#[cfg(feature = "alloc")]
fn string_to_box_u8(input: String) -> Box<[u8]> {
    // Route through `Vec<u8>`: `into_bytes` reuses the String's buffer and
    // `into_boxed_slice` shrinks it to fit, producing the same bytes as the
    // `into_boxed_str().into_boxed_bytes()` formulation.
    input.into_bytes().into_boxed_slice()
}
/// Iterator over `(String, usize)` pairs, used as the `.iter()` type for the
/// ASCII trie flavors: maps the raw byte-keyed iterator to `String` keys.
#[doc(hidden)] // subject to change
#[cfg(feature = "alloc")]
pub type ZeroTrieStringIterator<'a> =
    core::iter::Map<reader::ZeroTrieIterator<'a>, fn((Vec<u8>, usize)) -> (String, usize)>;
// ASCII flavors iterate with `String` keys; `string_to_box_u8` converts the
// keys back to bytes for the internal `*_bytes` exports.
impl_zerotrie_subtype!(
    ZeroTrieSimpleAscii,
    String,
    reader::get_iter_ascii_or_panic,
    ZeroTrieStringIterator<'_>,
    string_to_box_u8
);
impl_zerotrie_subtype!(
    ZeroAsciiIgnoreCaseTrie,
    String,
    reader::get_iter_ascii_or_panic,
    ZeroTrieStringIterator<'_>,
    string_to_box_u8
);
// Byte-string flavors iterate with `Vec<u8>` keys directly.
impl_zerotrie_subtype!(
    ZeroTriePerfectHash,
    Vec<u8>,
    reader::get_iter_phf,
    reader::ZeroTrieIterator<'_>,
    Vec::into_boxed_slice
);
impl_zerotrie_subtype!(
    ZeroTrieExtendedCapacity,
    Vec<u8>,
    reader::get_iter_phf,
    reader::ZeroTrieIterator<'_>,
    Vec::into_boxed_slice
);
/// Generates a `match` over the three [`ZeroTrieFlavor`] variants, forwarding
/// a call to the wrapped subtype. The arms differ by receiver (by value vs.
/// by reference), argument count, and whether the result is re-wrapped into
/// a [`ZeroTrie`] of the same flavor.
macro_rules! impl_dispatch {
    // By-value receiver, no arguments.
    ($self:ident, $inner_fn:ident()) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(),
        }
    };
    // By-value receiver; the result is re-wrapped with the same flavor.
    ($self:ident, $inner_fn:ident().into_zerotrie()) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn().into_zerotrie(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn().into_zerotrie(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn().into_zerotrie(),
        }
    };
    // By-reference receiver, no arguments.
    (&$self:ident, $inner_fn:ident()) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(),
        }
    };
    // By-value receiver, one argument.
    ($self:ident, $inner_fn:ident($arg:ident)) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg),
        }
    };
    // By-reference receiver, one argument.
    (&$self:ident, $inner_fn:ident($arg:ident)) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg),
        }
    };
    // Trait function applied to the inner subtype; result re-wrapped with the same flavor.
    (&$self:ident, $trait:ident::$inner_fn:ident()) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => {
                ZeroTrie(ZeroTrieFlavor::SimpleAscii($trait::$inner_fn(subtype)))
            }
            ZeroTrieFlavor::PerfectHash(subtype) => {
                ZeroTrie(ZeroTrieFlavor::PerfectHash($trait::$inner_fn(subtype)))
            }
            ZeroTrieFlavor::ExtendedCapacity(subtype) => {
                ZeroTrie(ZeroTrieFlavor::ExtendedCapacity($trait::$inner_fn(subtype)))
            }
        }
    };
}
impl<Store> ZeroTrie<Store> {
    /// Takes the byte store from this trie.
    pub fn into_store(self) -> Store {
        // Delegate to whichever flavor is wrapped.
        impl_dispatch!(self, into_store())
    }
    /// Converts this trie's store to a different store implementing the `From` trait.
    ///
    /// For example, use this to change `ZeroTrie<Vec<u8>>` to `ZeroTrie<Cow<[u8]>>`.
    pub fn convert_store<NewStore>(self) -> ZeroTrie<NewStore>
    where
        NewStore: From<Store>,
    {
        // Convert the inner subtype, then re-wrap it in the same flavor.
        impl_dispatch!(self, convert_store().into_zerotrie())
    }
}
// Read-only queries, each delegated to the wrapped flavor.
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Queries the trie for a string.
    pub fn get<K>(&self, key: K) -> Option<usize>
    where
        K: AsRef<[u8]>,
    {
        impl_dispatch!(&self, get(key))
    }
    /// Returns `true` if the trie is empty.
    pub fn is_empty(&self) -> bool {
        impl_dispatch!(&self, is_empty())
    }
    /// Returns the size of the trie in number of bytes.
    ///
    /// To get the number of keys in the trie, use `.iter().count()`.
    pub fn byte_len(&self) -> usize {
        impl_dispatch!(&self, byte_len())
    }
}
#[cfg(feature = "alloc")]
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Exports the data from this ZeroTrie into a BTreeMap.
    ///
    /// Keys are returned as raw bytes regardless of the underlying flavor.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    pub fn to_btreemap(&self) -> BTreeMap<Box<[u8]>, usize> {
        impl_dispatch!(&self, to_btreemap_bytes())
    }
}
#[cfg(feature = "litemap")]
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Exports the data from this ZeroTrie into a LiteMap.
    ///
    /// Keys are returned as raw bytes regardless of the underlying flavor.
    ///
    /// ✨ *Enabled with the `litemap` Cargo feature.*
    pub fn to_litemap(&self) -> LiteMap<Box<[u8]>, usize> {
        impl_dispatch!(&self, to_litemap_bytes())
    }
}
#[cfg(feature = "alloc")]
impl ZeroTrie<Vec<u8>> {
    /// Builds a [`ZeroTrie`] from sorted (key, value) tuples, choosing the
    /// most appropriate backing flavor: small all-ASCII data sets (fewer than
    /// 512 entries) get the compact `ZeroTrieSimpleAscii`; everything else
    /// gets `ZeroTriePerfectHash`, which supports arbitrary bytes and scales
    /// better to large data.
    pub(crate) fn try_from_tuple_slice(
        items: &[(&ByteStr, usize)],
    ) -> Result<Self, ZeroTrieBuildError> {
        let is_all_ascii = items.iter().all(|(s, _)| s.is_all_ascii());
        if is_all_ascii && items.len() < 512 {
            ZeroTrieSimpleAscii::try_from_tuple_slice(items).map(|x| x.into_zerotrie())
        } else {
            ZeroTriePerfectHash::try_from_tuple_slice(items).map(|x| x.into_zerotrie())
        }
    }
}
#[cfg(feature = "alloc")]
impl<K> FromIterator<(K, usize)> for ZeroTrie<Vec<u8>>
where
    K: AsRef<[u8]>,
{
    /// Builds a [`ZeroTrie`] from key/value pairs.
    ///
    /// # Panics
    ///
    /// Panics if the trie cannot be built from the pairs (presumably, e.g.,
    /// on duplicate keys); use `try_from` on a map type for a fallible build.
    fn from_iter<T: IntoIterator<Item = (K, usize)>>(iter: T) -> Self {
        // We need two Vecs because the first one anchors the `K`s that the second one borrows.
        let items = Vec::from_iter(iter);
        let mut items: Vec<(&[u8], usize)> = items.iter().map(|(k, v)| (k.as_ref(), *v)).collect();
        // The tuple-slice builder requires sorted input.
        items.sort();
        let byte_str_slice = ByteStr::from_byte_slice_with_value(&items);
        #[expect(clippy::unwrap_used)] // FromIterator is panicky
        Self::try_from_tuple_slice(byte_str_slice).unwrap()
    }
}
#[cfg(feature = "databake")]
impl<Store> databake::Bake for ZeroTrie<Store>
where
    Store: databake::Bake,
{
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        use databake::*;
        // Bake the inner subtype, then emit code that re-wraps it in a ZeroTrie.
        let inner = impl_dispatch!(&self, bake(env));
        quote! { #inner.into_zerotrie() }
    }
}
#[cfg(feature = "databake")]
impl<Store> databake::BakeSize for ZeroTrie<Store>
where
    Store: databake::BakeSize,
{
    /// Reports the borrowed size of the wrapped subtype.
    fn borrows_size(&self) -> usize {
        impl_dispatch!(&self, borrows_size())
    }
}
#[cfg(feature = "zerofrom")]
impl<'zf, Store1, Store2> zerofrom::ZeroFrom<'zf, ZeroTrie<Store1>> for ZeroTrie<Store2>
where
    Store2: zerofrom::ZeroFrom<'zf, Store1>,
{
    fn zero_from(other: &'zf ZeroTrie<Store1>) -> Self {
        use zerofrom::ZeroFrom;
        // Zero-copy-convert the inner subtype, preserving the flavor.
        impl_dispatch!(&other, ZeroFrom::zero_from())
    }
}