chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

1
vendor/zerotrie/.cargo-checksum.json vendored Normal file
View File

@@ -0,0 +1 @@
{"files":{".cargo_vcs_info.json":"e95d3df71480085b0d60cb0121e869c023259ad73cc13d707248cd99468483d2","Cargo.lock":"8b1744764722e5f57fb49907b9546885d49f93bc017700f2b074093095672bb3","Cargo.toml":"d446611694d0081007eb0c98ca3aceb5d11c4ff4abad72e540d5548c16e1586b","Cargo.toml.orig":"f1fe4cb435d65841903f7c22c340eee8f72e62d0a339e682316d460b067c35c3","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"21c5a3a4999c937473283001d787b27b7f92942a4cc9df99e63466346205e777","benches/overview.rs":"f1207ef0c53c743fbf61e1ba886dda70cdba338fb357e1f61ca9285cc6c5f49e","examples/first_weekday_for_region.rs":"2ee02fb39378be77ea897172c47e7cb2749a5a127a4efae101b30e6bb762ddfb","src/builder/branch_meta.rs":"31b306f321f10655a2efc2e395a2a65e6df6da8b1b71963772ef4407061a7eda","src/builder/bytestr.rs":"8c9040bbf4f5068d837dca6b1cac0323858fa3d87cb00fff2e14453bfc3e9d8b","src/builder/konst/builder.rs":"4c833abce8d9b95359f05f1de4435e97b351c9258f11114e8817c1d405bbb32f","src/builder/konst/mod.rs":"160d6d2b0da65101e87371aadcf30aaa96f5851a58a23794b11d596f384b635c","src/builder/konst/store.rs":"c8405e646a960227765148e4bc9b3e5bdca7d6854cf7ec9897e53408cbd6370d","src/builder/litemap.rs":"7c9b2f5012a4b5925795a3b6cfba6f97e653d13d506636a10ae7dea33ef8f36f","src/builder/mod.rs":"d30f23f5e328adadc3fad0dc09ebbb1eb143d78e9c36a52d48a2da8c77813775","src/builder/nonconst/builder.rs":"76b0bb1d3087d9a28e44196d31314033104d49ac5a27bfe38ea613ea1f4f68c2","src/builder/nonconst/mod.rs":"9f02f6a7ad30458d056568399b8640ce8332cedd8f88649b9c4e60254ba025a2","src/builder/nonconst/store.rs":"bb7e7a7f7b4d550da82e9c76f898474f6c1e7dd1b3e41b333c67ebed53ca1708","src/byte_phf/builder.rs":"1584d145ef10d0a5e295e35d351dc2bf2be0f5699b477d6d4b73f25a5407e1c3","src/byte_phf/cached_owned.rs":"2999ced75fba7aa8685fb1b1ee4c1dd47b6e36b55c9d5d6a10ed3aebc5a54b6b","src/byte_phf/mod.rs":"7b204592135e883804376c91cc235aafff7bbf2b1206ae47c4f98bae48234d0c","src/cursor.rs":"20fa25ce0a6aa49795db86d839fac764bcc998e701399f8bc5fefd
107d739534","src/error.rs":"487cb8807ee4fecffa0bc4c87ad94eb9b609590a2af3c98df548e6cad1dd18d8","src/helpers.rs":"01bef8c1e2d6aaa1aec6b5ab34d1b42aef78e070c87425f6a6481d1a4c487acd","src/lib.rs":"e5e117f16aeb4e494f494834505e7d454f2ebf10a8400b1bcf6d18deb60e0866","src/options.rs":"aa980fa7c58a10cf128151d15f1a6186a356ce9aba9c75c02108ca268680fdc4","src/reader.rs":"50538ff8733a6780a12bd6b79290a5f461e86a0047848f9f1ee930f1d4b15ccc","src/serde.rs":"1bc0468844b6b51100dec2b4087afe3e202e7d7a5867b054a9244c19e3f42759","src/varint.rs":"db9b4d741622b464568a1c04b15b67a8b7c8676b306ae83726678bb9381806c3","src/zerotrie.rs":"4baa681caf321b42738e535c0b33384eefc40940e0d80073c1da41a2ef6ef64b","tests/asciitrie_test.rs":"8c7779de93c28e55eed3224b53b1089b3d995c55206eba567f9064de4de7c535","tests/builder_test.rs":"916143542d4086e97b8a10243671cba50ad2daca7cbe9ecb7db89420dae5f033","tests/data/data.rs":"f8a7afcf64e8bd0f902a1d2eb829deb96c0a0fddd78f0aa80657764a91460515","tests/derive_test.rs":"3b6f86d6a460e2f8179158d0dc149923a0b06d81b7d841ff091d7d9fd46ace06","tests/ignorecase_test.rs":"28dce3e892ddf0b2ccf254b60e0d915872d74b9eab0f80522d4e7e7fefbbdc7e","tests/locale_aux_test.rs":"14bc496ba210f289b1ea9cfee2ae6be47c279f65062505dd53cd1a7acce58fba"},"package":"2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"}

7
vendor/zerotrie/.cargo_vcs_info.json vendored Normal file
View File

@@ -0,0 +1,7 @@
{
"git": {
"sha1": "29dfe2790b6cfdab94ca6a6b69f58ce54802dbf7",
"dirty": true
},
"path_in_vcs": "utils/zerotrie"
}

196
vendor/zerotrie/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,196 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "databake"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
dependencies = [
"databake-derive",
"proc-macro2",
"quote",
]
[[package]]
name = "databake-derive"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
dependencies = [
"serde_core",
]
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
dependencies = [
"proc-macro2",
]
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "syn"
version = "2.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerotrie"
version = "0.2.3"
dependencies = [
"databake",
"displaydoc",
"litemap",
"serde_core",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"databake",
"serde",
"zerofrom",
]

151
vendor/zerotrie/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,151 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.82"
name = "zerotrie"
version = "0.2.3"
authors = ["The ICU4X Project Developers"]
build = false
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
"build.rs",
]
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "A data structure that efficiently maps strings to integers"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[features]
alloc = []
databake = [
"dep:databake",
"zerovec?/databake",
]
default = []
litemap = [
"dep:litemap",
"alloc",
]
serde = [
"dep:serde_core",
"dep:litemap",
"alloc",
"litemap/serde",
"zerovec?/serde",
]
yoke = ["dep:yoke"]
zerofrom = ["dep:zerofrom"]
[lib]
name = "zerotrie"
path = "src/lib.rs"
bench = false
[[example]]
name = "first_weekday_for_region"
path = "examples/first_weekday_for_region.rs"
[[test]]
name = "asciitrie_test"
path = "tests/asciitrie_test.rs"
required-features = [
"alloc",
"litemap",
]
[[test]]
name = "builder_test"
path = "tests/builder_test.rs"
required-features = [
"alloc",
"litemap",
]
[[test]]
name = "derive_test"
path = "tests/derive_test.rs"
[[test]]
name = "ignorecase_test"
path = "tests/ignorecase_test.rs"
[[test]]
name = "locale_aux_test"
path = "tests/locale_aux_test.rs"
required-features = [
"alloc",
"litemap",
]
[[bench]]
name = "overview"
path = "benches/overview.rs"
harness = false
required-features = [
"alloc",
"litemap",
]
[dependencies.databake]
version = "0.2.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.displaydoc]
version = "0.2.3"
default-features = false
[dependencies.litemap]
version = "0.8.0"
features = ["alloc"]
optional = true
default-features = false
[dependencies.serde_core]
version = "1.0.220"
optional = true
default-features = false
[dependencies.yoke]
version = "0.8.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.zerofrom]
version = "0.1.3"
optional = true
default-features = false
[dependencies.zerovec]
version = "0.11.3"
optional = true
default-features = false

46
vendor/zerotrie/LICENSE vendored Normal file
View File

@@ -0,0 +1,46 @@
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.

41
vendor/zerotrie/README.md vendored Normal file
View File

@@ -0,0 +1,41 @@
# zerotrie [![crates.io](https://img.shields.io/crates/v/zerotrie)](https://crates.io/crates/zerotrie)
<!-- cargo-rdme start -->
A data structure offering zero-copy storage and retrieval of byte strings, with a focus
on the efficient storage of ASCII strings. Strings are mapped to `usize` values.
[`ZeroTrie`] does not support mutation because doing so would require recomputing the entire
data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`].
There are multiple variants of [`ZeroTrie`] optimized for different use cases.
## Examples
```rust
use zerotrie::ZeroTrie;
let data: &[(&str, usize)] = &[("abc", 11), ("xyz", 22), ("axyb", 33)];
let trie: ZeroTrie<Vec<u8>> = data.iter().copied().collect();
assert_eq!(trie.get("axyb"), Some(33));
assert_eq!(trie.byte_len(), 18);
```
## Internal Structure
To read about the internal structure of [`ZeroTrie`], build the docs with private modules:
```bash
cargo doc --document-private-items --all-features --no-deps --open
```
[`LiteMap`]: litemap::LiteMap
[`BTreeMap`]: alloc::collections::BTreeMap
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).

198
vendor/zerotrie/benches/overview.rs vendored Normal file
View File

@@ -0,0 +1,198 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use litemap::LiteMap;
use std::collections::HashMap;
use zerotrie::ZeroTrieExtendedCapacity;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroHashMap;
use zerovec::ZeroMap;
// Shared fixtures: pulled in textually via `include!` so the benches and the
// integration tests operate on exactly the same data definitions.
mod testdata {
include!("../tests/data/data.rs");
}
/// Benchmarks `get` lookups over the small "basic" dataset for every map
/// flavor: the three ZeroTrie variants, ZeroMap, HashMap, and ZeroHashMap.
fn get_basic_bench(c: &mut Criterion) {
    let mut group = c.benchmark_group("get/basic");
    // NOTE: All the trie data are the same for basic data
    let trie_bytes = testdata::basic::TRIE_ASCII;
    let cases = testdata::basic::DATA_ASCII;
    group.bench_function("SimpleAscii", |bencher| {
        let trie = ZeroTrieSimpleAscii::from_bytes(trie_bytes);
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("PerfectHash", |bencher| {
        let trie = ZeroTriePerfectHash::from_bytes(trie_bytes);
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("ExtendedCapacity", |bencher| {
        let trie = ZeroTrieExtendedCapacity::from_bytes(trie_bytes);
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("ZeroMap/u32", |bencher| {
        let zm: ZeroMap<[u8], u32> = cases.iter().map(|(k, v)| (*k, *v as u32)).collect();
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected as u32), black_box(&zm).get_copied(key));
            }
        });
    });
    group.bench_function("ZeroMap/u8", |bencher| {
        let zm: ZeroMap<[u8], u8> = cases.iter().map(|(k, v)| (*k, *v as u8)).collect();
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected as u8), black_box(&zm).get_copied(key));
            }
        });
    });
    group.bench_function("HashMap", |bencher| {
        let hm: HashMap<&[u8], usize> = cases.iter().copied().collect();
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(expected), black_box(&hm).get(key));
            }
        });
    });
    group.bench_function("ZeroHashMap/u8", |bencher| {
        let zhm: ZeroHashMap<[u8], u8> = cases.iter().map(|(k, v)| (*k, *v as u8)).collect();
        bencher.iter(|| {
            for (key, expected) in black_box(cases) {
                assert_eq!(Some(*expected as u8), black_box(&zhm).get(key).copied());
            }
        });
    });
}
/// Benchmarks against a 10% sample of the locale subtag strings.
fn get_subtags_bench_medium(c: &mut Criterion) {
    let strings = testdata::short_subtags_10pct::STRINGS;
    get_subtags_bench_helper(
        c.benchmark_group("get/subtags_10pct"),
        strings,
        testdata::strings_to_litemap(strings),
    );
}
/// Benchmarks against the full set of locale subtag strings.
fn get_subtags_bench_large(c: &mut Criterion) {
    let strings = testdata::short_subtags::STRINGS;
    get_subtags_bench_helper(
        c.benchmark_group("get/subtags_full"),
        strings,
        testdata::strings_to_litemap(strings),
    );
}
/// Shared body for the subtag benchmarks: runs every map flavor against the
/// same string list, where each key's expected value is its index in `strings`.
fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(
    mut group: criterion::BenchmarkGroup<M>,
    strings: &[&str],
    litemap: LiteMap<&[u8], usize>,
) {
    group.bench_function("SimpleAscii", |bencher| {
        let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("PerfectHash", |bencher| {
        let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("ExtendedCapacity", |bencher| {
        let trie = ZeroTrieExtendedCapacity::try_from(&litemap).unwrap();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index), black_box(&trie).get(key));
            }
        });
    });
    group.bench_function("ZeroMap/u32", |bencher| {
        let zm: ZeroMap<[u8], u32> = litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index as u32), black_box(&zm).get_copied(key.as_bytes()));
            }
        });
    });
    group.bench_function("ZeroMap/u8", |bencher| {
        let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index as u8), black_box(&zm).get_copied(key.as_bytes()));
            }
        });
    });
    group.bench_function("HashMap", |bencher| {
        let hm: HashMap<&[u8], u32> = litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index as u32), black_box(&hm).get(key.as_bytes()).copied());
            }
        });
    });
    group.bench_function("ZeroHashMap/u8", |bencher| {
        let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
        bencher.iter(|| {
            for (index, key) in black_box(strings).iter().enumerate() {
                assert_eq!(Some(index as u8), black_box(&zhm).get(key.as_bytes()).copied());
            }
        });
    });
    group.finish();
}
// Register the three benchmark entry points and let criterion generate the
// harness `main` (Cargo.toml sets `harness = false` for this bench target).
criterion_group!(
benches,
get_basic_bench,
get_subtags_bench_medium,
get_subtags_bench_large
);
criterion_main!(benches);

View File

@@ -0,0 +1,219 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
// This example demonstrates the use of ZeroTrieSimpleAscii to look up data based on a region code.
#![allow(dead_code)]
#![no_main] // https://github.com/unicode-org/icu4x/issues/395
icu_benchmark_macros::instrument!();
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
// Weekday constants using the 1-based numbering visible below
// (Monday = 1, ..., Sunday = 7); only the four values that occur in the
// CLDR first-day-of-week data are defined.
mod weekday {
pub const MON: usize = 1;
pub const FRI: usize = 5;
pub const SAT: usize = 6;
pub const SUN: usize = 7;
}
// This data originated from CLDR 41.
// Maps a region code to the weekday on which the week starts in that region.
// NOTE(review): entries appear to be in sorted key order — presumably required
// by `ZeroTrieSimpleAscii::from_sorted_str_tuples` below; confirm before editing.
static DATA: &[(&str, usize)] = &[
("001", weekday::MON),
("AD", weekday::MON),
("AE", weekday::SAT),
("AF", weekday::SAT),
("AG", weekday::SUN),
("AI", weekday::MON),
("AL", weekday::MON),
("AM", weekday::MON),
("AN", weekday::MON),
("AR", weekday::MON),
("AS", weekday::SUN),
("AT", weekday::MON),
("AU", weekday::MON),
("AX", weekday::MON),
("AZ", weekday::MON),
("BA", weekday::MON),
("BD", weekday::SUN),
("BE", weekday::MON),
("BG", weekday::MON),
("BH", weekday::SAT),
("BM", weekday::MON),
("BN", weekday::MON),
("BR", weekday::SUN),
("BS", weekday::SUN),
("BT", weekday::SUN),
("BW", weekday::SUN),
("BY", weekday::MON),
("BZ", weekday::SUN),
("CA", weekday::SUN),
("CH", weekday::MON),
("CL", weekday::MON),
("CM", weekday::MON),
("CN", weekday::SUN),
("CO", weekday::SUN),
("CR", weekday::MON),
("CY", weekday::MON),
("CZ", weekday::MON),
("DE", weekday::MON),
("DJ", weekday::SAT),
("DK", weekday::MON),
("DM", weekday::SUN),
("DO", weekday::SUN),
("DZ", weekday::SAT),
("EC", weekday::MON),
("EE", weekday::MON),
("EG", weekday::SAT),
("ES", weekday::MON),
("ET", weekday::SUN),
("FI", weekday::MON),
("FJ", weekday::MON),
("FO", weekday::MON),
("FR", weekday::MON),
("GB", weekday::MON),
("GB-alt-variant", weekday::SUN),
("GE", weekday::MON),
("GF", weekday::MON),
("GP", weekday::MON),
("GR", weekday::MON),
("GT", weekday::SUN),
("GU", weekday::SUN),
("HK", weekday::SUN),
("HN", weekday::SUN),
("HR", weekday::MON),
("HU", weekday::MON),
("ID", weekday::SUN),
("IE", weekday::MON),
("IL", weekday::SUN),
("IN", weekday::SUN),
("IQ", weekday::SAT),
("IR", weekday::SAT),
("IS", weekday::MON),
("IT", weekday::MON),
("JM", weekday::SUN),
("JO", weekday::SAT),
("JP", weekday::SUN),
("KE", weekday::SUN),
("KG", weekday::MON),
("KH", weekday::SUN),
("KR", weekday::SUN),
("KW", weekday::SAT),
("KZ", weekday::MON),
("LA", weekday::SUN),
("LB", weekday::MON),
("LI", weekday::MON),
("LK", weekday::MON),
("LT", weekday::MON),
("LU", weekday::MON),
("LV", weekday::MON),
("LY", weekday::SAT),
("MC", weekday::MON),
("MD", weekday::MON),
("ME", weekday::MON),
("MH", weekday::SUN),
("MK", weekday::MON),
("MM", weekday::SUN),
("MN", weekday::MON),
("MO", weekday::SUN),
("MQ", weekday::MON),
("MT", weekday::SUN),
("MV", weekday::FRI),
("MX", weekday::SUN),
("MY", weekday::MON),
("MZ", weekday::SUN),
("NI", weekday::SUN),
("NL", weekday::MON),
("NO", weekday::MON),
("NP", weekday::SUN),
("NZ", weekday::MON),
("OM", weekday::SAT),
("PA", weekday::SUN),
("PE", weekday::SUN),
("PH", weekday::SUN),
("PK", weekday::SUN),
("PL", weekday::MON),
("PR", weekday::SUN),
("PT", weekday::SUN),
("PY", weekday::SUN),
("QA", weekday::SAT),
("RE", weekday::MON),
("RO", weekday::MON),
("RS", weekday::MON),
("RU", weekday::MON),
("SA", weekday::SUN),
("SD", weekday::SAT),
("SE", weekday::MON),
("SG", weekday::SUN),
("SI", weekday::MON),
("SK", weekday::MON),
("SM", weekday::MON),
("SV", weekday::SUN),
("SY", weekday::SAT),
("TH", weekday::SUN),
("TJ", weekday::MON),
("TM", weekday::MON),
("TR", weekday::MON),
("TT", weekday::SUN),
("TW", weekday::SUN),
("UA", weekday::MON),
("UM", weekday::SUN),
("US", weekday::SUN),
("UY", weekday::MON),
("UZ", weekday::MON),
("VA", weekday::MON),
("VE", weekday::SUN),
("VI", weekday::SUN),
("VN", weekday::MON),
("WS", weekday::SUN),
("XK", weekday::MON),
("YE", weekday::SUN),
("ZA", weekday::SUN),
("ZW", weekday::SUN),
];
// SimpleAscii trie built at compile time directly from the DATA tuples.
static TRIE: ZeroTrieSimpleAscii<[u8; 539]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(DATA);
// Pre-baked PerfectHash trie bytes for the same DATA; there is no const
// constructor for this variant, so the bytes are checked in verbatim.
// Regenerate via the commented-out code in `main()` if DATA changes.
static TRIE_PHF: ZeroTriePerfectHash<[u8; 567]> = ZeroTriePerfectHash::from_store([
225, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 15, 0,
81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 79, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 74, 48, 76,
78, 77, 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
14, 41, 59, 74, 86, 88, 90, 92, 98, 100, 142, 181, 208, 226, 241, 253, 31, 43, 67, 85, 94, 97,
121, 136, 178, 65, 134, 196, 69, 79, 83, 85, 1, 2, 3, 129, 129, 129, 129, 201, 65, 68, 69, 71,
73, 75, 77, 86, 89, 1, 2, 3, 4, 5, 6, 7, 8, 135, 134, 129, 135, 129, 129, 129, 135, 134, 198,
72, 74, 77, 82, 84, 87, 1, 2, 3, 4, 5, 135, 129, 129, 129, 135, 135, 197, 65, 77, 83, 89, 90,
1, 2, 3, 4, 129, 135, 135, 129, 129, 196, 65, 69, 73, 78, 1, 2, 3, 129, 135, 135, 129, 83, 135,
75, 129, 69, 135, 194, 65, 87, 1, 135, 135, 77, 134, 206, 68, 69, 70, 71, 73, 76, 77, 78, 82,
83, 84, 85, 88, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 129, 134, 134, 135, 129, 129,
129, 129, 129, 135, 129, 129, 129, 129, 205, 65, 68, 69, 71, 72, 77, 78, 82, 83, 84, 87, 89,
90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 129, 135, 129, 129, 134, 129, 129, 135, 135, 135,
135, 129, 135, 201, 65, 72, 76, 77, 78, 79, 82, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 135, 129, 129,
129, 135, 135, 129, 129, 129, 198, 69, 74, 75, 77, 79, 90, 1, 2, 3, 4, 5, 129, 134, 129, 135,
135, 134, 197, 67, 69, 71, 83, 84, 1, 2, 3, 4, 129, 129, 134, 129, 135, 196, 73, 74, 79, 82, 1,
2, 3, 129, 129, 129, 129, 199, 66, 69, 70, 80, 82, 84, 85, 14, 15, 16, 17, 18, 19, 129, 45, 97,
108, 116, 45, 118, 97, 114, 105, 97, 110, 116, 135, 129, 129, 129, 129, 135, 135, 196, 75, 78,
82, 85, 1, 2, 3, 135, 135, 129, 129, 200, 68, 69, 76, 78, 81, 82, 83, 84, 1, 2, 3, 4, 5, 6, 7,
135, 129, 135, 135, 134, 134, 129, 129, 198, 69, 71, 72, 82, 87, 90, 1, 2, 3, 4, 5, 135, 129,
135, 135, 134, 129, 195, 77, 79, 80, 1, 2, 135, 134, 135, 48, 49, 129, 200, 65, 66, 73, 75, 84,
85, 86, 89, 1, 2, 3, 4, 5, 6, 7, 135, 129, 129, 129, 129, 129, 129, 134, 197, 73, 76, 79, 80,
90, 1, 2, 3, 4, 135, 129, 129, 135, 129, 206, 67, 68, 69, 72, 75, 77, 78, 79, 81, 84, 86, 88,
89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 129, 129, 129, 135, 129, 135, 129, 135, 129,
135, 133, 135, 129, 135, 200, 65, 69, 72, 75, 76, 82, 84, 89, 1, 2, 3, 4, 5, 6, 7, 135, 135,
135, 135, 129, 135, 135, 135,
]);
/// Identity function that is opaque to the optimizer, so the trie lookup in
/// `main` is actually performed at runtime instead of being const-folded.
///
/// Delegates to the stable [`std::hint::black_box`] (stable since Rust 1.66;
/// this crate's Cargo.toml requires 1.82), replacing the older hand-rolled
/// `read_volatile`/`forget` trick and eliminating the `unsafe` block.
fn black_box<T>(dummy: T) -> T {
    std::hint::black_box(dummy)
}
fn main() {
// Un-comment to re-generate the bytes (printed to the terminal)
// let trie_phf = DATA.iter().copied().collect::<ZeroTriePerfectHash<Vec<_>>>();
// assert_eq!(trie_phf.as_bytes(), TRIE_PHF.as_bytes());
// Spot-check: "MV" (Maldives) is the only FRI entry in the DATA table above.
assert_eq!(black_box(TRIE_PHF).get(b"MV"), Some(weekday::FRI));
}

View File

@@ -0,0 +1,29 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
/// Scratch metadata describing one branch node while a trie is being built.
#[derive(Debug, Clone, Copy)]
pub(crate) struct BranchMeta {
    /// The lead byte for this branch. Formerly it was required to be an ASCII byte, but now
    /// it can be any byte.
    pub ascii: u8,
    /// The size in bytes of the trie data reachable from this branch.
    pub local_length: usize,
    /// The size in bytes of this and all later sibling branches.
    pub cumulative_length: usize,
    /// The number of later sibling branches, including this.
    pub count: usize,
}

impl BranchMeta {
    /// Returns a [`BranchMeta`] with every field zeroed.
    ///
    /// An inherent `const fn` is used here (rather than the `Default` trait)
    /// so it can be called in const contexts.
    pub const fn default() -> Self {
        BranchMeta {
            ascii: 0,
            local_length: 0,
            cumulative_length: 0,
            count: 0,
        }
    }
}

128
vendor/zerotrie/src/builder/bytestr.rs vendored Normal file
View File

@@ -0,0 +1,128 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::borrow::Borrow;
#[cfg(feature = "serde")]
use alloc::boxed::Box;
/// A struct transparent over `[u8]` with convenient helper functions.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct ByteStr([u8]);

impl ByteStr {
    /// Reinterprets a slice of `(&[u8], usize)` pairs as `(&ByteStr, usize)` pairs.
    pub const fn from_byte_slice_with_value<'a, 'l>(
        input: &'l [(&'a [u8], usize)],
    ) -> &'l [(&'a ByteStr, usize)] {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }

    /// Reinterprets a slice of `(&str, usize)` pairs as `(&ByteStr, usize)` pairs.
    pub const fn from_str_slice_with_value<'a, 'l>(
        input: &'l [(&'a str, usize)],
    ) -> &'l [(&'a ByteStr, usize)] {
        // Safety: str and ByteStr have the same layout, and ByteStr is less restrictive
        unsafe { core::mem::transmute(input) }
    }

    /// Wraps a borrowed byte slice.
    pub fn from_bytes(input: &[u8]) -> &Self {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }

    /// Wraps an owned, boxed byte slice.
    #[cfg(feature = "serde")]
    pub fn from_boxed_bytes(input: Box<[u8]>) -> Box<Self> {
        // Safety: [u8] and ByteStr have the same layout and invariants
        unsafe { core::mem::transmute(input) }
    }

    /// Wraps a borrowed UTF-8 string as raw bytes.
    #[allow(dead_code)] // may want this in the future
    pub fn from_str(input: &str) -> &Self {
        Self::from_bytes(input.as_bytes())
    }

    /// The zero-length byte string.
    #[allow(dead_code)] // may want this in the future
    pub fn empty() -> &'static Self {
        Self::from_bytes(&[])
    }

    /// Borrows the underlying byte slice.
    #[allow(dead_code)] // not used in all features
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Length in bytes.
    pub const fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns whether every byte falls in the ASCII range.
    #[allow(dead_code)] // not used in all features
    pub fn is_all_ascii(&self) -> bool {
        self.0.iter().all(u8::is_ascii)
    }

    /// Checked byte access: `None` when `index` is out of bounds.
    #[allow(dead_code)] // may want this in the future
    pub(crate) fn byte_at(&self, index: usize) -> Option<u8> {
        self.0.get(index).copied()
    }

    /// Returns the byte at the given index, panicking if out of bounds.
    #[allow(clippy::indexing_slicing)] // "panic" is in method name
    pub(crate) const fn byte_at_or_panic(&self, index: usize) -> u8 {
        self.0[index]
    }

    /// Const function to evaluate `self < other` (lexicographic byte order).
    #[allow(clippy::indexing_slicing)] // in-range loop conditions
    pub(crate) const fn is_less_then(&self, other: &Self) -> bool {
        // Walk the shared prefix; the first differing byte decides the order.
        let limit = if self.len() < other.len() {
            self.len()
        } else {
            other.len()
        };
        let mut idx = 0;
        while idx < limit {
            if self.0[idx] != other.0[idx] {
                return self.0[idx] < other.0[idx];
            }
            idx += 1;
        }
        // Equal up to `limit`: the shorter string sorts first.
        self.len() < other.len()
    }

    /// Const function to evaluate `self[..prefix_len] == other[..prefix_len]`
    ///
    /// # Panics
    ///
    /// Panics if `prefix_len` is longer than either this string or the other string
    #[allow(clippy::indexing_slicing)] // in-range loop conditions
    pub(crate) const fn prefix_eq(&self, other: &ByteStr, prefix_len: usize) -> bool {
        assert!(prefix_len <= self.len());
        assert!(prefix_len <= other.len());
        let mut pos = 0;
        while pos < prefix_len {
            if self.0[pos] != other.0[pos] {
                return false;
            }
            pos += 1;
        }
        true
    }
}
// Lets a borrowed ByteStr stand in wherever a `&[u8]` borrow is expected
// (e.g. keyed collection lookups).
impl Borrow<[u8]> for ByteStr {
fn borrow(&self) -> &[u8] {
self.as_bytes()
}
}
// Same borrow for the boxed (owned) form; compiled only with the `alloc` feature.
#[cfg(feature = "alloc")]
impl Borrow<[u8]> for alloc::boxed::Box<ByteStr> {
fn borrow(&self) -> &[u8] {
self.as_bytes()
}
}

View File

@@ -0,0 +1,338 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::super::branch_meta::BranchMeta;
use super::super::bytestr::ByteStr;
use super::store::const_for_each;
use super::store::ConstArrayBuilder;
use super::store::ConstLengthsStack;
use super::store::ConstSlice;
use crate::error::ZeroTrieBuildError;
use crate::varint;
/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
///
/// All methods that grow the trie will panic if the capacity N is not enough.
pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
// Backing fixed-capacity buffer; the prepend_* methods below grow the trie
// data from the front.
data: ConstArrayBuilder<N, u8>,
}
impl<const N: usize> ZeroTrieBuilderConst<N> {
    /// Non-const function that returns the current trie data as a slice.
    #[cfg(feature = "litemap")]
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_const_slice().as_slice()
    }
    /// Returns the trie data, panicking if the buffer is the wrong size.
    pub const fn build_or_panic(self) -> [u8; N] {
        self.data.const_build_or_panic()
    }
    /// Creates a new empty builder.
    pub const fn new() -> Self {
        Self {
            // Cursor starts at N because all content is prepended from the back.
            data: ConstArrayBuilder::new_empty([0; N], N),
        }
    }
    /// Prepends an ASCII node to the front of the builder. Returns the new builder
    /// and the delta in length, which is always 1.
    ///
    /// # Panics
    ///
    /// Panics if `ascii` is not an ASCII byte, or (as documented on the type) if
    /// the buffer is full.
    #[must_use]
    const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
        if ascii >= 128 {
            panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
        }
        let data = self.data.const_push_front_or_panic(ascii);
        (Self { data }, 1)
    }
    /// Prepends a value node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    #[must_use]
    const fn prepend_value(self, value: usize) -> (Self, usize) {
        let mut data = self.data;
        let varint_array = varint::write_varint_meta3(value);
        // Can panic (as documented in class docs):
        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now.
        // 0b10000000 tags the lead byte as a value node.
        data = data.const_bitor_assign_or_panic(0, 0b10000000);
        (Self { data }, varint_array.len())
    }
    /// Prepends a branch node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    #[must_use]
    const fn prepend_branch(self, value: usize) -> (Self, usize) {
        let mut data = self.data;
        let varint_array = varint::write_varint_meta2(value);
        // Can panic (as documented in type-level docs):
        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now.
        // 0b11000000 tags the lead byte as a branch node.
        data = data.const_bitor_assign_or_panic(0, 0b11000000);
        (Self { data }, varint_array.len())
    }
    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
    /// and the delta in length, which is the length of the slice.
    #[must_use]
    const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
        let mut data = self.data;
        // Push back-to-front so the slice's byte order is preserved.
        let mut i = s.len();
        while i > 0 {
            // Can panic (as documented in type-level docs):
            data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
            i -= 1;
        }
        (Self { data }, s.len())
    }
    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
    #[must_use]
    const fn prepend_n_zeros(self, n: usize) -> Self {
        let mut data = self.data;
        let mut i = 0;
        while i < n {
            // Can panic (as documented in type-level docs):
            data = data.const_push_front_or_panic(0);
            i += 1;
        }
        Self { data }
    }
    /// Performs the operation `self[index] |= bits`
    const fn bitor_assign_at_or_panic(self, index: usize, bits: u8) -> Self {
        let mut data = self.data;
        data = data.const_bitor_assign_or_panic(index, bits);
        Self { data }
    }
    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
    ///
    /// # Panics
    ///
    /// Panics if the items are not sorted
    pub const fn from_tuple_slice<'a, const K: usize>(
        items: &[(&'a ByteStr, usize)],
    ) -> Result<Self, ZeroTrieBuildError> {
        let items = ConstSlice::from_slice(items);
        let mut prev: Option<&'a ByteStr> = None;
        // Verify sort order by comparing each adjacent pair of keys.
        const_for_each!(items, (ascii_str, _), {
            match prev {
                None => (),
                Some(prev) => {
                    if !prev.is_less_then(ascii_str) {
                        panic!("Strings in ByteStr constructor are not sorted");
                    }
                }
            };
            prev = Some(ascii_str)
        });
        Self::from_sorted_const_tuple_slice::<K>(items)
    }
    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
    pub const fn from_sorted_const_tuple_slice<const K: usize>(
        items: ConstSlice<(&ByteStr, usize)>,
    ) -> Result<Self, ZeroTrieBuildError> {
        let mut result = Self::new();
        let total_size;
        (result, total_size) = result.create_or_panic::<K>(items);
        debug_assert!(total_size == result.data.len());
        Ok(result)
    }
    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
    #[must_use]
    const fn create_or_panic<const K: usize>(
        mut self,
        all_items: ConstSlice<(&ByteStr, usize)>,
    ) -> (Self, usize) {
        let mut prefix_len = match all_items.last() {
            Some(x) => x.0.len(),
            // Empty slice:
            None => return (Self::new(), 0),
        };
        // Initialize the main loop to point at the last string.
        let mut lengths_stack = ConstLengthsStack::<K>::new();
        let mut i = all_items.len() - 1;
        let mut j = all_items.len();
        let mut current_len = 0;
        // Start the main loop.
        loop {
            let item_i = all_items.get_or_panic(i);
            let item_j = all_items.get_or_panic(j - 1);
            debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
            // Check if we need to add a value node here.
            if item_i.0.len() == prefix_len {
                let len;
                (self, len) = self.prepend_value(item_i.1);
                current_len += len;
            }
            if prefix_len == 0 {
                // All done! Leave the main loop.
                break;
            }
            // Reduce the prefix length by 1 and recalculate i and j.
            prefix_len -= 1;
            let mut new_i = i;
            let mut new_j = j;
            let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
            let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
            debug_assert!(ascii_i == ascii_j);
            let key_ascii = ascii_i;
            // Scan backwards: widen the window to every earlier string that shares
            // the shortened prefix, tracking the earliest branch byte in `ascii_i`.
            loop {
                if new_i == 0 {
                    break;
                }
                let candidate = all_items.get_or_panic(new_i - 1).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if item_i.0.prefix_eq(candidate, prefix_len) {
                    new_i -= 1;
                } else {
                    break;
                }
                if candidate.len() == prefix_len {
                    // A string that equals the prefix does not take part in the branch node.
                    break;
                }
                let candidate = candidate.byte_at_or_panic(prefix_len);
                if candidate != ascii_i {
                    ascii_i = candidate;
                }
            }
            // Scan forwards: widen the window to every later string that shares
            // the shortened prefix, tracking the latest branch byte in `ascii_j`.
            loop {
                if new_j == all_items.len() {
                    break;
                }
                let candidate = all_items.get_or_panic(new_j).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if item_j.0.prefix_eq(candidate, prefix_len) {
                    new_j += 1;
                } else {
                    break;
                }
                if candidate.len() == prefix_len {
                    panic!("A shorter string should be earlier in the sequence");
                }
                let candidate = candidate.byte_at_or_panic(prefix_len);
                if candidate != ascii_j {
                    ascii_j = candidate;
                }
            }
            // If there are no different bytes at this prefix level, we can add an ASCII or Span
            // node and then continue to the next iteration of the main loop.
            if ascii_i == key_ascii && ascii_j == key_ascii {
                let len;
                (self, len) = self.prepend_ascii(ascii_i);
                current_len += len;
                debug_assert!(i == new_i || i == new_i + 1);
                i = new_i;
                debug_assert!(j == new_j);
                continue;
            }
            // If i and j changed, we are a target of a branch node.
            if ascii_j == key_ascii {
                // We are the _last_ target of a branch node.
                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: current_len,
                    local_length: current_len,
                    count: 1,
                });
            } else {
                // We are _not the last_ target of a branch node.
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: cumulative_length + current_len,
                    local_length: current_len,
                    count: count + 1,
                });
            }
            if ascii_i != key_ascii {
                // We are _not the first_ target of a branch node.
                // Set the cursor to the previous string and continue the loop.
                j = i;
                i -= 1;
                prefix_len = all_items.get_or_panic(i).0.len();
                current_len = 0;
                continue;
            }
            // Branch (first): all targets of this branch have been built, so we can
            // now assemble the branch metadata in front of them.
            let (total_length, total_count) = {
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                (cumulative_length, count)
            };
            let branch_metas;
            (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
            let original_keys = branch_metas.map_to_ascii_bytes();
            // Write out the offset table
            current_len = total_length;
            const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
            // `w + 1` is the byte width of each entry in the offset table; it must
            // fit in the 2-bit width field of the branch head node.
            let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
            if w > 3 {
                panic!("ZeroTrie capacity exceeded");
            }
            // Prepend one row of offset bytes per iteration of `k`.
            let mut k = 0;
            while k <= w {
                self = self.prepend_n_zeros(total_count - 1);
                current_len += total_count - 1;
                let mut l = 0;
                let mut length_to_write = 0;
                while l < total_count {
                    let BranchMeta { local_length, .. } = *branch_metas
                        .as_const_slice()
                        .get_or_panic(total_count - l - 1);
                    // Extract byte `k` (the k-th least significant) of the running offset.
                    let mut adjusted_length = length_to_write;
                    let mut m = 0;
                    while m < k {
                        adjusted_length >>= 8;
                        m += 1;
                    }
                    // The first target's offset is always 0 and is not stored.
                    if l > 0 {
                        self = self.bitor_assign_at_or_panic(l - 1, adjusted_length as u8);
                    }
                    l += 1;
                    length_to_write += local_length;
                }
                k += 1;
            }
            // Write out the lookup table
            assert!(0 < total_count && total_count <= 256);
            // Bits 8-9 carry `w`; the low 8 bits carry the count (256 wraps to 0).
            let branch_value = (w << 8) + (total_count & 0xff);
            let slice_len;
            (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
            let branch_len;
            (self, branch_len) = self.prepend_branch(branch_value);
            current_len += slice_len + branch_len;
            i = new_i;
            j = new_j;
        }
        assert!(lengths_stack.is_empty());
        (self, current_len)
    }
}

View File

@@ -0,0 +1,9 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod builder;
mod store;
pub(crate) use builder::*;
pub(crate) use store::ConstArrayBuilder;

View File

@@ -0,0 +1,352 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains internal collections for the const builder.
use super::super::branch_meta::BranchMeta;
/// A const-friendly slice type. It is backed by a full slice but is primarily intended
/// to represent subslices of the full slice. We need this only because we can't take
/// subslices in const Rust.
///
/// NOTE(review): constructors do not assert it, but the expected invariant is
/// `start <= limit <= full_slice.len()`.
#[derive(Debug, Copy, Clone)]
pub(crate) struct ConstSlice<'a, T> {
    /// The full slice.
    full_slice: &'a [T],
    /// The start index of the slice represented by this [`ConstSlice`].
    start: usize,
    /// The non-inclusive end index of the slice represented by this [`ConstSlice`].
    limit: usize,
}
impl<'a, T> ConstSlice<'a, T> {
    /// Creates a [`ConstSlice`] representing an entire slice.
    pub const fn from_slice(other: &'a [T]) -> Self {
        ConstSlice {
            full_slice: other,
            start: 0,
            limit: other.len(),
        }
    }
    /// Creates a [`ConstSlice`] with the given start and limit.
    ///
    /// NOTE(review): bounds are not validated here; callers are expected to pass
    /// `start <= limit <= full_slice.len()`.
    pub const fn from_manual_slice(full_slice: &'a [T], start: usize, limit: usize) -> Self {
        ConstSlice {
            full_slice,
            start,
            limit,
        }
    }
    /// Returns the length of the [`ConstSlice`].
    pub const fn len(&self) -> usize {
        self.limit - self.start
    }
    /// Gets the element at `index`, panicking if not present.
    ///
    /// # Panics
    ///
    /// Panics if `index + start` is out of bounds of the full slice.
    pub const fn get_or_panic(&self, index: usize) -> &T {
        #[allow(clippy::indexing_slicing)] // documented
        &self.full_slice[index + self.start]
    }
    /// Gets the first element or `None` if empty.
    #[cfg(test)]
    pub const fn first(&self) -> Option<&T> {
        if self.len() == 0 {
            None
        } else {
            // Won't panic: we already handled an empty slice
            Some(self.get_or_panic(0))
        }
    }
    /// Gets the last element or `None` if empty.
    pub const fn last(&self) -> Option<&T> {
        if self.len() == 0 {
            None
        } else {
            // Won't panic: we already handled an empty slice
            Some(self.get_or_panic(self.len() - 1))
        }
    }
    /// Gets a subslice of this slice.
    ///
    /// # Panics
    ///
    /// Panics if `new_start > new_limit` or `new_limit > self.len()`.
    #[cfg(test)]
    pub const fn get_subslice_or_panic(
        &self,
        new_start: usize,
        new_limit: usize,
    ) -> ConstSlice<'a, T> {
        assert!(new_start <= new_limit);
        assert!(new_limit <= self.len());
        ConstSlice {
            full_slice: self.full_slice,
            start: self.start + new_start,
            limit: self.start + new_limit,
        }
    }
    /// Non-const function that returns this [`ConstSlice`] as a regular slice.
    #[cfg(any(test, feature = "alloc"))]
    #[allow(clippy::indexing_slicing)] // indices in range by struct invariant
    pub fn as_slice(&self) -> &'a [T] {
        &self.full_slice[self.start..self.limit]
    }
}
/// Conversion from a plain slice; delegates to [`ConstSlice::from_slice`].
impl<'a, T> From<&'a [T]> for ConstSlice<'a, T> {
    fn from(other: &'a [T]) -> Self {
        Self::from_slice(other)
    }
}
/// A const-friendly mutable data structure backed by an array.
#[derive(Debug, Copy, Clone)]
pub(crate) struct ConstArrayBuilder<const N: usize, T> {
    /// Backing storage; only `[start, limit)` holds meaningful content.
    full_array: [T; N],
    /// Index of the first initialized element.
    start: usize,
    /// Non-inclusive index of the last initialized element.
    limit: usize,
}
impl<const N: usize, T: Default> Default for ConstArrayBuilder<N, T> {
    /// Creates an empty builder (cursor at index 0) backed by
    /// default-initialized storage.
    fn default() -> Self {
        Self::new_empty(core::array::from_fn(|_| T::default()), 0)
    }
}
impl<const N: usize, T> ConstArrayBuilder<N, T> {
    /// Creates a new, empty builder of the given size. `cursor` indicates where in the
    /// array new elements will be inserted first. Since we use a lot of prepend operations,
    /// it is common to set `cursor` to `N`.
    ///
    /// # Panics
    ///
    /// Panics if `cursor > N`.
    pub const fn new_empty(full_array: [T; N], cursor: usize) -> Self {
        assert!(cursor <= N);
        Self {
            full_array,
            // start == limit means zero initialized elements.
            start: cursor,
            limit: cursor,
        }
    }
    /// Creates a new builder with some initial content in `[start, limit)`.
    ///
    /// # Panics
    ///
    /// Panics if `start > limit` or `limit > N`.
    pub const fn from_manual_slice(full_array: [T; N], start: usize, limit: usize) -> Self {
        assert!(start <= limit);
        assert!(limit <= N);
        Self {
            full_array,
            start,
            limit,
        }
    }
    /// Returns the number of initialized elements in the builder.
    pub const fn len(&self) -> usize {
        self.limit - self.start
    }
    /// Whether there are no initialized elements in the builder.
    #[allow(dead_code)]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }
    /// Returns the initialized elements as a [`ConstSlice`].
    pub const fn as_const_slice(&self) -> ConstSlice<'_, T> {
        ConstSlice::from_manual_slice(&self.full_array, self.start, self.limit)
    }
    /// Non-const function that returns a slice of the initialized elements.
    #[cfg(any(test, feature = "alloc"))]
    pub fn as_slice(&self) -> &[T] {
        &self.full_array[self.start..self.limit]
    }
}
// Certain functions that involve dropping `T` require that it be `Copy`
impl<const N: usize, T: Copy> ConstArrayBuilder<N, T> {
    /// Takes a fully initialized builder as an array. Panics if the builder is not
    /// fully initialized.
    pub const fn const_build_or_panic(self) -> [T; N] {
        if self.start != 0 || self.limit != N {
            // Build a const-compatible panic message containing the needed size;
            // `format!` is not available in const contexts.
            let actual_len = self.limit - self.start;
            const PREFIX: &[u8; 31] = b"Buffer too large. Size needed: ";
            let len_bytes: [u8; PREFIX.len() + crate::helpers::MAX_USIZE_LEN_AS_DIGITS] =
                crate::helpers::const_fmt_int(*PREFIX, actual_len);
            let Ok(len_str) = core::str::from_utf8(&len_bytes) else {
                unreachable!()
            };
            panic!("{}", len_str);
        }
        self.full_array
    }
    /// Prepends an element to the front of the builder, panicking if there is no room.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn const_push_front_or_panic(mut self, value: T) -> Self {
        if self.start == 0 {
            panic!("Buffer too small");
        }
        self.start -= 1;
        self.full_array[self.start] = value;
        self
    }
    /// Prepends multiple elements to the front of the builder, panicking if there is no room.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn const_extend_front_or_panic(mut self, other: ConstSlice<T>) -> Self {
        if self.start < other.len() {
            panic!("Buffer too small");
        }
        self.start -= other.len();
        // Copy `other` into the newly reserved region, preserving its order.
        let mut i = self.start;
        const_for_each!(other, byte, {
            self.full_array[i] = *byte;
            i += 1;
        });
        self
    }
}
impl<const N: usize> ConstArrayBuilder<N, u8> {
    /// Specialized function that performs `self[index] |= bits`
    ///
    /// # Panics
    ///
    /// Panics if `start + index` is out of bounds of the backing array.
    #[allow(clippy::indexing_slicing)] // documented
    pub(crate) const fn const_bitor_assign_or_panic(mut self, index: usize, bits: u8) -> Self {
        self.full_array[self.start + index] |= bits;
        self
    }
}
// NOTE(review): this duplicates the `T: Copy` bound of an earlier impl block and
// could be merged into it.
impl<const N: usize, T: Copy> ConstArrayBuilder<N, T> {
    /// Swaps the elements at positions `i` and `j`.
    ///
    /// # Panics
    ///
    /// Panics if either adjusted index is out of bounds of the backing array.
    #[cfg(feature = "alloc")]
    pub fn swap_or_panic(mut self, i: usize, j: usize) -> Self {
        self.full_array.swap(self.start + i, self.start + j);
        self
    }
}
/// Evaluates a block over each element of a const slice. Takes three arguments:
///
/// 1. Expression that resolves to the [`ConstSlice`].
/// 2. Token that will be assigned the value of the element.
/// 3. Block to evaluate for each element.
///
/// Note: the item binding receives a reference (`&T`) to each element.
macro_rules! const_for_each {
    ($safe_const_slice:expr, $item:tt, $inner:expr) => {{
        let mut i = 0;
        while i < $safe_const_slice.len() {
            // Won't panic: in-range loop condition
            let $item = $safe_const_slice.get_or_panic(i);
            $inner;
            i += 1;
        }
    }};
}
pub(crate) use const_for_each;
/// A data structure that holds up to K [`BranchMeta`] items.
///
/// Note: It should be possible to store the required data in the builder buffer itself,
/// which would eliminate the need for this helper struct and the limit it imposes.
pub(crate) struct ConstLengthsStack<const K: usize> {
    /// Stack storage; entries `[0, idx)` are `Some`.
    data: [Option<BranchMeta>; K],
    /// Number of occupied entries (also the index for the next push).
    idx: usize,
}
impl<const K: usize> core::fmt::Debug for ConstLengthsStack<K> {
    /// Debug-formats only the occupied entries of the stack.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let occupied = self.as_slice();
        core::fmt::Debug::fmt(occupied, f)
    }
}
impl<const K: usize> ConstLengthsStack<K> {
    /// Creates a new empty [`ConstLengthsStack`].
    pub const fn new() -> Self {
        Self {
            data: [None; K],
            idx: 0,
        }
    }
    /// Returns whether the stack is empty.
    pub const fn is_empty(&self) -> bool {
        self.idx == 0
    }
    /// Adds a [`BranchMeta`] to the stack, panicking if there is no room.
    ///
    /// NOTE(review): `stringify!(K)` renders the literal letter "K", not the
    /// numeric capacity, so the message always reads "(max K)".
    #[must_use]
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn push_or_panic(mut self, meta: BranchMeta) -> Self {
        if self.idx >= K {
            panic!(concat!(
                "AsciiTrie Builder: Need more stack (max ",
                stringify!(K),
                ")"
            ));
        }
        self.data[self.idx] = Some(meta);
        self.idx += 1;
        self
    }
    /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if
    /// the stack is empty.
    pub const fn peek_or_panic(&self) -> BranchMeta {
        if self.idx == 0 {
            panic!("AsciiTrie Builder: Attempted to peek from an empty stack");
        }
        self.get_or_panic(0)
    }
    /// Returns a copy of the [`BranchMeta`] at the specified index, where index 0
    /// is the top of the stack.
    #[allow(clippy::indexing_slicing)] // documented
    const fn get_or_panic(&self, index: usize) -> BranchMeta {
        if self.idx <= index {
            panic!("AsciiTrie Builder: Attempted to get too deep in a stack");
        }
        match self.data[self.idx - index - 1] {
            Some(x) => x,
            None => unreachable!(),
        }
    }
    /// Removes many [`BranchMeta`]s from the stack, returning them in a
    /// [`ConstArrayBuilder`] in insertion (bottom-to-top) order.
    #[allow(clippy::indexing_slicing)] // documented
    pub const fn pop_many_or_panic(
        mut self,
        len: usize,
    ) -> (Self, ConstArrayBuilder<256, BranchMeta>) {
        debug_assert!(len <= 256);
        let mut result = ConstArrayBuilder::new_empty([BranchMeta::default(); 256], 256);
        let mut ix = 0;
        loop {
            if ix == len {
                break;
            }
            // Pop from the top while prepending, so `result` ends up bottom-to-top.
            let i = self.idx - ix - 1;
            result = result.const_push_front_or_panic(match self.data[i] {
                Some(x) => x,
                None => panic!("Not enough items in the ConstLengthsStack"),
            });
            ix += 1;
        }
        self.idx -= len;
        (self, result)
    }
    /// Non-const function that returns the initialized elements as a slice.
    fn as_slice(&self) -> &[Option<BranchMeta>] {
        &self.data[0..self.idx]
    }
}
impl<const K: usize> ConstArrayBuilder<K, BranchMeta> {
    /// Converts this builder-array of [`BranchMeta`] to one of the `ascii` fields.
    ///
    /// Note: each byte is prepended while iterating front-to-back, so the result
    /// is in reverse order relative to `self`.
    pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder<K, u8> {
        let mut result = ConstArrayBuilder::new_empty([0; K], K);
        let self_as_slice = self.as_const_slice();
        const_for_each!(self_as_slice, value, {
            result = result.const_push_front_or_panic(value.ascii);
        });
        result
    }
}

51
vendor/zerotrie/src/builder/litemap.rs vendored Normal file
View File

@@ -0,0 +1,51 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Impls for functions gated on the "litemap" feature.
use super::konst::*;
use crate::builder::bytestr::ByteStr;
use crate::error::ZeroTrieBuildError;
use crate::zerotrie::ZeroTrieSimpleAscii;
use crate::ZeroTrie;
use alloc::borrow::Borrow;
use alloc::vec::Vec;
use litemap::LiteMap;
impl ZeroTrieSimpleAscii<Vec<u8>> {
    /// Builds a trie from a pre-sorted [`LiteMap`] using the const builder with a
    /// fixed 10000-byte scratch buffer and a lengths-stack depth of 100.
    #[doc(hidden)]
    pub fn try_from_litemap_with_const_builder<'a, S>(
        items: &LiteMap<&'a [u8], usize, S>,
    ) -> Result<Self, ZeroTrieBuildError>
    where
        S: litemap::store::StoreSlice<&'a [u8], usize, Slice = [(&'a [u8], usize)]>,
    {
        let tuples = items.as_slice();
        // Wrap the `&[u8]` keys as `ByteStr` keys for the builder.
        let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
        ZeroTrieBuilderConst::<10000>::from_sorted_const_tuple_slice::<100>(byte_str_slice.into())
            .map(|s| Self {
                store: s.as_bytes().to_vec(),
            })
    }
}
impl<K, S> TryFrom<&LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
where
    // Borrow, not AsRef, because we rely on Ord being the same. Unfortunately
    // this means `LiteMap<&str, usize>` does not work.
    K: Borrow<[u8]>,
    S: litemap::store::StoreSlice<K, usize, Slice = [(K, usize)]>,
{
    type Error = ZeroTrieBuildError;
    /// Builds a [`ZeroTrie`] from the (already sorted) key/value pairs of a [`LiteMap`].
    fn try_from(items: &LiteMap<K, usize, S>) -> Result<Self, ZeroTrieBuildError> {
        // Re-borrow the keys as `[u8]` so all key types share one code path.
        let byte_litemap = items.to_borrowed_keys::<[u8], Vec<_>>();
        let byte_slice = byte_litemap.as_slice();
        let byte_str_slice = ByteStr::from_byte_slice_with_value(byte_slice);
        Self::try_from_tuple_slice(byte_str_slice)
    }
}
// TODO(#7084): Make this more infallible by calculating the required length,
// heap-allocating the required capacity, and pointing ConstAsciiTrieBuilderStore
// to the heap buffer.

303
vendor/zerotrie/src/builder/mod.rs vendored Normal file
View File

@@ -0,0 +1,303 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! # ZeroTrie Builder
//!
//! There are two implementations of the ZeroTrie Builder:
//!
//! - [konst::ZeroTrieBuilderConst] allows for human-readable const construction
//! - [nonconst::ZeroTrieBuilder] has the full feature set but requires `alloc`
//!
//! The two builders follow the same algorithm but have different capabilities.
//!
//! ## Builder Algorithm Overview
//!
//! The tries are built backwards, from the last node to the first node. The key step of the
//! algorithm is **determining what is the next node to prepend.**
//!
//! In the simple case of [`ZeroTrieSimpleAscii`], all nodes are binary-search, so if the input
//! strings are provided in lexicographic order, there is a simple, deterministic method for
//! identifying the next node. This insight is what enables us to make the const builder.
//!
//! The builder works with the following intermediate state variables:
//!
//! - `prefix_len` indicates the byte index we are currently processing.
//! - `i` and `j` bracket a window of strings in the input that share the same prefix.
//! - `current_len` is the length in bytes of the current self-contained trie.
//! - `lengths_stack` contains metadata for branch nodes.
//!
//! Consider a trie containing the following strings and values:
//!
//! - "" → 11
//! - "ad" → 22
//! - "adef" → 33
//! - "adghk" → 44
//!
//! Suppose `prefix_len = 2`, `i = 1`, and `j = 4`. This would indicate that we
//! are evaluating the strings with the "ad" prefix, which extend from
//! index 1 (inclusive) to index 4 (exclusive).
//!
//! What follows is a verbal explanation of the build steps for the above trie.
//! When a node is prepended, it is shown in **boldface**.
//!
//! 1. Initialize the builder by setting `i=3`, `j=4`, `prefix_len=5` (the last string),
//! `current_len=0`, and `lengths_stack` empty. Start the main loop.
//! 2. Top of loop. The string at `i` is equal in length to `prefix_len`, so we prepend
//! our first node: a **value node 44**, which requires a 2-byte varint. Increase
//! `current_len` to 2.
//! 3. Reduce `prefix_len` to 4, read our `key_ascii="k"`, and recalculate `i` and `j`
//! _(this calculation is a long chunk of code in the builder impls)_. Since there is no
//! other string with the prefix "adgh", `i` and `j` stay the same, we prepend an
//! **ASCII node "k"**, increase `current_len` to 3, and continue the main loop.
//! 4. Top of loop. The string at `i` is of length 5, but `prefix_len` is 4, so there is
//! no value node to prepend.
//! 5. Reduce `prefix_len` to 3, read our `key_ascii="h"`, and recalculate `i` and `j`.
//! There are no other strings sharing the prefix "adg", so we prepend an
//! **ASCII node "h"**, increase `current_len` to 4, and continue the main loop.
//! 6. Top of loop. There is still no value node to prepend.
//! 7. Reduce `prefix_len` to 2, read our `key_ascii="g"`, and recalculate `i` and `j`.
//! We find that `i=1` and `j=4`, the range of strings sharing the prefix "ad". Since
//! `i` or `j` changed, proceed to evaluate the branch node.
//! 8. The last branch byte `ascii_j` for this prefix is "g", which is the same as `key_ascii`,
//! so we are the _last_ target of a branch node. Push an entry onto `lengths_stack`:
//! `BranchMeta { ascii: "g", cumulative_length: 4, local_length: 4, count: 1 }`.
//! 9. The first branch byte `ascii_i` for this prefix is "e", which is NOT equal to `key_ascii`,
//! so we are _not the first_ target of a branch node. We therefore start evaluating the
//! string preceding where we were at the top of the current loop. We set `i=2`, `j=3`,
//! `prefix_len=4` (length of the string at `i`), and continue the main loop.
//! 10. Top of loop. Since the string at `i` is equal in length to `prefix_len`, we prepend a
//! **value node 33** (which requires a 2-byte varint) and increase `current_len` to 2.
//! 11. Reduce `prefix_len` to 3, read our `key_ascii="f"`, and recalculate `i` and `j`.
//! They stay the same, so we prepend an **ASCII node "f"**, increase `current_len` to 3,
//! and continue the main loop.
//! 12. Top of loop. No value node this time.
//! 13. Reduce `prefix_len` to 2, read our `key_ascii="e"`, and recalculate `i` and `j`.
//! They go back to `i=1` and `j=4`.
//! 14. The last branch byte `ascii_j` for this prefix is "g", which is NOT equal to `key_ascii`,
//! so we are _not the last_ target of a branch node. We peek at the entry at the front of
//! the lengths stack and use it to push another entry onto the stack:
//! `BranchMeta { ascii: "e", cumulative_length: 7, local_length: 3, count: 2 }`
//! 15. The first branch byte `ascii_i` for this prefix is "e", which is the same as `key_ascii`,
//! so we are the _first_ target of a branch node. We can therefore proceed to prepend the
//! metadata for the branch node. We peek at the top of the stack and find that there are 2
//! tries reachable from this branch and they have a total byte length of 5. We then pull off
//! 2 entries from the stack into a local variable `branch_metas`. From here, we write out
//! the **offset table**, **lookup table**, and **branch head node**, which are determined
//! from the metadata entries. We set `current_len` to the length of the two tries plus the
//! metadata, which happens to be 11. Then we return to the top of the main loop.
//! 16. Top of loop. The string at `i` is length 2, which is the same as `prefix_len`, so we
//! prepend a **value node 22** (2-byte varint) and increase `current_len` to 13.
//! 17. Reduce `prefix_len` to 1, read our `key_ascii="d"`, and recalculate `i` and `j`.
//! They stay the same, so we prepend an **ASCII node "d"**, increase `current_len` to 14,
//! and continue the main loop.
//! 18. Top of loop. No value node this time.
//! 19. Reduce `prefix_len` to 0, read our `key_ascii="a"`, and recalculate `i` and `j`.
//! They change to `i=0` and `j=4`, since all strings have the empty string as a prefix.
//! However, `ascii_i` and `ascii_j` both equal `key_ascii`, so we prepend **ASCII node "a"**,
//! increase `current_len` to 15, and continue the main loop.
//! 16. Top of loop. The string at `i` is length 0, which is the same as `prefix_len`, so we
//! prepend a **value node 11** and increase `current_len` to 16.
//! 17. We can no longer reduce `prefix_len`, so our trie is complete.
//!
//! ## Perfect Hash Reordering
//!
//! When the PHF is added to the mix, the main change is that the strings are no longer in sorted
//! order when they are in the trie. To resolve this issue, when adding a branch node, the target
//! tries are rearranged in-place in the buffer to be in the correct order for the PHF.
//!
//! ## Example
//!
//! Here is the output of the trie described above.
//!
//! ```
//! use zerotrie::ZeroTrieSimpleAscii;
//!
//! const DATA: [(&str, usize); 4] =
//! [("", 11), ("ad", 22), ("adef", 33), ("adghk", 44)];
//!
//! // As demonstrated above, the required capacity for this trie is 16 bytes
//! const TRIE: ZeroTrieSimpleAscii<[u8; 16]> =
//! ZeroTrieSimpleAscii::from_sorted_str_tuples(&DATA);
//!
//! assert_eq!(
//! TRIE.as_bytes(),
//! &[
//! 0x8B, // value node 11
//! b'a', // ASCII node 'a'
//! b'd', // ASCII node 'd'
//! 0x90, // value node 22 lead byte
//! 0x06, // value node 22 trail byte
//! 0xC2, // branch node 2
//! b'e', // first target of branch
//! b'g', // second target of branch
//! 3, // offset
//! b'f', // ASCII node 'f'
//! 0x90, // value node 33 lead byte
//! 0x11, // value node 33 trail byte
//! b'h', // ASCII node 'h'
//! b'k', // ASCII node 'k'
//! 0x90, // value node 44 lead byte
//! 0x1C, // value node 44 trail byte
//! ]
//! );
//!
//! assert_eq!(TRIE.get(b""), Some(11));
//! assert_eq!(TRIE.get(b"ad"), Some(22));
//! assert_eq!(TRIE.get(b"adef"), Some(33));
//! assert_eq!(TRIE.get(b"adghk"), Some(44));
//! assert_eq!(TRIE.get(b"unknown"), None);
//! ```
mod branch_meta;
pub(crate) mod bytestr;
pub(crate) mod konst;
#[cfg(feature = "litemap")]
mod litemap;
#[cfg(feature = "alloc")]
pub(crate) mod nonconst;
use bytestr::ByteStr;
use super::ZeroTrieSimpleAscii;
impl<const N: usize> ZeroTrieSimpleAscii<[u8; N]> {
/// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values.
///
/// This function needs to know the exact length of the resulting trie at compile time. To
/// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting
/// compile error which will tell you how to set `N`, like this:
///
/// > the evaluated program panicked at 'Buffer too large. Size needed: 17'
///
/// That error message says you need to set `N` to 17.
///
/// Also see [`Self::from_sorted_str_tuples`].
///
/// # Panics
///
/// Panics if `items` is not sorted or if `N` is not correct.
///
/// # Examples
///
/// Create a `const` ZeroTrieSimpleAscii at compile time:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // The required capacity for this trie happens to be 17 bytes
/// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> =
/// ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
/// (b"bar", 2),
/// (b"bazzoo", 3),
/// (b"foo", 1),
/// ]);
///
/// assert_eq!(TRIE.get(b"foo"), Some(1));
/// assert_eq!(TRIE.get(b"bar"), Some(2));
/// assert_eq!(TRIE.get(b"bazzoo"), Some(3));
/// assert_eq!(TRIE.get(b"unknown"), None);
/// ```
///
/// Panics if strings are not sorted:
///
/// ```compile_fail
/// # use zerotrie::ZeroTrieSimpleAscii;
/// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
/// (b"foo", 1),
/// (b"bar", 2),
/// (b"bazzoo", 3),
/// ]);
/// ```
///
/// Panics if capacity is too small:
///
/// ```compile_fail
/// # use zerotrie::ZeroTrieSimpleAscii;
/// const TRIE: ZeroTrieSimpleAscii<[u8; 15]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
/// (b"bar", 2),
/// (b"bazzoo", 3),
/// (b"foo", 1),
/// ]);
/// ```
///
/// Panics if capacity is too large:
///
/// ```compile_fail
/// # use zerotrie::ZeroTrieSimpleAscii;
/// const TRIE: ZeroTrieSimpleAscii<[u8; 20]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[
/// (b"bar", 2),
/// (b"bazzoo", 3),
/// (b"foo", 1),
/// ]);
/// ```
pub const fn from_sorted_u8_tuples(tuples: &[(&[u8], usize)]) -> Self {
use konst::*;
let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
let result = ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice);
match result {
Ok(s) => Self::from_store(s.build_or_panic()),
Err(_) => panic!("Failed to build ZeroTrie"),
}
}
/// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values.
///
/// This function needs to know the exact length of the resulting trie at compile time. To
/// figure out `N`, first set `N` to be too large (say 0xFFFF), then look at the resulting
/// compile error which will tell you how to set `N`, like this:
///
/// > the evaluated program panicked at 'Buffer too large. Size needed: 17'
///
/// That error message says you need to set `N` to 17.
///
/// Also see [`Self::from_sorted_u8_tuples`].
///
/// # Panics
///
/// Panics if `items` is not sorted, if `N` is not correct, or if any of the strings contain
/// non-ASCII characters.
///
/// # Examples
///
/// Create a `const` ZeroTrieSimpleAscii at compile time:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // The required capacity for this trie happens to be 17 bytes
/// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> =
/// ZeroTrieSimpleAscii::from_sorted_str_tuples(&[
/// ("bar", 2),
/// ("bazzoo", 3),
/// ("foo", 1),
/// ]);
///
/// assert_eq!(TRIE.get(b"foo"), Some(1));
/// assert_eq!(TRIE.get(b"bar"), Some(2));
/// assert_eq!(TRIE.get(b"bazzoo"), Some(3));
/// assert_eq!(TRIE.get(b"unknown"), None);
/// ```
///
/// Panics if the strings are not ASCII:
///
/// ```compile_fail
/// # use zerotrie::ZeroTrieSimpleAscii;
/// const TRIE: ZeroTrieSimpleAscii<[u8; 100]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[
/// ("bár", 2),
/// ("båzzöo", 3),
/// ("foo", 1),
/// ]);
/// ```
pub const fn from_sorted_str_tuples(tuples: &[(&str, usize)]) -> Self {
    use konst::*;
    // Convert the `&str` keys into `ByteStr`s for the const builder.
    let byte_str_slice = ByteStr::from_str_slice_with_value(tuples);
    // 100 is the value of `K`, the size of the lengths stack. If compile
    // errors are encountered, this number may need to be increased.
    match ZeroTrieBuilderConst::<N>::from_tuple_slice::<100>(byte_str_slice) {
        Ok(builder) => Self::from_store(builder.build_or_panic()),
        Err(_) => panic!("Failed to build ZeroTrie"),
    }
}
}

View File

@@ -0,0 +1,420 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use core::cmp::Ordering;
use super::super::branch_meta::BranchMeta;
use super::store::NonConstLengthsStack;
use super::store::TrieBuilderStore;
use crate::builder::bytestr::ByteStr;
use crate::byte_phf::PerfectByteHashMapCacheOwned;
use crate::error::ZeroTrieBuildError;
use crate::options::*;
use crate::varint;
use alloc::borrow::Cow;
use alloc::vec::Vec;
/// A low-level builder for ZeroTrie. Supports all options.
pub(crate) struct ZeroTrieBuilder<S> {
    /// The trie bytes being built; nodes are prepended to the front, so the
    /// trie is constructed back-to-front.
    data: S,
    /// Cache of previously computed perfect hash maps, keyed by the byte set.
    phf_cache: PerfectByteHashMapCacheOwned,
    /// Options controlling ASCII handling, case sensitivity, PHF use, and capacity.
    options: ZeroTrieBuilderOptions,
}
impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
/// Returns the trie data as a `Vec<u8>`.
pub fn to_bytes(&self) -> Vec<u8> {
    // Materializes the (possibly non-contiguous) store into one vector.
    self.data.atbs_to_bytes()
}
/// Prepends a byte value to the front of the builder. If it is ASCII, an ASCII
/// node is prepended. If it is non-ASCII, if there is already a span node at
/// the front, we modify the span node to add the new byte; otherwise, we create
/// a new span node. Returns the delta in length, which is either 1 or 2.
fn prepend_ascii(&mut self, ascii: u8) -> Result<usize, ZeroTrieBuildError> {
    if ascii <= 127 {
        // ASCII byte: it is its own single-byte node.
        self.data.atbs_push_front(ascii);
        Ok(1)
    } else if matches!(self.options.ascii_mode, AsciiMode::BinarySpans) {
        if let Some(old_front) = self.data.atbs_pop_front() {
            // +1 accounts for the byte we just popped.
            let old_byte_len = self.data.atbs_len() + 1;
            // Lead bytes matching 0b101xxxxx are span nodes (see the
            // 0b10100001 tag written below when creating a new span).
            if old_front & 0b11100000 == 0b10100000 {
                // Extend an existing span
                // Unwrap OK: there is a varint at this location in the buffer
                #[expect(clippy::unwrap_used)]
                let old_span_size =
                    varint::try_read_varint_meta3_from_tstore(old_front, &mut self.data)
                        .unwrap();
                self.data.atbs_push_front(ascii);
                // Re-write the span length varint, incremented by one.
                let varint_array = varint::write_varint_meta3(old_span_size + 1);
                self.data.atbs_extend_front(varint_array.as_slice());
                self.data.atbs_bitor_assign(0, 0b10100000);
                let new_byte_len = self.data.atbs_len();
                // Delta is 1, or 2 if the length varint grew by a byte.
                return Ok(new_byte_len - old_byte_len);
            } else {
                // Not a span node: restore the popped byte untouched.
                self.data.atbs_push_front(old_front);
            }
        }
        // Create a new span
        self.data.atbs_push_front(ascii);
        self.data.atbs_push_front(0b10100001);
        Ok(2)
    } else {
        // Non-ASCII bytes are not representable without binary spans.
        Err(ZeroTrieBuildError::NonAsciiError)
    }
}
/// Prepends a value node to the front of the builder. Returns the
/// delta in length, which depends on the size of the varint.
#[must_use]
fn prepend_value(&mut self, value: usize) -> usize {
    let varint_array = varint::write_varint_meta3(value);
    self.data.atbs_extend_front(varint_array.as_slice());
    // Set the high bit of the lead byte to tag this varint as a value node.
    self.data.atbs_bitor_assign(0, 0b10000000);
    varint_array.len()
}
/// Prepends a branch node to the front of the builder. Returns the
/// delta in length, which depends on the size of the varint.
#[must_use]
fn prepend_branch(&mut self, value: usize) -> usize {
    let varint_array = varint::write_varint_meta2(value);
    self.data.atbs_extend_front(varint_array.as_slice());
    // Tag the lead byte with the 0b11xxxxxx branch-node discriminant.
    self.data.atbs_bitor_assign(0, 0b11000000);
    varint_array.len()
}
/// Prepends multiple arbitrary bytes to the front of the builder. Returns the
/// delta in length, which is the length of the slice.
///
/// Used (in `create`) to write branch lookup tables stored as raw bytes.
#[must_use]
fn prepend_slice(&mut self, s: &[u8]) -> usize {
    self.data.atbs_extend_front(s);
    s.len()
}
/// Builds a ZeroTrie from an iterator of bytes. It first collects and sorts the iterator.
pub fn from_bytes_iter<K: AsRef<[u8]>, I: IntoIterator<Item = (K, usize)>>(
iter: I,
options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
let items = Vec::<(K, usize)>::from_iter(iter);
let mut items = items
.iter()
.map(|(k, v)| (k.as_ref(), *v))
.collect::<Vec<(&[u8], usize)>>();
items.sort_by(|a, b| cmp_keys_values(options, *a, *b));
let ascii_str_slice = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice);
Self::from_sorted_tuple_slice_impl(byte_str_slice, options)
}
/// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted,
/// except for a case-insensitive trie where the items are re-sorted.
///
/// # Panics
///
/// May panic if the items are not sorted.
pub fn from_sorted_tuple_slice(
    items: &[(&ByteStr, usize)],
    options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
    // Cow: only clone the input slice if we actually need to re-sort it.
    let mut items = Cow::Borrowed(items);
    if matches!(options.case_sensitivity, CaseSensitivity::IgnoreCase) {
        // We need to re-sort the items with our custom comparator.
        items.to_mut().sort_by(|a, b| {
            cmp_keys_values(options, (a.0.as_bytes(), a.1), (b.0.as_bytes(), b.1))
        });
    }
    Self::from_sorted_tuple_slice_impl(&items, options)
}
/// Internal constructor that does not re-sort the items.
///
/// In debug builds, verifies that the items are strictly sorted according to
/// the options-aware comparator before running the builder algorithm.
fn from_sorted_tuple_slice_impl(
    items: &[(&ByteStr, usize)],
    options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
    #[allow(clippy::indexing_slicing)] // a debug assertion only
    for ab in items.windows(2) {
        debug_assert!(cmp_keys_values(
            options,
            (ab[0].0.as_bytes(), ab[0].1),
            (ab[1].0.as_bytes(), ab[1].1)
        )
        .is_lt());
    }
    let mut result = Self {
        data: S::atbs_new_empty(),
        phf_cache: PerfectByteHashMapCacheOwned::new_empty(),
        options,
    };
    // `create` returns the number of bytes it wrote; cross-check it against
    // the store length in debug builds.
    let total_size = result.create(items)?;
    debug_assert!(total_size == result.data.atbs_len());
    Ok(result)
}
/// The actual builder algorithm. For an explanation, see [`crate::builder`].
///
/// Walks the sorted items from the back to the front, shrinking the shared
/// prefix one byte at a time and prepending value, ASCII/span, and branch
/// nodes as needed. Returns the total number of bytes written.
#[expect(clippy::unwrap_used)] // lots of indexing, but all indexes should be in range
fn create(&mut self, all_items: &[(&ByteStr, usize)]) -> Result<usize, ZeroTrieBuildError> {
    let mut prefix_len = match all_items.last() {
        Some(x) => x.0.len(),
        // Empty slice:
        None => return Ok(0),
    };
    // Initialize the main loop to point at the last string.
    let mut lengths_stack = NonConstLengthsStack::new();
    // `i..j` is the current window of items sharing the current prefix.
    let mut i = all_items.len() - 1;
    let mut j = all_items.len();
    // Bytes written so far for the subtree currently under construction.
    let mut current_len = 0;
    // Start the main loop.
    loop {
        let item_i = all_items.get(i).unwrap();
        let item_j = all_items.get(j - 1).unwrap();
        debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
        // Check if we need to add a value node here.
        if item_i.0.len() == prefix_len {
            let len = self.prepend_value(item_i.1);
            current_len += len;
        }
        if prefix_len == 0 {
            // All done! Leave the main loop.
            break;
        }
        // Reduce the prefix length by 1 and recalculate i and j.
        prefix_len -= 1;
        let mut new_i = i;
        let mut new_j = j;
        let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
        let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
        debug_assert_eq!(ascii_i, ascii_j);
        let key_ascii = ascii_i;
        // Grow the window backward over items still sharing the prefix,
        // recording the earliest diverging byte in `ascii_i`.
        loop {
            if new_i == 0 {
                break;
            }
            let candidate = all_items.get(new_i - 1).unwrap().0;
            if candidate.len() < prefix_len {
                // Too short
                break;
            }
            if item_i.0.prefix_eq(candidate, prefix_len) {
                new_i -= 1;
            } else {
                break;
            }
            if candidate.len() == prefix_len {
                // A string that equals the prefix does not take part in the branch node.
                break;
            }
            let candidate = candidate.byte_at_or_panic(prefix_len);
            if candidate != ascii_i {
                ascii_i = candidate;
            }
        }
        // Grow the window forward, recording the latest diverging byte in `ascii_j`.
        loop {
            if new_j == all_items.len() {
                break;
            }
            let candidate = all_items.get(new_j).unwrap().0;
            if candidate.len() < prefix_len {
                // Too short
                break;
            }
            if item_j.0.prefix_eq(candidate, prefix_len) {
                new_j += 1;
            } else {
                break;
            }
            if candidate.len() == prefix_len {
                unreachable!("A shorter string should be earlier in the sequence");
            }
            let candidate = candidate.byte_at_or_panic(prefix_len);
            if candidate != ascii_j {
                ascii_j = candidate;
            }
        }
        // If there are no different bytes at this prefix level, we can add an ASCII or Span
        // node and then continue to the next iteration of the main loop.
        if ascii_i == key_ascii && ascii_j == key_ascii {
            let len = self.prepend_ascii(key_ascii)?;
            current_len += len;
            if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase)
                && i == new_i + 2
            {
                // This can happen if two strings were picked up, each with a different case
                return Err(ZeroTrieBuildError::MixedCase);
            }
            debug_assert!(
                i == new_i || i == new_i + 1,
                "only the exact prefix string can be picked up at this level: {key_ascii}"
            );
            i = new_i;
            debug_assert_eq!(j, new_j);
            continue;
        }
        // If i and j changed, we are a target of a branch node.
        if ascii_j == key_ascii {
            // We are the _last_ target of a branch node.
            lengths_stack.push(BranchMeta {
                ascii: key_ascii,
                cumulative_length: current_len,
                local_length: current_len,
                count: 1,
            });
        } else {
            // We are the _not the last_ target of a branch node.
            let BranchMeta {
                cumulative_length,
                count,
                ..
            } = lengths_stack.peek_or_panic();
            lengths_stack.push(BranchMeta {
                ascii: key_ascii,
                cumulative_length: cumulative_length + current_len,
                local_length: current_len,
                count: count + 1,
            });
        }
        if ascii_i != key_ascii {
            // We are _not the first_ target of a branch node.
            // Set the cursor to the previous string and continue the loop.
            j = i;
            i -= 1;
            prefix_len = all_items.get(i).unwrap().0.len();
            current_len = 0;
            continue;
        }
        // Branch (first)
        // std::println!("lengths_stack: {lengths_stack:?}");
        let (total_length, total_count) = {
            let BranchMeta {
                cumulative_length,
                count,
                ..
            } = lengths_stack.peek_or_panic();
            (cumulative_length, count)
        };
        let mut branch_metas = lengths_stack.pop_many_or_panic(total_count);
        let original_keys = branch_metas.map_to_ascii_bytes();
        if matches!(self.options.case_sensitivity, CaseSensitivity::IgnoreCase) {
            // Check to see if we have the same letter in two different cases
            let mut seen_ascii_alpha = [false; 26];
            for c in original_keys.as_const_slice().as_slice() {
                if c.is_ascii_alphabetic() {
                    let i = (c.to_ascii_lowercase() - b'a') as usize;
                    #[allow(clippy::indexing_slicing)] // 26 letters
                    if seen_ascii_alpha[i] {
                        return Err(ZeroTrieBuildError::MixedCase);
                    } else {
                        seen_ascii_alpha[i] = true;
                    }
                }
            }
        }
        let use_phf = matches!(self.options.phf_mode, PhfMode::UsePhf);
        let opt_phf_vec = if total_count > 15 && use_phf {
            let phf_vec = self
                .phf_cache
                .try_get_or_insert(original_keys.as_const_slice().as_slice().to_vec())?;
            // Put everything in order via bubble sort
            // Note: branch_metas is stored in reverse order (0 = last element)
            loop {
                let mut l = total_count - 1;
                let mut changes = 0;
                let mut start = 0;
                while l > 0 {
                    let a = *branch_metas.as_const_slice().get_or_panic(l);
                    let b = *branch_metas.as_const_slice().get_or_panic(l - 1);
                    let a_idx = phf_vec.keys().iter().position(|x| x == &a.ascii).unwrap();
                    let b_idx = phf_vec.keys().iter().position(|x| x == &b.ascii).unwrap();
                    if a_idx > b_idx {
                        // std::println!("{a:?} <=> {b:?} ({phf_vec:?})");
                        // This method call won't panic because the ranges are valid.
                        self.data.atbs_swap_ranges(
                            start,
                            start + a.local_length,
                            start + a.local_length + b.local_length,
                        );
                        branch_metas = branch_metas.swap_or_panic(l - 1, l);
                        start += b.local_length;
                        changes += 1;
                        // FIXME: fix the `length` field
                    } else {
                        start += a.local_length;
                    }
                    l -= 1;
                }
                // Repeat passes until a full pass makes no swaps.
                if changes == 0 {
                    break;
                }
            }
            Some(phf_vec)
        } else {
            None
        };
        // Write out the offset table
        current_len = total_length;
        const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
        // `w` is the index of the most significant nonzero byte of
        // `total_length`; offsets are written in `w + 1` byte-wide passes.
        let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
        if w > 3 && matches!(self.options.capacity_mode, CapacityMode::Normal) {
            return Err(ZeroTrieBuildError::CapacityExceeded);
        }
        let mut k = 0;
        while k <= w {
            self.data.atbs_prepend_n_zeros(total_count - 1);
            current_len += total_count - 1;
            let mut l = 0;
            let mut length_to_write = 0;
            while l < total_count {
                let BranchMeta { local_length, .. } = *branch_metas
                    .as_const_slice()
                    .get_or_panic(total_count - l - 1);
                // Shift the cumulative offset down to byte `k` for this pass.
                let mut adjusted_length = length_to_write;
                let mut m = 0;
                while m < k {
                    adjusted_length >>= 8;
                    m += 1;
                }
                if l > 0 {
                    self.data.atbs_bitor_assign(l - 1, adjusted_length as u8);
                }
                l += 1;
                length_to_write += local_length;
            }
            k += 1;
        }
        // Write out the lookup table
        assert!(0 < total_count && total_count <= 256);
        let branch_value = (w << 8) + (total_count & 0xff);
        if let Some(phf_vec) = opt_phf_vec {
            self.data.atbs_extend_front(phf_vec.as_bytes());
            let phf_len = phf_vec.as_bytes().len();
            let branch_len = self.prepend_branch(branch_value);
            current_len += phf_len + branch_len;
        } else {
            let search_len = self.prepend_slice(original_keys.as_slice());
            let branch_len = self.prepend_branch(branch_value);
            current_len += search_len + branch_len;
        }
        i = new_i;
        j = new_j;
    }
    assert!(lengths_stack.is_empty());
    Ok(current_len)
}
}
/// Compares two (key, value) tuples, honoring the configured case
/// sensitivity for the key comparison and breaking ties by value.
fn cmp_keys_values(
    options: ZeroTrieBuilderOptions,
    a: (&[u8], usize),
    b: (&[u8], usize),
) -> Ordering {
    let key_order = match options.case_sensitivity {
        // Plain lexicographic byte comparison.
        CaseSensitivity::Sensitive => a.0.cmp(b.0),
        // Case-insensitive: compare the ASCII-lowercased byte streams.
        _ => {
            let lowered_a = a.0.iter().map(u8::to_ascii_lowercase);
            let lowered_b = b.0.iter().map(u8::to_ascii_lowercase);
            lowered_a.cmp(lowered_b)
        }
    };
    key_order.then_with(|| a.1.cmp(&b.1))
}

View File

@@ -0,0 +1,9 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mod builder;
mod store;
pub(crate) use builder::*;
pub(crate) use store::TrieBuilderStore;

View File

@@ -0,0 +1,192 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! This module contains internal collections for the non-const builder.
use super::super::branch_meta::BranchMeta;
use super::super::konst::ConstArrayBuilder;
use alloc::collections::VecDeque;
use alloc::vec::Vec;
/// A trait applied to a data structure for building a ZeroTrie.
pub(crate) trait TrieBuilderStore {
    /// Create a new empty store.
    fn atbs_new_empty() -> Self;
    /// Return the length in bytes of the store.
    fn atbs_len(&self) -> usize;
    /// Push a byte to the front of the store.
    ///
    /// The builder constructs the trie back-to-front, so prepending is the
    /// primitive mutation.
    fn atbs_push_front(&mut self, byte: u8);
    /// Push multiple bytes to the front of the store.
    fn atbs_extend_front(&mut self, other: &[u8]);
    /// Read the store into a `Vec<u8>`.
    fn atbs_to_bytes(&self) -> Vec<u8>;
    /// Perform the operation `self[index] |= bits`
    fn atbs_bitor_assign(&mut self, index: usize, bits: u8);
    /// Swap the adjacent ranges `self[start..mid]` and `self[mid..limit]`.
    ///
    /// # Panics
    ///
    /// Panics if the specified ranges are invalid.
    fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize);
    /// Remove and return the first element in the store, or `None` if empty.
    fn atbs_pop_front(&mut self) -> Option<u8>;
    /// Prepend `n` zeros to the front of the store.
    ///
    /// Default implementation: `n` individual pushes; implementers may
    /// override with something more efficient.
    fn atbs_prepend_n_zeros(&mut self, n: usize) {
        let mut i = 0;
        while i < n {
            self.atbs_push_front(0);
            i += 1;
        }
    }
}
impl TrieBuilderStore for VecDeque<u8> {
    fn atbs_new_empty() -> Self {
        VecDeque::new()
    }
    fn atbs_len(&self) -> usize {
        self.len()
    }
    fn atbs_push_front(&mut self, byte: u8) {
        self.push_front(byte);
    }
    fn atbs_extend_front(&mut self, other: &[u8]) {
        // Push in reverse so `other`'s byte order is preserved at the front.
        self.reserve(other.len());
        for b in other.iter().rev() {
            self.push_front(*b);
        }
    }
    fn atbs_to_bytes(&self) -> Vec<u8> {
        // A VecDeque's ring buffer may be split in two; concatenate both halves.
        let mut v = Vec::with_capacity(self.len());
        let (a, b) = self.as_slices();
        v.extend(a);
        v.extend(b);
        v
    }
    fn atbs_bitor_assign(&mut self, index: usize, bits: u8) {
        self[index] |= bits;
    }
    /// # Panics
    /// Panics if the specified ranges are invalid.
    #[allow(clippy::panic)] // documented
    fn atbs_swap_ranges(&mut self, mut start: usize, mut mid: usize, mut limit: usize) {
        if start > mid || mid > limit {
            panic!("Invalid args to atbs_swap_ranges(): start > mid || mid > limit");
        }
        if limit > self.len() {
            panic!(
                "Invalid args to atbs_swap_ranges(): limit out of range: {limit} > {}",
                self.len()
            );
        }
        // The following algorithm is an in-place swap of two adjacent ranges of potentially
        // different lengths. Would make a good coding interview question.
        loop {
            if start == mid || mid == limit {
                return;
            }
            let len0 = mid - start;
            let len1 = limit - mid;
            // Swap min(len0, len1) elements between the front of the first
            // range and the back of the second, then shrink to the remainder.
            let mut i = start;
            let mut j = limit - core::cmp::min(len0, len1);
            while j < limit {
                self.swap(i, j);
                i += 1;
                j += 1;
            }
            if len0 < len1 {
                mid = start + len0;
                limit -= len0;
            } else {
                start += len1;
                mid = limit - len1;
            }
        }
    }
    fn atbs_pop_front(&mut self) -> Option<u8> {
        self.pop_front()
    }
}
/// A data structure that holds any number of [`BranchMeta`] items.
pub(crate) struct NonConstLengthsStack {
    // LIFO stack; the last element is the most recently pushed branch target.
    data: Vec<BranchMeta>,
}
impl core::fmt::Debug for NonConstLengthsStack {
    // Delegates to the Debug impl of the underlying BranchMeta slice.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        self.as_slice().fmt(f)
    }
}
impl NonConstLengthsStack {
    /// Creates a new empty [`NonConstLengthsStack`].
    pub const fn new() -> Self {
        Self { data: Vec::new() }
    }
    /// Returns whether the stack is empty.
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
    /// Adds a [`BranchMeta`] to the stack.
    pub fn push(&mut self, meta: BranchMeta) {
        self.data.push(meta);
    }
    /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if
    /// the stack is empty.
    #[allow(clippy::unwrap_used)] // "panic" is in the method name
    pub fn peek_or_panic(&self) -> BranchMeta {
        *self.data.last().unwrap()
    }
    /// Removes many [`BranchMeta`]s from the stack, returning them in a [`ConstArrayBuilder`].
    ///
    /// The returned builder holds the popped items in their original push
    /// order (deepest item first), since each successive item from the top of
    /// the stack is pushed to the *front* of the result.
    pub fn pop_many_or_panic(&mut self, len: usize) -> ConstArrayBuilder<256, BranchMeta> {
        debug_assert!(len <= 256);
        let mut result = ConstArrayBuilder::new_empty([BranchMeta::default(); 256], 256);
        let mut ix = 0;
        loop {
            if ix == len {
                break;
            }
            // Walk downward from the top of the stack.
            let i = self.data.len() - ix - 1;
            // Won't panic because len <= 256
            result = result.const_push_front_or_panic(match self.data.get(i) {
                Some(x) => *x,
                None => unreachable!("Not enough items in the ConstLengthsStack"),
            });
            ix += 1;
        }
        self.data.truncate(self.data.len() - len);
        result
    }
    /// Non-const function that returns the initialized elements as a slice.
    fn as_slice(&self) -> &[BranchMeta] {
        &self.data
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    #[test]
    fn test_swap_ranges() {
        // Swaps the 5-byte range "abcde" (indices 2..7) with the adjacent
        // 7-byte range "fghijkl" (indices 7..14), leaving the surrounding
        // bytes untouched.
        let s = b"..abcdefghijkl=";
        let mut s = s.iter().copied().collect::<VecDeque<u8>>();
        s.atbs_swap_ranges(2, 7, 14);
        assert_eq!(s.atbs_to_bytes(), b"..fghijklabcde=");
    }
}

214
vendor/zerotrie/src/byte_phf/builder.rs vendored Normal file
View File

@@ -0,0 +1,214 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::*;
use crate::error::ZeroTrieBuildError;
use alloc::vec;
use alloc::vec::Vec;
/// To speed up the search algorithm, we limit the number of times the level-2 parameter (q)
/// can hit its max value (initially Q_FAST_MAX) before we try the next level-1 parameter (p).
/// In practice, this has a small impact on the resulting perfect hash, resulting in about
/// 1 in 10000 hash maps that fall back to the slow path.
// NOTE(review): tuning constant — raising it likely trades longer searches for fewer
// slow-path fallbacks; confirm with the distribution test before changing.
const MAX_L2_SEARCH_MISSES: usize = 24;
/// Directly compute the perfect hash function.
///
/// Returns `(p, [q_0, q_1, ..., q_(N-1)])`, or an error if the PHF could not be computed.
///
/// Strategy: for each candidate `p`, partition the bytes into buckets by `f1`,
/// then search per-bucket `q` values (largest buckets first) such that `f2`
/// assigns every byte a distinct slot, backtracking on collisions.
#[allow(unused_labels)] // for readability
#[allow(clippy::indexing_slicing)] // carefully reviewed to not panic
pub fn find(bytes: &[u8]) -> Result<(u8, Vec<u8>), ZeroTrieBuildError> {
    let n_usize = bytes.len();
    let mut p = 0u8;
    let mut qq = vec![0u8; n_usize];
    // Per-bucket `q` candidates for the current `p` (in sorted-bucket order).
    let mut bqs = vec![0u8; n_usize];
    // Which level-2 slots are already occupied.
    let mut seen = vec![false; n_usize];
    let max_allowable_p = P_FAST_MAX;
    let mut max_allowable_q = Q_FAST_MAX;
    #[allow(non_snake_case)]
    let N = if n_usize > 0 && n_usize < 256 {
        n_usize as u8
    } else {
        // 0 or 256 keys: the all-zero parameters trivially suffice.
        debug_assert!(n_usize == 0 || n_usize == 256);
        return Ok((p, qq));
    };
    'p_loop: loop {
        // Vec of tuples: (index, bucket count)
        let mut buckets: Vec<(usize, Vec<u8>)> = (0..n_usize).map(|i| (i, vec![])).collect();
        for byte in bytes {
            let l1 = f1(*byte, p, N) as usize;
            buckets[l1].1.push(*byte);
        }
        // Process the fullest buckets first: they are the hardest to place.
        buckets.sort_by_key(|(_, v)| -(v.len() as isize));
        // println!("New P: p={p:?}, buckets={buckets:?}");
        let mut i = 0;
        let mut num_max_q = 0;
        bqs.fill(0);
        seen.fill(false);
        'q_loop: loop {
            // Loop condition: exit when i is beyond the buckets length
            if i == buckets.len() {
                // Success: translate bucket-order q values back to key order.
                for (local_j, real_j) in buckets.iter().map(|(j, _)| *j).enumerate() {
                    debug_assert!(local_j < n_usize); // comes from .enumerate()
                    debug_assert!(real_j < n_usize); // first item of bucket tuple is an index
                    qq[real_j] = bqs[local_j];
                }
                // println!("Success: p={p:?}, num_max_q={num_max_q:?}, bqs={bqs:?}, qq={qq:?}");
                // if num_max_q > 0 {
                //     println!("num_max_q={num_max_q:?}");
                // }
                return Ok((p, qq));
            }
            let mut bucket = buckets[i].1.as_slice();
            'byte_loop: for (j, byte) in bucket.iter().enumerate() {
                let l2 = f2(*byte, bqs[i], N) as usize;
                if seen[l2] {
                    // println!("Skipping Q: p={p:?}, i={i:?}, byte={byte:}, q={i:?}, l2={:?}", f2(*byte, bqs[i], N));
                    // Collision: un-mark the slots taken by this bucket's
                    // earlier bytes before retrying with a new q.
                    for k_byte in &bucket[0..j] {
                        let l2 = f2(*k_byte, bqs[i], N) as usize;
                        assert!(seen[l2]);
                        seen[l2] = false;
                    }
                    'reset_loop: loop {
                        if bqs[i] < max_allowable_q {
                            bqs[i] += 1;
                            continue 'q_loop;
                        }
                        num_max_q += 1;
                        bqs[i] = 0;
                        if i == 0 || num_max_q > MAX_L2_SEARCH_MISSES {
                            if p == max_allowable_p && max_allowable_q != Q_REAL_MAX {
                                // println!("Could not solve fast function: trying again: {bytes:?}");
                                // Fast search failed: retry from p = 0 with the
                                // slow (larger) q range enabled.
                                max_allowable_q = Q_REAL_MAX;
                                p = 0;
                                continue 'p_loop;
                            } else if p == max_allowable_p {
                                // If a fallback algorithm for `p` is added, relax this assertion
                                // and re-run the loop with a higher `max_allowable_p`.
                                debug_assert_eq!(max_allowable_p, P_REAL_MAX);
                                // println!("Could not solve PHF function");
                                return Err(ZeroTrieBuildError::CouldNotSolvePerfectHash);
                            } else {
                                p += 1;
                                continue 'p_loop;
                            }
                        }
                        // Backtrack to the previous bucket and advance its q.
                        i -= 1;
                        bucket = buckets[i].1.as_slice();
                        for byte in bucket {
                            let l2 = f2(*byte, bqs[i], N) as usize;
                            assert!(seen[l2]);
                            seen[l2] = false;
                        }
                    }
                } else {
                    // println!("Marking as seen: i={i:?}, byte={byte:}, l2={:?}", f2(*byte, bqs[i], N));
                    let l2 = f2(*byte, bqs[i], N) as usize;
                    seen[l2] = true;
                }
            }
            // println!("Found Q: i={i:?}, q={:?}", bqs[i]);
            i += 1;
        }
    }
}
impl PerfectByteHashMap<Vec<u8>> {
    /// Computes a new [`PerfectByteHashMap`].
    ///
    /// (this is a doc-hidden API)
    ///
    /// # Errors
    ///
    /// Returns an error if no perfect hash parameters could be found for `keys`.
    #[allow(clippy::indexing_slicing)] // carefully reviewed to not panic
    pub fn try_new(keys: &[u8]) -> Result<Self, ZeroTrieBuildError> {
        let n_usize = keys.len();
        let n = n_usize as u8;
        let (p, mut qq) = find(keys)?;
        // Place each key into the slot the hash function assigns to it.
        let mut keys_permuted = vec![0; n_usize];
        for key in keys {
            let l1 = f1(*key, p, n) as usize;
            let q = qq[l1];
            let l2 = f2(*key, q, n) as usize;
            keys_permuted[l2] = *key;
        }
        // Serialize in the standard layout: p, N q-values, N permuted keys.
        let mut result = Vec::with_capacity(n_usize * 2 + 1);
        result.push(p);
        result.append(&mut qq);
        result.append(&mut keys_permuted);
        Ok(Self(result))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    extern crate std;
    use std::print;
    use std::println;
    // Pretty-prints a byte as a quoted char when alphanumeric, else as hex.
    fn print_byte_to_stdout(byte: u8) {
        let c = char::from(byte);
        if c.is_ascii_alphanumeric() {
            print!("'{c}'");
        } else {
            print!("0x{byte:X}");
        }
    }
    // Returns `len` distinct ASCII alphanumerics in a seed-determined order.
    fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        let mut bytes: Vec<u8> =
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789".into();
        let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
        bytes.partial_shuffle(&mut rng, len).0.into()
    }
    #[test]
    fn test_random_distributions() {
        // Histogram which (p, q) parameter values `find` chooses across many
        // deterministic pseudo-random key sets; nearly all should be "fast".
        let mut p_distr = vec![0; 256];
        let mut q_distr = vec![0; 256];
        for len in 0..50 {
            for seed in 0..50 {
                let bytes = random_alphanums(seed, len);
                let (p, qq) = find(bytes.as_slice()).unwrap();
                p_distr[p as usize] += 1;
                for q in qq {
                    q_distr[q as usize] += 1;
                }
            }
        }
        println!("p_distr: {p_distr:?}");
        println!("q_distr: {q_distr:?}");
        let fast_p = p_distr[0..=P_FAST_MAX as usize].iter().sum::<usize>();
        let slow_p = p_distr[(P_FAST_MAX + 1) as usize..].iter().sum::<usize>();
        let fast_q = q_distr[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let slow_q = q_distr[(Q_FAST_MAX + 1) as usize..].iter().sum::<usize>();
        // Exact counts are stable because the RNG is seeded deterministically.
        assert_eq!(2500, fast_p);
        assert_eq!(0, slow_p);
        assert_eq!(61243, fast_q);
        assert_eq!(7, slow_q);
        let bytes = random_alphanums(0, 16);
        #[allow(non_snake_case)]
        let N = u8::try_from(bytes.len()).unwrap();
        let (p, qq) = find(bytes.as_slice()).unwrap();
        println!("Results:");
        for byte in bytes.iter() {
            print_byte_to_stdout(*byte);
            let l1 = f1(*byte, p, N) as usize;
            let q = qq[l1];
            let l2 = f2(*byte, q, N) as usize;
            println!(" => l1 {l1} => q {q} => l2 {l2}");
        }
    }
}

View File

@@ -0,0 +1,39 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use super::*;
use crate::error::ZeroTrieBuildError;
use alloc::collections::btree_map::Entry;
use alloc::collections::BTreeMap;
use alloc::vec::Vec;
/// Helper class for caching the results of multiple [`PerfectByteHashMap`] calculations.
pub struct PerfectByteHashMapCacheOwned {
    // Note: This should probably be a HashMap but that isn't in `alloc`
    // Maps each distinct key set to its computed perfect hash map.
    data: BTreeMap<Vec<u8>, PerfectByteHashMap<Vec<u8>>>,
}
impl PerfectByteHashMapCacheOwned {
    /// Creates a new empty instance.
    pub fn new_empty() -> Self {
        Self {
            data: BTreeMap::new(),
        }
    }
    /// Gets the [`PerfectByteHashMap`] for the given bytes, calculating it if necessary.
    ///
    /// # Errors
    ///
    /// Returns an error if a perfect hash could not be computed for `keys`.
    pub fn try_get_or_insert(
        &mut self,
        keys: Vec<u8>,
    ) -> Result<&PerfectByteHashMap<[u8]>, ZeroTrieBuildError> {
        // Entry API: look up once, and compute the PHF only on the first
        // request for this particular key set.
        let mut_phf = match self.data.entry(keys) {
            Entry::Vacant(entry) => {
                let value = PerfectByteHashMap::try_new(entry.key())?;
                entry.insert(value)
            }
            Entry::Occupied(entry) => entry.into_mut(),
        };
        Ok(mut_phf.as_borrowed())
    }
}

485
vendor/zerotrie/src/byte_phf/mod.rs vendored Normal file
View File

@@ -0,0 +1,485 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![allow(rustdoc::private_intra_doc_links)] // doc(hidden) module
//! # Byte Perfect Hash Function Internals
//!
//! This module contains a perfect hash function (PHF) designed for a fast, compact perfect
//! hash over 1 to 256 nodes (bytes).
//!
//! The PHF uses the following variables:
//!
//! 1. A single parameter `p`, which is 0 in about 98% of cases.
//! 2. A list of `N` parameters `q_t`, one per _bucket_
//! 3. The `N` keys in an arbitrary order determined by the PHF
//!
//! Reading a `key` from the PHF uses the following algorithm:
//!
//! 1. Let `t`, the bucket index, be `f1(key, p)`.
//! 2. Let `i`, the key index, be `f2(key, q_t)`.
//! 3. If `key == k_i`, return `Some(i)`; else return `None`.
//!
//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across
//! serialization versions of `ZeroTrie`. They are very fast, constant-time operations as long
//! as `p` <= [`P_FAST_MAX`] and `q` <= [`Q_FAST_MAX`]. In practice, nearly 100% of parameter
//! values are in the fast range.
//!
//! ```
//! use zerotrie::_internal::PerfectByteHashMap;
//!
//! let phf_example_bytes = [
//! // `p` parameter
//! 1, // `q` parameters, one for each of the N buckets
//! 0, 0, 1, 1, // Exact keys to be compared with the input
//! b'e', b'a', b'c', b'g',
//! ];
//!
//! let phf = PerfectByteHashMap::from_bytes(&phf_example_bytes);
//!
//! // The PHF returns the index of the key or `None` if not found.
//! assert_eq!(phf.get(b'a'), Some(1));
//! assert_eq!(phf.get(b'b'), None);
//! assert_eq!(phf.get(b'c'), Some(2));
//! assert_eq!(phf.get(b'd'), None);
//! assert_eq!(phf.get(b'e'), Some(0));
//! assert_eq!(phf.get(b'f'), None);
//! assert_eq!(phf.get(b'g'), Some(3));
//! ```
use crate::helpers::*;
#[cfg(feature = "alloc")]
mod builder;
#[cfg(feature = "alloc")]
mod cached_owned;
#[cfg(feature = "alloc")]
pub use cached_owned::PerfectByteHashMapCacheOwned;
/// The cutoff for the fast version of [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_FAST_MAX: u8 = 95;
/// The cutoff for the fast version of [`f2`].
const Q_FAST_MAX: u8 = 95;
/// The maximum allowable value of `p`. This could be raised if found to be necessary.
/// Values exceeding P_FAST_MAX could use a different `p` algorithm by modifying [`f1`].
#[cfg(feature = "alloc")] // used in the builder code
const P_REAL_MAX: u8 = P_FAST_MAX;
/// The maximum allowable value of `q`. This could be raised if found to be necessary.
#[cfg(feature = "alloc")] // used in the builder code
const Q_REAL_MAX: u8 = 127;
/// Calculates the function `f1` for the PHF. For the exact formula, please read the code.
///
/// When `p == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f1;
/// const N: u8 = 10;
///
/// // With p = 0:
/// assert_eq!(0, f1(0, 0, N));
/// assert_eq!(1, f1(1, 0, N));
/// assert_eq!(2, f1(2, 0, N));
/// assert_eq!(9, f1(9, 0, N));
/// assert_eq!(0, f1(10, 0, N));
/// assert_eq!(1, f1(11, 0, N));
/// assert_eq!(2, f1(12, 0, N));
/// assert_eq!(9, f1(19, 0, N));
///
/// // With p = 1:
/// assert_eq!(1, f1(0, 1, N));
/// assert_eq!(0, f1(1, 1, N));
/// assert_eq!(2, f1(2, 1, N));
/// assert_eq!(2, f1(9, 1, N));
/// assert_eq!(4, f1(10, 1, N));
/// assert_eq!(5, f1(11, 1, N));
/// assert_eq!(1, f1(12, 1, N));
/// assert_eq!(7, f1(19, 1, N));
/// ```
#[inline]
pub fn f1(byte: u8, p: u8, n: u8) -> u8 {
    match (n, p) {
        // Degenerate case: avoid a modulo by zero.
        (0, _) => byte,
        // Fast path: `p == 0` in the vast majority of maps.
        (_, 0) => byte % n,
        // `p` always uses the below constant-time operation. If needed, we
        // could add some other operation here with `p > P_FAST_MAX` to solve
        // difficult cases if the need arises.
        _ => (byte ^ p ^ byte.wrapping_shr(p as u32)) % n,
    }
}
/// Calculates the function `f2` for the PHF. For the exact formula, please read the code.
///
/// When `q == 0`, the operation is a simple modulus.
///
/// The argument `n` is used only for taking the modulus so that the return value is
/// in the range `[0, n)`.
///
/// # Examples
///
/// ```
/// use zerotrie::_internal::f2;
/// const N: u8 = 10;
///
/// // With q = 0:
/// assert_eq!(0, f2(0, 0, N));
/// assert_eq!(1, f2(1, 0, N));
/// assert_eq!(2, f2(2, 0, N));
/// assert_eq!(9, f2(9, 0, N));
/// assert_eq!(0, f2(10, 0, N));
/// assert_eq!(1, f2(11, 0, N));
/// assert_eq!(2, f2(12, 0, N));
/// assert_eq!(9, f2(19, 0, N));
///
/// // With q = 1:
/// assert_eq!(1, f2(0, 1, N));
/// assert_eq!(0, f2(1, 1, N));
/// assert_eq!(3, f2(2, 1, N));
/// assert_eq!(8, f2(9, 1, N));
/// assert_eq!(1, f2(10, 1, N));
/// assert_eq!(0, f2(11, 1, N));
/// assert_eq!(3, f2(12, 1, N));
/// assert_eq!(8, f2(19, 1, N));
/// ```
#[inline]
pub fn f2(byte: u8, q: u8, n: u8) -> u8 {
    if n == 0 {
        // Degenerate case: avoid a modulo by zero.
        return byte;
    }
    // Constant-time fast path: XOR with the bucket parameter.
    let mut result = byte ^ q;
    // Slow path for a few difficult cases: one extra mixing round for each
    // unit by which `q` exceeds `Q_FAST_MAX`.
    let mut rounds = q;
    while rounds > Q_FAST_MAX {
        result = result ^ (result << 1) ^ (result >> 1);
        rounds -= 1;
    }
    result % n
}
/// A constant-time map from bytes to unique indices.
///
/// Uses a perfect hash function (see module-level documentation). Does not support mutation.
///
/// Standard layout: P, N bytes of Q, N bytes of expected keys
///
/// Note: `repr(transparent)` over the store is what makes
/// [`PerfectByteHashMap::from_bytes`]'s transmute sound.
#[derive(Debug, PartialEq, Eq)]
#[repr(transparent)]
pub struct PerfectByteHashMap<Store: ?Sized>(Store);
impl<Store> PerfectByteHashMap<Store> {
    /// Creates an instance from a pre-existing store. See [`Self::as_bytes`].
    ///
    /// The store is taken as-is; no validation of the layout is performed.
    #[inline]
    pub fn from_store(store: Store) -> Self {
        Self(store)
    }
}
impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets the usize for the given byte, or `None` if it is not in the map.
    pub fn get(&self, key: u8) -> Option<usize> {
        // Layout: [p, q_0..q_{n-1}, key_0..key_{n-1}]; the leading byte is
        // the level-1 seed `p`.
        let (p, buffer) = self.0.as_ref().split_first()?;
        // Note: there are N buckets followed by N keys
        let n_usize = buffer.len() / 2;
        if n_usize == 0 {
            return None;
        }
        let n = n_usize as u8;
        let (qq, eks) = buffer.debug_split_at(n_usize);
        debug_assert_eq!(qq.len(), eks.len());
        // Level 1: pick the bucket, which holds the level-2 seed `q`.
        let l1 = f1(key, *p, n) as usize;
        let q = debug_unwrap!(qq.get(l1), return None);
        // Level 2: pick the slot, then confirm the expected key matches,
        // since bytes not in the map also hash to some slot.
        let l2 = f2(key, *q, n) as usize;
        let ek = debug_unwrap!(eks.get(l2), return None);
        if *ek == key {
            Some(l2)
        } else {
            None
        }
    }
    /// This is called `num_items` because `len` is ambiguous: it could refer
    /// to the number of items or the number of bytes.
    pub fn num_items(&self) -> usize {
        // Total byte length is 1 + 2N, so integer division by 2 yields N.
        self.0.as_ref().len() / 2
    }
    /// Get an iterator over the keys in the order in which they are stored in the map.
    pub fn keys(&self) -> &[u8] {
        let n = self.num_items();
        // Skip the `p` byte and the N `q` bytes; the tail is the key array.
        self.0.as_ref().debug_split_at(1 + n).1
    }
    /// Diagnostic function that returns `p` and the maximum value of `q`
    #[cfg(test)]
    pub fn p_qmax(&self) -> Option<(u8, u8)> {
        let (p, buffer) = self.0.as_ref().split_first()?;
        let n = buffer.len() / 2;
        if n == 0 {
            return None;
        }
        let (qq, _) = buffer.debug_split_at(n);
        Some((*p, *qq.iter().max().unwrap()))
    }
    /// Returns the map as bytes. The map can be recovered with [`Self::from_store`]
    /// or [`Self::from_bytes`].
    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_ref()
    }
    /// Test-only validation: every stored key must resolve to a unique index,
    /// and every byte not in the map must miss.
    #[cfg(all(feature = "alloc", test))]
    pub(crate) fn check(&self) -> Result<(), (&'static str, u8)> {
        use alloc::vec;
        let len = self.num_items();
        let mut seen = vec![false; len];
        // Exhaustively probe the full byte domain.
        for b in 0..=255u8 {
            let get_result = self.get(b);
            if self.keys().contains(&b) {
                let i = get_result.ok_or(("expected to find", b))?;
                if seen[i] {
                    return Err(("seen", b));
                }
                seen[i] = true;
            } else if get_result.is_some() {
                return Err(("did not expect to find", b));
            }
        }
        Ok(())
    }
}
impl PerfectByteHashMap<[u8]> {
    /// Creates an instance from pre-existing bytes. See [`Self::as_bytes`].
    ///
    /// The bytes are not validated; lookups over malformed data are
    /// garbage-in, garbage-out (debug assertions may fire).
    #[inline]
    pub fn from_bytes(bytes: &[u8]) -> &Self {
        // Safety: Self is repr(transparent) over [u8]
        unsafe { core::mem::transmute(bytes) }
    }
}
impl<Store> PerfectByteHashMap<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Converts from `PerfectByteHashMap<AsRef<[u8]>>` to `&PerfectByteHashMap<[u8]>`
    ///
    /// This borrows the underlying bytes, so it is cheap (no copy).
    #[inline]
    pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> {
        PerfectByteHashMap::from_bytes(self.0.as_ref())
    }
}
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;
    extern crate std;
    /// Returns `len` distinct ASCII alphanumeric bytes, chosen and ordered
    /// deterministically from `seed`.
    fn random_alphanums(seed: u64, len: usize) -> Vec<u8> {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        let mut bytes: Vec<u8> =
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789".into();
        let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed);
        bytes.partial_shuffle(&mut rng, len).0.into()
    }
    // Builds PHFs over many small random key sets, validates each exhaustively,
    // and tallies how often the fast (q <= Q_FAST_MAX) hash suffices.
    #[test]
    fn test_smaller() {
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 1..16 {
            for seed in 0..150 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (smaller): {count_by_p:?}");
        std::println!("count_by_qmax (smaller): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }
    // Same as test_smaller, but with larger key sets (and fewer seeds each).
    #[test]
    fn test_larger() {
        let mut count_by_p = [0; 256];
        let mut count_by_qmax = [0; 256];
        for len in 16..60 {
            for seed in 0..75 {
                let keys = random_alphanums(seed, len);
                let keys_str = core::str::from_utf8(&keys).unwrap();
                let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str);
                computed
                    .check()
                    .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str)));
                let (p, qmax) = computed.p_qmax().unwrap();
                count_by_p[p as usize] += 1;
                count_by_qmax[qmax as usize] += 1;
            }
        }
        std::println!("count_by_p (larger): {count_by_p:?}");
        std::println!("count_by_qmax (larger): {count_by_qmax:?}");
        let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::<usize>();
        let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..]
            .iter()
            .sum::<usize>();
        std::println!("fastq/slowq: {count_fastq}/{count_slowq}");
        // Assert that 99% of cases resolve to the fast hash
        assert!(count_fastq >= count_slowq * 100);
    }
    // A dense key set (all of printable ASCII plus two high bytes) that
    // requires large p and q seeds; pins the solver's output.
    #[test]
    fn test_hard_cases() {
        let keys = [
            0u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
            46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
            90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
            109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
            126, 195, 196,
        ];
        let computed = PerfectByteHashMap::try_new(&keys).unwrap();
        let (p, qmax) = computed.p_qmax().unwrap();
        assert_eq!(p, 69);
        assert_eq!(qmax, 67);
    }
    // Pins the exact serialized bytes and stored key order for small inputs.
    #[test]
    fn test_build_read_small() {
        #[derive(Debug)]
        struct TestCase<'a> {
            keys: &'a str,
            expected: &'a [u8],
            reordered_keys: &'a str,
        }
        let cases = [
            TestCase {
                keys: "ab",
                expected: &[0, 0, 0, b'b', b'a'],
                reordered_keys: "ba",
            },
            TestCase {
                keys: "abc",
                expected: &[0, 0, 0, 0, b'c', b'a', b'b'],
                reordered_keys: "cab",
            },
            TestCase {
                // Note: splitting "a" and "c" into different buckets requires the heavier hash
                // function because the difference between "a" and "c" is the period (2).
                keys: "ac",
                expected: &[1, 0, 1, b'c', b'a'],
                reordered_keys: "ca",
            },
            TestCase {
                keys: "aceg",
                expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'],
                reordered_keys: "eacg",
            },
            TestCase {
                keys: "abd",
                expected: &[0, 0, 1, 3, b'a', b'b', b'd'],
                reordered_keys: "abd",
            },
            TestCase {
                keys: "def",
                expected: &[0, 0, 0, 0, b'f', b'd', b'e'],
                reordered_keys: "fde",
            },
            TestCase {
                keys: "fi",
                expected: &[0, 0, 0, b'f', b'i'],
                reordered_keys: "fi",
            },
            TestCase {
                keys: "gh",
                expected: &[0, 0, 0, b'h', b'g'],
                reordered_keys: "hg",
            },
            TestCase {
                keys: "lm",
                expected: &[0, 0, 0, b'l', b'm'],
                reordered_keys: "lm",
            },
            TestCase {
                // Note: "a" and "q" (0x61 and 0x71) are very hard to split; only a handful of
                // hash function crates can get them into separate buckets.
                keys: "aq",
                expected: &[4, 0, 1, b'a', b'q'],
                reordered_keys: "aq",
            },
            TestCase {
                keys: "xy",
                expected: &[0, 0, 0, b'x', b'y'],
                reordered_keys: "xy",
            },
            TestCase {
                keys: "xyz",
                expected: &[0, 0, 0, 0, b'x', b'y', b'z'],
                reordered_keys: "xyz",
            },
            TestCase {
                keys: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 16,
                    16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                    2, 0, 7, 104, 105, 106, 107, 108, 109, 110, 111, 112, 117, 118, 119, 68, 69,
                    70, 113, 114, 65, 66, 67, 120, 121, 122, 115, 72, 73, 74, 71, 80, 81, 82, 83,
                    84, 85, 86, 87, 88, 89, 90, 75, 76, 77, 78, 79, 103, 97, 98, 99, 116, 100, 102,
                    101,
                ],
                reordered_keys: "hijklmnopuvwDEFqrABCxyzsHIJGPQRSTUVWXYZKLMNOgabctdfe",
            },
            TestCase {
                keys: "abcdefghij",
                expected: &[
                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 97, 98, 99,
                ],
                reordered_keys: "defghijabc",
            },
            TestCase {
                // This is a small case that resolves to the slow hasher
                keys: "Jbej",
                expected: &[2, 0, 0, 102, 0, b'j', b'e', b'b', b'J'],
                reordered_keys: "jebJ",
            },
            TestCase {
                // This is another small case that resolves to the slow hasher
                keys: "JFNv",
                expected: &[1, 98, 0, 2, 0, b'J', b'F', b'N', b'v'],
                reordered_keys: "JFNv",
            },
        ];
        for cas in cases {
            let computed = PerfectByteHashMap::try_new(cas.keys.as_bytes()).expect(cas.keys);
            assert_eq!(computed.as_bytes(), cas.expected, "{cas:?}");
            assert_eq!(computed.keys(), cas.reordered_keys.as_bytes(), "{cas:?}");
            computed.check().expect(cas.keys);
        }
    }
}

491
vendor/zerotrie/src/cursor.rs vendored Normal file
View File

@@ -0,0 +1,491 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Types for walking stepwise through a trie.
//!
//! For examples, see the `.cursor()` functions
//! and the `Cursor` types in this module.
use crate::reader;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrieSimpleAscii;
use core::fmt;
impl<Store> ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Get out the value for "abc"
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// Find the longest prefix match:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Find the longest prefix of the string "abcdxy":
    /// let query = b"abcdxy";
    /// let mut longest_prefix = 0;
    /// let mut cursor = trie.cursor();
    /// for (i, b) in query.iter().enumerate() {
    ///     // Checking is_empty() is not required, but it is
    ///     // good for efficiency
    ///     if cursor.is_empty() {
    ///         break;
    ///     }
    ///     if cursor.take_value().is_some() {
    ///         longest_prefix = i;
    ///     }
    ///     cursor.step(*b);
    /// }
    ///
    /// // The longest prefix is "abc" which is length 3:
    /// assert_eq!(longest_prefix, 3);
    /// ```
    #[inline]
    pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor<'_> {
        // The cursor holds a borrowed view of the trie bytes which the
        // reader helpers mutate as the cursor advances.
        ZeroTrieSimpleAsciiCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}
impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]> + ?Sized,
{
    /// Gets a cursor into the current trie.
    ///
    /// Useful to query a trie with data that is not a slice.
    ///
    /// This is currently supported only on [`ZeroTrieSimpleAscii`]
    /// and [`ZeroAsciiIgnoreCaseTrie`].
    ///
    /// # Examples
    ///
    /// Get a value out of a trie by [writing](fmt::Write) it to the cursor:
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" (case-insensitive!)
    /// let mut cursor = trie.cursor();
    /// write!(&mut cursor, "abc");
    /// assert_eq!(cursor.take_value(), Some(0));
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAscii::cursor`].
    #[inline]
    pub fn cursor(&self) -> ZeroAsciiIgnoreCaseTrieCursor<'_> {
        // The cursor holds a borrowed view of the trie bytes which the
        // reader helpers mutate as the cursor advances.
        ZeroAsciiIgnoreCaseTrieCursor {
            trie: self.as_borrowed_slice(),
        }
    }
}
impl<'a> ZeroTrieSimpleAscii<&'a [u8]> {
    /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    ///
    /// The returned cursor borrows for `'a`, the lifetime of the underlying bytes.
    #[inline]
    pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> {
        ZeroTrieSimpleAsciiCursor { trie: self }
    }
}
impl<'a> ZeroAsciiIgnoreCaseTrie<&'a [u8]> {
    /// Same as [`ZeroAsciiIgnoreCaseTrie::cursor()`] but moves self to avoid
    /// having to doubly anchor the trie to the stack.
    ///
    /// The returned cursor borrows for `'a`, the lifetime of the underlying bytes.
    #[inline]
    pub fn into_cursor(self) -> ZeroAsciiIgnoreCaseTrieCursor<'a> {
        ZeroAsciiIgnoreCaseTrieCursor { trie: self }
    }
}
/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup.
///
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroTrieSimpleAsciiCursor<'a> {
    // Borrowed view of the trie; `step`/`take_value`/`probe` mutate it
    // in place via the `reader` helpers.
    trie: ZeroTrieSimpleAscii<&'a [u8]>,
}
/// A cursor into a [`ZeroAsciiIgnoreCaseTrie`], useful for stepwise lookup.
///
/// For examples, see [`ZeroAsciiIgnoreCaseTrie::cursor()`].
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170>
#[derive(Debug, Clone)]
pub struct ZeroAsciiIgnoreCaseTrieCursor<'a> {
    // Borrowed view of the trie; `step`/`take_value`/`probe` mutate it
    // in place via the `reader` helpers.
    trie: ZeroAsciiIgnoreCaseTrie<&'a [u8]>,
}
/// Information about a probed edge.
///
/// Returned by the cursors' `probe(index)` methods.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive] // no need to destructure or construct this in userland
pub struct AsciiProbeResult {
    /// The character's byte value between this node and its parent.
    pub byte: u8,
    /// The number of siblings of this node, _including itself_.
    ///
    /// Useful as a loop bound when visiting all children via `probe(index)`.
    pub total_siblings: u8,
}
impl ZeroTrieSimpleAsciiCursor<'_> {
    /// Steps the cursor one character into the trie based on the character's byte value.
    ///
    /// # Examples
    ///
    /// Unrolled loop checking for string presence at every step:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// // Search the trie for the string "abcdxy"
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'c');
    /// assert_eq!(cursor.take_value(), Some(0)); // "abc"
    /// cursor.step(b'd');
    /// assert_eq!(cursor.take_value(), None); // "abcd"
    /// assert!(!cursor.is_empty());
    /// cursor.step(b'x'); // no strings have the prefix "abcdx"
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None); // "abcdx"
    /// cursor.step(b'y');
    /// assert_eq!(cursor.take_value(), None); // "abcdxy"
    /// ```
    ///
    /// If the byte is not ASCII, the cursor will become empty:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// assert_eq!(cursor.take_value(), None); // ""
    /// cursor.step(b'a');
    /// assert_eq!(cursor.take_value(), None); // "a"
    /// cursor.step(b'b');
    /// assert_eq!(cursor.take_value(), None); // "ab"
    /// cursor.step(b'\xFF');
    /// assert!(cursor.is_empty());
    /// assert_eq!(cursor.take_value(), None);
    /// ```
    #[inline]
    pub fn step(&mut self, byte: u8) {
        // Delegates to the shared reader, which mutates the borrowed store.
        // After a failed match the cursor is empty (see the doctests above).
        reader::step_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, byte);
    }
    /// Takes the value at the current position.
    ///
    /// Calling this function on a new cursor is equivalent to calling `.get()`
    /// with the empty string (except that it can only be called once).
    ///
    /// # Examples
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "" and "abc"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81");
    ///
    /// assert_eq!(Some(0), trie.get(""));
    /// let mut cursor = trie.cursor();
    /// assert_eq!(Some(0), cursor.take_value());
    /// assert_eq!(None, cursor.take_value());
    /// ```
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        // "Take" semantics: a second call at the same position returns None
        // (see the doctest above).
        reader::take_value(&mut self.trie.store)
    }
    /// Steps the cursor one character into the trie based on an edge index,
    /// returning the corresponding character as a byte.
    ///
    /// This function is similar to [`Self::step()`], but it takes an index instead of a char.
    /// This enables stepwise iteration over the contents of the trie.
    ///
    /// If there are multiple possibilities for the next byte, the `index` argument allows
    /// visiting them in order. Since this function steps the cursor, the cursor must be
    /// cloned (a cheap operation) in order to visit multiple children.
    ///
    /// # Examples
    ///
    /// Continually query index 0 to extract the first item from a trie:
    ///
    /// ```
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// let data: &[(String, usize)] = &[
    ///     ("ab".to_string(), 111),
    ///     ("abcxyz".to_string(), 22),
    ///     ("abde".to_string(), 333),
    ///     ("afg".to_string(), 44),
    /// ];
    ///
    /// let trie: ZeroTrieSimpleAscii<Vec<u8>> =
    ///     data.iter().map(|(s, v)| (s.as_str(), *v)).collect();
    ///
    /// let mut cursor = trie.cursor();
    /// let mut key = String::new();
    /// let value = loop {
    ///     if let Some(value) = cursor.take_value() {
    ///         break value;
    ///     }
    ///     let probe_result = cursor.probe(0).unwrap();
    ///     key.push(char::from(probe_result.byte));
    /// };
    ///
    /// assert_eq!(key, "ab");
    /// assert_eq!(value, 111);
    /// ```
    ///
    /// Stepwise iterate over all entries in the trie:
    ///
    /// ```
    /// # use zerotrie::ZeroTrieSimpleAscii;
    /// # let data: &[(String, usize)] = &[
    /// #     ("ab".to_string(), 111),
    /// #     ("abcxyz".to_string(), 22),
    /// #     ("abde".to_string(), 333),
    /// #     ("afg".to_string(), 44)
    /// # ];
    /// # let trie: ZeroTrieSimpleAscii<Vec<u8>> = data
    /// #     .iter()
    /// #     .map(|(s, v)| (s.as_str(), *v))
    /// #     .collect();
    /// // (trie built as in previous example)
    ///
    /// // Initialize the iteration at the first child of the trie.
    /// let mut stack = Vec::from([(trie.cursor(), 0, 0)]);
    /// let mut key = Vec::new();
    /// let mut results = Vec::new();
    /// loop {
    ///     let Some((mut cursor, index, suffix_len)) = stack.pop() else {
    ///         // Nothing left in the trie.
    ///         break;
    ///     };
    ///     // Check to see if there is a value at the current node.
    ///     if let Some(value) = cursor.take_value() {
    ///         results.push((String::from_utf8(key.clone()).unwrap(), value));
    ///     }
    ///     // Now check for children of the current node.
    ///     let mut sub_cursor = cursor.clone();
    ///     if let Some(probe_result) = sub_cursor.probe(index) {
    ///         // Found a child. Add the current byte edge to the key.
    ///         key.push(probe_result.byte);
    ///         // Add the child to the stack, and also add back the current
    ///         // node if there are more siblings to visit.
    ///         if index + 1 < probe_result.total_siblings as usize {
    ///             stack.push((cursor, index + 1, suffix_len));
    ///             stack.push((sub_cursor, 0, 1));
    ///         } else {
    ///             stack.push((sub_cursor, 0, suffix_len + 1));
    ///         }
    ///     } else {
    ///         // No more children. Pop this node's bytes from the key.
    ///         for _ in 0..suffix_len {
    ///             key.pop();
    ///         }
    ///     }
    /// }
    ///
    /// assert_eq!(&results, data);
    /// ```
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroTrieSimpleAscii<[u8]>>(&mut self.trie.store, index)
    }
    /// Checks whether the cursor points to an empty trie.
    ///
    /// Use this to determine when to stop iterating.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}
impl ZeroAsciiIgnoreCaseTrieCursor<'_> {
    /// Steps the cursor one byte into the trie.
    ///
    /// Returns the byte if matched, which may be a different case than the input byte.
    /// If this function returns `None`, any lookup loops can be terminated.
    ///
    /// # Examples
    ///
    /// Normalize the case of a value by stepping through an ignore-case trie:
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
    ///
    /// // A trie with two values: "aBc" and "aBcdEf"
    /// let trie = ZeroAsciiIgnoreCaseTrie::from_bytes(b"aBc\x80dEf\x81");
    ///
    /// // Get out the value for "abc" and normalize the key string
    /// let mut cursor = trie.cursor();
    /// let mut key_str = Cow::Borrowed("abc".as_bytes());
    /// let mut i = 0;
    /// let value = loop {
    ///     let Some(&input_byte) = key_str.get(i) else {
    ///         break cursor.take_value();
    ///     };
    ///     let Some(matched_byte) = cursor.step(input_byte) else {
    ///         break None;
    ///     };
    ///     if matched_byte != input_byte {
    ///         key_str.to_mut()[i] = matched_byte;
    ///     }
    ///     i += 1;
    /// };
    ///
    /// assert_eq!(value, Some(0));
    /// assert_eq!(&*key_str, "aBc".as_bytes());
    /// ```
    ///
    /// For more examples, see [`ZeroTrieSimpleAsciiCursor::step`].
    #[inline]
    pub fn step(&mut self, byte: u8) -> Option<u8> {
        // Unlike the case-sensitive cursor, this returns the byte actually
        // stored in the trie, which may differ in case from the input.
        reader::step_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, byte)
    }
    /// Takes the value at the current position.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::take_value`].
    #[inline]
    pub fn take_value(&mut self) -> Option<usize> {
        reader::take_value(&mut self.trie.store)
    }
    /// Probes the next byte in the cursor.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::probe`].
    pub fn probe(&mut self, index: usize) -> Option<AsciiProbeResult> {
        reader::probe_parameterized::<ZeroAsciiIgnoreCaseTrie<[u8]>>(&mut self.trie.store, index)
    }
    /// Checks whether the cursor points to an empty trie.
    ///
    /// For more details, see [`ZeroTrieSimpleAsciiCursor::is_empty`].
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.trie.is_empty()
    }
}
impl fmt::Write for ZeroTrieSimpleAsciiCursor<'_> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned; any
    /// ASCII bytes before the offending char have already been stepped.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_str("abcdxy").expect("all ASCII");
    /// cursor.write_str("🚂").expect_err("non-ASCII");
    /// ```
    fn write_str(&mut self, s: &str) -> fmt::Result {
        s.bytes().try_for_each(|b| {
            if b.is_ascii() {
                self.step(b);
                Ok(())
            } else {
                Err(fmt::Error)
            }
        })
    }
    /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::fmt::Write;
    /// use zerotrie::ZeroTrieSimpleAscii;
    ///
    /// // A trie with two values: "abc" and "abcdef"
    /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
    ///
    /// let mut cursor = trie.cursor();
    /// cursor.write_char('a').expect("ASCII");
    /// cursor.write_char('x').expect("ASCII");
    /// cursor.write_char('🚂').expect_err("non-ASCII");
    /// ```
    fn write_char(&mut self, c: char) -> fmt::Result {
        if c.is_ascii() {
            // An ASCII char fits in a single byte, so the cast is lossless.
            self.step(c as u8);
            Ok(())
        } else {
            Err(fmt::Error)
        }
    }
}
impl fmt::Write for ZeroAsciiIgnoreCaseTrieCursor<'_> {
    /// Steps the cursor through each ASCII byte of the string.
    ///
    /// If the string contains non-ASCII chars, an error is returned; any
    /// ASCII bytes before the offending char have already been stepped.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        s.bytes().try_for_each(|b| {
            if !b.is_ascii() {
                return Err(fmt::Error);
            }
            // The matched byte (possibly differing in case) is not needed here.
            self.step(b);
            Ok(())
        })
    }
    /// Equivalent to [`ZeroAsciiIgnoreCaseTrieCursor::step()`], except returns
    /// an error if the char is non-ASCII.
    fn write_char(&mut self, c: char) -> fmt::Result {
        if c.is_ascii() {
            // An ASCII char fits in a single byte, so the cast is lossless.
            self.step(c as u8);
            Ok(())
        } else {
            Err(fmt::Error)
        }
    }
}

25
vendor/zerotrie/src/error.rs vendored Normal file
View File

@@ -0,0 +1,25 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use displaydoc::Display;
/// Error types for the `zerotrie` crate.
///
/// The user-facing messages are derived by `displaydoc` from the
/// `#[displaydoc(...)]` attributes below.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Display)]
#[non_exhaustive]
pub enum ZeroTrieBuildError {
    /// Non-ASCII data was added to an ASCII-only trie.
    #[displaydoc("Non-ASCII cannot be added to an ASCII-only trie")]
    NonAsciiError,
    /// The trie reached its maximum supported capacity.
    #[displaydoc("Reached maximum capacity of trie")]
    CapacityExceeded,
    /// The builder could not solve the perfect hash function.
    #[displaydoc("Failed to solve the perfect hash function. This is rare! Please report your case to the ICU4X team.")]
    CouldNotSolvePerfectHash,
    /// Mixed-case data was added to a case-insensitive trie.
    #[displaydoc("Mixed-case data added to case-insensitive trie")]
    MixedCase,
}
// No underlying cause to expose, so the default Error methods suffice.
impl core::error::Error for ZeroTrieBuildError {}

122
vendor/zerotrie/src/helpers.rs vendored Normal file
View File

@@ -0,0 +1,122 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
pub(crate) trait MaybeSplitAt<T> {
    /// A GIGO-tolerant `slice::split_at`: an out-of-range `mid` trips a
    /// debug assertion and yields `(self, &[])` instead of panicking.
    fn debug_split_at(&self, mid: usize) -> (&Self, &Self);
}
impl<T> MaybeSplitAt<T> for [T] {
    #[inline]
    fn debug_split_at(&self, mid: usize) -> (&Self, &Self) {
        match self.split_at_checked(mid) {
            Some(halves) => halves,
            None => {
                // Out-of-range is a bug in debug builds; degrade gracefully in release.
                debug_assert!(false, "debug_split_at: {mid} expected to be in range");
                (self, &[])
            }
        }
    }
}
pub(crate) trait DebugUnwrapOr<T> {
    /// Unwraps the option, or trips a debug assertion and returns the
    /// `gigo_value` fallback in release builds.
    fn debug_unwrap_or(self, gigo_value: T) -> T;
}
impl<T> DebugUnwrapOr<T> for Option<T> {
    #[inline]
    fn debug_unwrap_or(self, gigo_value: T) -> T {
        if let Some(inner) = self {
            return inner;
        }
        // A None here indicates invalid input data (GIGO); loud in debug only.
        debug_assert!(false, "debug_unwrap_or called on a None value");
        gigo_value
    }
}
/// Unwraps an `Option` in a GIGO-tolerant way: in debug builds a `None`
/// trips `debug_assert!`; in release builds control diverts via the given
/// `return`/`break` (with the default message "invalid trie" if none given).
macro_rules! debug_unwrap {
    // Unwrap, or return `$retval` with a custom debug message.
    ($expr:expr, return $retval:expr, $($arg:tt)+) => {
        match $expr {
            Some(x) => x,
            None => {
                debug_assert!(false, $($arg)*);
                return $retval;
            }
        }
    };
    // Unwrap, or return `$retval` with the default message.
    ($expr:expr, return $retval:expr) => {
        debug_unwrap!($expr, return $retval, "invalid trie")
    };
    // Unwrap, or break out of the enclosing loop with a custom message.
    ($expr:expr, break, $($arg:tt)+) => {
        match $expr {
            Some(x) => x,
            None => {
                debug_assert!(false, $($arg)*);
                break;
            }
        }
    };
    // Unwrap, or break with the default message.
    ($expr:expr, break) => {
        debug_unwrap!($expr, break, "invalid trie")
    };
    // Unwrap, or return `()` with a custom message.
    ($expr:expr, $($arg:tt)+) => {
        debug_unwrap!($expr, return (), $($arg)*)
    };
    // Unwrap, or return `()` with the default message.
    ($expr:expr) => {
        debug_unwrap!($expr, return ())
    };
}
pub(crate) use debug_unwrap;
/// The maximum number of base-10 digits required for rendering a usize.
/// Note: 24/10 is an approximation of 8*log10(2)
pub(crate) const MAX_USIZE_LEN_AS_DIGITS: usize = core::mem::size_of::<usize>() * 24 / 10 + 1;
/// Formats a usize as a string of length N, padded with spaces,
/// with the given prefix.
///
/// # Panics
///
/// If the string is too short, the function may panic. To prevent
/// this, N should be MAX_USIZE_LEN_AS_DIGITS larger than M.
#[allow(clippy::indexing_slicing)] // documented, and based on const parameters
pub(crate) const fn const_fmt_int<const M: usize, const N: usize>(
    prefix: [u8; M],
    value: usize,
) -> [u8; N] {
    let mut out = [b' '; N];
    // Copy the prefix verbatim into the head of the output buffer.
    let mut idx = 0;
    while idx < M {
        out[idx] = prefix[idx];
        idx += 1;
    }
    // Render the decimal digits right-aligned into a scratch buffer
    // (const fn, so no core::fmt machinery is available here).
    let mut digits = [b' '; MAX_USIZE_LEN_AS_DIGITS];
    let mut remaining = value;
    let mut pos = MAX_USIZE_LEN_AS_DIGITS;
    loop {
        pos -= 1;
        digits[pos] = b'0' + (remaining % 10) as u8;
        remaining /= 10;
        if remaining == 0 {
            break;
        }
    }
    // Copy the rendered digits to just after the prefix; any trailing
    // bytes of the output remain spaces.
    let mut dst = M;
    while pos < MAX_USIZE_LEN_AS_DIGITS {
        out[dst] = digits[pos];
        dst += 1;
        pos += 1;
    }
    out
}
/// Sanity checks for `const_fmt_int`: exact fit, space padding, and prefixes.
#[test]
fn test_const_fmt_int() {
    assert_eq!(*b"123", const_fmt_int::<0, 3>(*b"", 123));
    // N = 6 pads the 3-digit number with exactly three trailing spaces.
    // (The padded literal must be 6 bytes long to match the return type.)
    assert_eq!(*b"123   ", const_fmt_int::<0, 6>(*b"", 123));
    assert_eq!(*b"abc123", const_fmt_int::<3, 6>(*b"abc", 123));
}

87
vendor/zerotrie/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,87 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! A data structure offering zero-copy storage and retrieval of byte strings, with a focus
//! on the efficient storage of ASCII strings. Strings are mapped to `usize` values.
//!
//! [`ZeroTrie`] does not support mutation because doing so would require recomputing the entire
//! data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`].
//!
//! There are multiple variants of [`ZeroTrie`] optimized for different use cases.
//!
//! # Examples
//!
//! ```
//! use zerotrie::ZeroTrie;
//!
//! let data: &[(&str, usize)] = &[("abc", 11), ("xyz", 22), ("axyb", 33)];
//!
//! let trie: ZeroTrie<Vec<u8>> = data.iter().copied().collect();
//!
//! assert_eq!(trie.get("axyb"), Some(33));
//! assert_eq!(trie.byte_len(), 18);
//! ```
//!
//! # Internal Structure
//!
//! To read about the internal structure of [`ZeroTrie`], build the docs with private modules:
//!
//! ```bash
//! cargo doc --document-private-items --all-features --no-deps --open
//! ```
//!
//! [`LiteMap`]: litemap::LiteMap
//! [`BTreeMap`]: alloc::collections::BTreeMap
// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
#![cfg_attr(not(any(test, doc)), no_std)]
#![cfg_attr(
not(test),
deny(
clippy::indexing_slicing,
clippy::unwrap_used,
clippy::expect_used,
clippy::panic,
clippy::exhaustive_structs,
clippy::exhaustive_enums,
clippy::trivially_copy_pass_by_ref,
missing_debug_implementations,
)
)]
#![warn(missing_docs)]
#[cfg(feature = "alloc")]
extern crate alloc;
mod builder;
mod byte_phf;
pub mod cursor;
mod error;
#[macro_use]
mod helpers;
mod options;
mod reader;
#[cfg(feature = "serde")]
mod serde;
mod varint;
mod zerotrie;
pub use crate::zerotrie::ZeroAsciiIgnoreCaseTrie;
pub use crate::zerotrie::ZeroTrie;
pub use crate::zerotrie::ZeroTrieExtendedCapacity;
pub use crate::zerotrie::ZeroTriePerfectHash;
pub use crate::zerotrie::ZeroTrieSimpleAscii;
pub use error::ZeroTrieBuildError;
#[cfg(feature = "alloc")]
pub use crate::zerotrie::ZeroTrieStringIterator;
#[cfg(feature = "alloc")]
pub use reader::ZeroTrieIterator;
#[doc(hidden)]
pub mod _internal {
    //! Hidden re-exports of implementation details (used, e.g., by the
    //! `f1`/`f2` doctests); not part of the crate's public API surface.
    pub use crate::byte_phf::f1;
    pub use crate::byte_phf::f2;
    pub use crate::byte_phf::PerfectByteHashMap;
}

153
vendor/zerotrie/src/options.rs vendored Normal file
View File

@@ -0,0 +1,153 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Options for building and reading from a ZeroTrie.
//!
//! These options are internal to the crate. A small selection of options
//! are exported by way of the different public types on this crate.
/// Whether to use the perfect hash function in the ZeroTrie.
#[derive(Copy, Clone)]
pub(crate) enum PhfMode {
    /// Use binary search for all branch nodes.
    BinaryOnly,
    /// Use the perfect hash function for large branch nodes.
    UsePhf,
}
impl PhfMode {
    /// Contributes bit 0x1 to the packed options byte (used with `serde`).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        match self {
            Self::BinaryOnly => 0,
            Self::UsePhf => 0x1,
        }
    }
}
/// Whether to support non-ASCII data in the ZeroTrie.
#[derive(Copy, Clone)]
pub(crate) enum AsciiMode {
    /// Support only ASCII, returning an error if non-ASCII is found.
    AsciiOnly,
    /// Support all data, creating span nodes for non-ASCII bytes.
    BinarySpans,
}
impl AsciiMode {
    /// Contributes bit 0x2 to the packed options byte (used with `serde`).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        match self {
            Self::AsciiOnly => 0,
            Self::BinarySpans => 0x2,
        }
    }
}
/// Whether to enforce a limit to the capacity of the ZeroTrie.
#[derive(Copy, Clone)]
pub(crate) enum CapacityMode {
    /// Return an error if the trie requires a branch of more than 2^32 bytes.
    Normal,
    /// Construct the trie without returning an error.
    Extended,
}
impl CapacityMode {
    /// Contributes bit 0x4 to the packed options byte (used with `serde`).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        match self {
            Self::Normal => 0,
            Self::Extended => 0x4,
        }
    }
}
/// How to handle strings with mixed ASCII case at a node, such as "abc" and "Abc"
#[derive(Copy, Clone)]
pub(crate) enum CaseSensitivity {
    /// Allow all strings and sort them by byte value.
    Sensitive,
    /// Reject strings with different case and sort them as if `to_ascii_lowercase` is called.
    IgnoreCase,
}
impl CaseSensitivity {
    /// Contributes bit 0x8 to the packed options byte (used with `serde`).
    #[cfg(feature = "serde")]
    const fn to_u8_flag(self) -> u8 {
        match self {
            Self::Sensitive => 0,
            Self::IgnoreCase => 0x8,
        }
    }
}
/// Bundle of all options that together define a ZeroTrie flavor's
/// wire format and lookup behavior.
#[derive(Copy, Clone)]
pub(crate) struct ZeroTrieBuilderOptions {
    /// Branch strategy: binary search only, or PHF for large branches.
    pub phf_mode: PhfMode,
    /// Whether non-ASCII bytes are supported via span nodes.
    pub ascii_mode: AsciiMode,
    /// Whether the 2^32-byte branch limit is enforced.
    pub capacity_mode: CapacityMode,
    /// Whether lookups ignore ASCII case.
    pub case_sensitivity: CaseSensitivity,
}
impl ZeroTrieBuilderOptions {
    /// Packs all options into one byte by OR-ing each option's flag bit;
    /// used as the tag/flags byte when (de)serializing tries.
    #[cfg(feature = "serde")]
    pub(crate) const fn to_u8_flags(self) -> u8 {
        self.phf_mode.to_u8_flag()
            | self.ascii_mode.to_u8_flag()
            | self.capacity_mode.to_u8_flag()
            | self.case_sensitivity.to_u8_flag()
    }
}
/// Internal trait attaching a compile-time options bundle to each ZeroTrie flavor.
pub(crate) trait ZeroTrieWithOptions {
    /// The options defining this flavor's wire format and lookup behavior.
    const OPTIONS: ZeroTrieBuilderOptions;
}
/// All branch nodes are binary search
/// and there are no span nodes.
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii<S> {
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::BinaryOnly,
        ascii_mode: AsciiMode::AsciiOnly,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}
impl<S: ?Sized> crate::ZeroTrieSimpleAscii<S> {
    // Pre-computed serialization flags byte. All four options above map to
    // flag bit 0, so this is 0 for this flavor.
    #[cfg(feature = "serde")]
    pub(crate) const FLAGS: u8 = Self::OPTIONS.to_u8_flags();
}
/// All branch nodes are binary search
/// and nodes use case-insensitive matching.
// Flags byte: 0x8 (IgnoreCase is the only non-default option).
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie<S> {
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::BinaryOnly,
        ascii_mode: AsciiMode::AsciiOnly,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::IgnoreCase,
    };
}
/// Branch nodes could be either binary search or PHF.
// Flags byte: 0x3 (UsePhf | BinarySpans).
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTriePerfectHash<S> {
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::UsePhf,
        ascii_mode: AsciiMode::BinarySpans,
        capacity_mode: CapacityMode::Normal,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}
/// No limited capacity assertion.
// Flags byte: 0x7 (UsePhf | BinarySpans | Extended).
impl<S: ?Sized> ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity<S> {
    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
        phf_mode: PhfMode::UsePhf,
        ascii_mode: AsciiMode::BinarySpans,
        capacity_mode: CapacityMode::Extended,
        case_sensitivity: CaseSensitivity::Sensitive,
    };
}

731
vendor/zerotrie/src/reader.rs vendored Normal file
View File

@@ -0,0 +1,731 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! # Internal layout of ZeroTrie
//!
//! A ZeroTrie is composed of a series of nodes stored in sequence in a byte slice.
//!
//! There are 4 types of nodes:
//!
//! 1. ASCII (`0xxxxxxx`): matches a literal ASCII byte.
//! 2. Span (`101xxxxx`): matches a span of non-ASCII bytes.
//! 3. Value (`100xxxxx`): associates a value with a string
//! 4. Branch (`11xxxxxx`): matches one of a set of bytes.
//!
//! Span, Value, and Branch nodes contain a varint, which has different semantics for each:
//!
//! - Span varint: length of the span
//! - Value varint: value associated with the string
//! - Branch varint: number of edges in the branch and width of the offset table
//!
//! If reading an ASCII, Span, or Branch node, one or more bytes are consumed from the input
//! string. If the next byte(s) in the input string do not match the node, we return `None`.
//! If reading a Value node, if the string is empty, return `Some(value)`; otherwise, we skip
//! the Value node and continue on to the next node.
//!
//! When a node is consumed, a shorter, well-formed ZeroTrie remains.
//!
//! ### Basic Example
//!
//! Here is an example ZeroTrie without branch nodes:
//!
//! ```
//! use zerotrie::ZeroTriePerfectHash;
//!
//! let bytes = [
//! b'a', // ASCII literal
//! 0b10001010, // value 10
//! b'b', // ASCII literal
//! 0b10100011, // span of 3
//! 0x81, // first byte in span
//! 0x91, // second byte in span
//! 0xA1, // third and final byte in span
//! 0b10000100, // value 4
//! ];
//!
//! let trie = ZeroTriePerfectHash::from_bytes(&bytes);
//!
//! // First value: "a" → 10
//! assert_eq!(trie.get(b"a"), Some(10));
//!
//! // Second value: "ab\x81\x91\xA1" → 4
//! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4));
//!
//! // A few examples of strings that do NOT have values in the trie:
//! assert_eq!(trie.get(b"ab"), None);
//! assert_eq!(trie.get(b"b"), None);
//! assert_eq!(trie.get(b"b\x81\x91\xA1"), None);
//! ```
//!
//! ## Branch Nodes
//!
//! There are two types of branch nodes: binary search and perfect hash. `ZeroTrieSimpleAscii`
//! contains only binary search nodes, whereas `ZeroTriePerfectHash` can contain either.
//!
//! The head node of the branch has a varint that encodes two things:
//!
//! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256
//! - Bits 9 and 10: width of the offset table (`W`)
//!
//! Note that N is always in the range [1, 256]. There can't be more than 256 edges because
//! there are only 256 unique u8 values.
//!
//! A few examples of the head node of the branch:
//!
//! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0
//! - `0b11000110`: varint bits `110`: N = 6; W = 0
//! - `0b11100000 0b00000101`: varint bits `1000101`: N = 69; W = 0
//! - `0b11100010 0b00000000`: varint bits `101000000`: N = 64; W = 1
//!
//! In `ZeroTriePerfectHash`, if N <= 15, the branch is assumed to be a binary search, and if
//! N > 15, the branch is assumed to be a perfect hash.
//!
//! ### Binary Search Branch Nodes
//!
//! A binary search branch node is used when:
//!
//! 1. The trie is a `ZeroTrieSimpleAscii`, OR
//! 2. There are 15 or fewer items in the branch.
//!
//! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte
//! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search),
//! the index `i` of the byte within the list is used to index into the offset table (described
//! below). If the byte is not in the list, the string is not in the trie, so return `None`.
//!
//! ### Perfect Hash Branch Nodes
//!
//! A perfect hash branch node is used when:
//!
//! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND
//! 2. There are 16 or more items in the branch.
//!
//! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing
//! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an
//! index within the hash table `i` is resolved and used as input to index into the offset
//! table (described below), or the value is determined to not be present and `None` is
//! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`].
//!
//! ### Offset Tables
//!
//! The _offset table_ encodes the range of the remaining buffer containing the trie reachable
//! from the byte matched in the branch node. Both types of branch nodes include an offset
//! table following the key lookup. Given the index `i` from the first step, the range
//! `[s_i, s_(i+1))` brackets the next step in the trie.
//!
//! Offset tables utilize the `W` parameter stored in the branch head node. The special case
//! when `W == 0`, with `N - 1` bytes, is easiest to understand:
//!
//! **Offset table, W = 0:** `[s_1, s_2, ..., s_(N-1)]`
//!
//! Note that `s_0` is always 0 and `s_N` is always the length of the remaining slice, so those
//! values are not explicitly included in the offset table.
//!
//! When W > 0, the high and low bits of the offsets are in separate bytes, arranged as follows:
//!
//! **Generalized offset table:** `[a_1, a_2, ..., a_(N-1), b_1, b_2, ..., b_(N-1), c_1, ...]`
//!
//! where `s_i = (a_i << 8 + b_i) << 8 + c_i ...` (high bits first, low bits last)
//!
//! ### Advanced Example
//!
//! The following trie encodes the following map. It has multiple varints and branch nodes, which
//! are all binary search with W = 0. Note that there is a value for the empty string.
//!
//! - "" → 0
//! - "axb" → 100
//! - "ayc" → 2
//! - "azd" → 3
//! - "bxe" → 4
//! - "bxefg" → 500
//! - "bxefh" → 6
//! - "bxei" → 7
//! - "bxeikl" → 8
//!
//! ```
//! use zerotrie::ZeroTrieSimpleAscii;
//!
//! let bytes = [
//! 0b10000000, // value 0
//! 0b11000010, // branch of 2
//! b'a', //
//! b'b', //
//! 13, //
//! 0b11000011, // start of 'a' subtree: branch of 3
//! b'x', //
//! b'y', //
//! b'z', //
//! 3, //
//! 5, //
//! b'b', //
//! 0b10010000, // value 100 (lead)
//! 0x54, // value 100 (trail)
//! b'c', //
//! 0b10000010, // value 2
//! b'd', //
//! 0b10000011, // value 3
//! b'x', // start of 'b' subtree
//! b'e', //
//! 0b10000100, // value 4
//! 0b11000010, // branch of 2
//! b'f', //
//! b'i', //
//! 7, //
//! 0b11000010, // branch of 2
//! b'g', //
//! b'h', //
//! 2, //
//! 0b10010011, // value 500 (lead)
//! 0x64, // value 500 (trail)
//! 0b10000110, // value 6
//! 0b10000111, // value 7
//! b'k', //
//! b'l', //
//! 0b10001000, // value 8
//! ];
//!
//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes);
//!
//! // Assert that the specified items are in the map
//! assert_eq!(trie.get(b""), Some(0));
//! assert_eq!(trie.get(b"axb"), Some(100));
//! assert_eq!(trie.get(b"ayc"), Some(2));
//! assert_eq!(trie.get(b"azd"), Some(3));
//! assert_eq!(trie.get(b"bxe"), Some(4));
//! assert_eq!(trie.get(b"bxefg"), Some(500));
//! assert_eq!(trie.get(b"bxefh"), Some(6));
//! assert_eq!(trie.get(b"bxei"), Some(7));
//! assert_eq!(trie.get(b"bxeikl"), Some(8));
//!
//! // Assert that some other items are not in the map
//! assert_eq!(trie.get(b"a"), None);
//! assert_eq!(trie.get(b"bx"), None);
//! assert_eq!(trie.get(b"xba"), None);
//! ```
use crate::byte_phf::PerfectByteHashMap;
use crate::cursor::AsciiProbeResult;
use crate::helpers::*;
use crate::options::*;
use crate::varint::read_varint_meta2;
use crate::varint::read_varint_meta3;
#[cfg(feature = "alloc")]
use alloc::string::String;
/// Given a slice starting with an offset table, returns the trie for the given index.
///
/// Arguments:
/// - `trie` = a trie pointing at an offset table (after the branch node and search table)
/// - `i` = the desired index within the offset table
/// - `n` = the number of items in the offset table
/// - `w` = the width of the offset table items minus one
#[inline]
fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> &[u8] {
    // `p`/`q` accumulate the start/end offsets of the i-th sub-trie, one byte
    // per row from most to least significant; see the module docs on the
    // generalized offset table layout.
    let mut p = 0usize;
    let mut q = 0usize;
    loop {
        // Each row of the offset table holds n - 1 entries (s_0 = 0 is implicit).
        let indices;
        (indices, trie) = trie.debug_split_at(n - 1);
        // Start offset: s_0 is always 0; otherwise take entry i - 1.
        p = (p << 8)
            + if i == 0 {
                0
            } else {
                *indices.get(i - 1).debug_unwrap_or(&0) as usize
            };
        // End offset: for the last index, s_N is implicitly the length of the
        // remaining slice (the overwrite on each row is intentional: only the
        // final row's `trie.len()` matters in that case).
        q = match indices.get(i) {
            Some(x) => (q << 8) + *x as usize,
            None => trie.len(),
        };
        if w == 0 {
            break;
        }
        w -= 1;
    }
    // Out-of-range offsets in a malformed trie degrade to an empty sub-trie.
    trie.get(p..q).debug_unwrap_or(&[])
}
/// Version of [`get_branch()`] specialized for the case `w == 0` for performance
#[inline]
fn get_branch_w0(trie: &[u8], i: usize, n: usize) -> &[u8] {
    // A one-byte-wide offset table holds n - 1 entries; s_0 = 0 and
    // s_n = length of the remainder are implicit.
    let (offsets, rest) = trie.debug_split_at(n - 1);
    let start = match i {
        0 => 0,
        _ => *offsets.get(i - 1).debug_unwrap_or(&0) as usize,
    };
    let end = offsets
        .get(i)
        .map(|&b| b as usize)
        .unwrap_or_else(|| rest.len());
    // Out-of-range offsets in a malformed trie degrade to an empty sub-trie.
    rest.get(start..end).debug_unwrap_or(&[])
}
/// The node type. See the module-level docs for more explanation of the four node types.
// Determined from a node's lead byte by [`byte_type()`].
enum NodeType {
    /// An ASCII node. Contains a single literal ASCII byte and no varint.
    Ascii,
    /// A span node. Contains a varint indicating how big the span is.
    Span,
    /// A value node. Contains a varint representing the value.
    Value,
    /// A branch node. Contains a varint of the number of output nodes, plus W in the high bits.
    Branch,
}
impl core::fmt::Debug for NodeType {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
use NodeType::*;
f.write_str(match *self {
Ascii => "a",
Span => "s",
Value => "v",
Branch => "m",
})
}
}
/// Classifies a node's lead byte; see the module docs for the bit patterns.
#[inline]
fn byte_type(b: u8) -> NodeType {
    // Dispatch on the top three bits: 100 = value, 101 = span,
    // 110/111 = branch; anything with the high bit clear is ASCII.
    match b >> 5 {
        0b100 => NodeType::Value,
        0b101 => NodeType::Span,
        0b110 | 0b111 => NodeType::Branch,
        _ => NodeType::Ascii,
    }
}
/// Looks up `ascii` in `trie`, returning its value if present.
///
/// This is the shared lookup routine for all trie flavors; `T::OPTIONS`
/// selects at compile time which node types and branch strategies are
/// handled, letting the unused arms be optimized away per flavor.
#[inline]
pub(crate) fn get_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    mut trie: &[u8],
    mut ascii: &[u8],
) -> Option<usize> {
    loop {
        let (b, x, i, search);
        (b, trie) = trie.split_first()?;
        let byte_type = byte_type(*b);
        // Read the node's varint (if any): spans/values use the meta3
        // encoding, branches use meta2; ASCII nodes carry no varint.
        (x, trie) = match byte_type {
            NodeType::Ascii => (0, trie),
            NodeType::Span => {
                if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) {
                    read_varint_meta3(*b, trie)
                } else {
                    debug_assert!(false, "Span node found in ASCII trie!");
                    return None;
                }
            }
            NodeType::Value => read_varint_meta3(*b, trie),
            NodeType::Branch => read_varint_meta2(*b, trie),
        };
        if let Some((c, temp)) = ascii.split_first() {
            if matches!(byte_type, NodeType::Ascii) {
                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
                {
                    b.eq_ignore_ascii_case(c)
                } else {
                    b == c
                };
                if is_match {
                    // Matched a byte
                    ascii = temp;
                    continue;
                } else {
                    // Byte that doesn't match
                    return None;
                }
            }
            if matches!(byte_type, NodeType::Value) {
                // Value node, but not at end of string
                continue;
            }
            if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans)
                && matches!(byte_type, NodeType::Span)
            {
                // The span (x bytes of the trie) must match x input bytes exactly.
                let (trie_span, ascii_span);
                (trie_span, trie) = trie.debug_split_at(x);
                (ascii_span, ascii) = ascii.split_at_checked(x)?;
                if trie_span == ascii_span {
                    // Matched a byte span
                    continue;
                } else {
                    // Byte span that doesn't match
                    return None;
                }
            }
            // Branch node: the varint packs N (low 8 bits) and W (high bits);
            // see the module docs on branch head nodes.
            let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
            let w = if matches!(T::OPTIONS.capacity_mode, CapacityMode::Extended) {
                w
            } else {
                // See the table below regarding this assertion
                debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
                w & 0x3
            };
            // N = 0 encodes 256 edges (all possible byte values).
            let x = if x == 0 { 256 } else { x };
            if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 {
                // binary search
                (search, trie) = trie.debug_split_at(x);
                let bsearch_result =
                    if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
                        search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
                            x.to_ascii_lowercase()
                        })
                    } else {
                        search.binary_search(c)
                    };
                i = bsearch_result.ok()?;
            } else {
                // phf: table layout is 1 byte `p`, x bytes `q`, x key bytes.
                (search, trie) = trie.debug_split_at(x * 2 + 1);
                i = PerfectByteHashMap::from_store(search).get(*c)?;
            }
            // Narrow the trie to the sub-trie selected by edge index i.
            trie = if w == 0 {
                get_branch_w0(trie, i, x)
            } else {
                get_branch(trie, i, x, w)
            };
            ascii = temp;
            continue;
        } else {
            if matches!(byte_type, NodeType::Value) {
                // Value node at end of string
                return Some(x);
            }
            return None;
        }
    }
}
// DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w.
//
// | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 |
// |---------------|-------------------|--------------------|-------------------|--------------------|
// | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns |
// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs |
// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs |
/// Steps one node into the trie assuming all branch nodes are binary search and that
/// there are no span nodes.
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie reachable by `c`.
///
/// Returns the matched byte (which may differ from `c` in case only, under
/// `CaseSensitivity::IgnoreCase`), or `None` — leaving `trie` empty — if no
/// edge matches.
#[inline]
pub(crate) fn step_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    trie: &mut &[u8],
    c: u8,
) -> Option<u8> {
    // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`.
    // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
    // If a span node is encountered, `None` is returned later in this function.
    debug_assert!(
        matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly),
        "Spans not yet implemented in step function"
    );
    // PHF can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly),
        "PHF not yet implemented in step function"
    );
    // Extended Capacity can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal),
        "Extended capacity not yet implemented in step function"
    );
    let (mut b, x, search);
    // Skip over value nodes until we hit an ASCII node (match/no-match) or a
    // branch node (fall through to the branch logic below).
    loop {
        (b, *trie) = match trie.split_first() {
            Some(v) => v,
            None => {
                // Empty trie or only a value node
                return None;
            }
        };
        match byte_type(*b) {
            NodeType::Ascii => {
                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
                {
                    b.eq_ignore_ascii_case(&c)
                } else {
                    *b == c
                };
                if is_match {
                    // Matched a byte
                    return Some(*b);
                } else {
                    // Byte that doesn't match
                    *trie = &[];
                    return None;
                }
            }
            NodeType::Branch => {
                // Proceed to the branch node logic below
                (x, *trie) = read_varint_meta2(*b, trie);
                break;
            }
            NodeType::Span => {
                // Question: Should we put the trie back into a valid state?
                // Currently this code is unreachable so let's not worry about it.
                debug_assert!(false, "Span node found in ASCII trie!");
                return None;
            }
            NodeType::Value => {
                // Skip the value node and go to the next node
                (_, *trie) = read_varint_meta3(*b, trie);
                continue;
            }
        };
    }
    // Branch node: varint packs N (low 8 bits) and W (high bits).
    let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
    // See comment above regarding this assertion
    debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
    let w = w & 0x3;
    // N = 0 encodes 256 edges.
    let x = if x == 0 { 256 } else { x };
    // Always use binary search
    (search, *trie) = trie.debug_split_at(x);
    let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
        search.binary_search_by_key(&c.to_ascii_lowercase(), |x| x.to_ascii_lowercase())
    } else {
        search.binary_search(&c)
    };
    match bsearch_result {
        Ok(i) => {
            // Matched a byte: narrow to the i-th sub-trie.
            *trie = if w == 0 {
                get_branch_w0(trie, i, x)
            } else {
                get_branch(trie, i, x, w)
            };
            #[allow(clippy::indexing_slicing)] // i is from a binary search
            Some(search[i])
        }
        Err(_) => {
            // Byte that doesn't match
            *trie = &[];
            None
        }
    }
}
/// Steps one node into the trie, assuming all branch nodes are binary search and that
/// there are no span nodes, using an index.
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie indexed by `index`.
///
/// Returns the matched byte and the total number of sibling edges, or `None`
/// — leaving `trie` empty — if `index` is out of range.
#[inline]
pub(crate) fn probe_parameterized<T: ZeroTrieWithOptions + ?Sized>(
    trie: &mut &[u8],
    index: usize,
) -> Option<AsciiProbeResult> {
    // Currently, the only option `probe_parameterized` supports is `CaseSensitivity::IgnoreCase`.
    // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie.
    // If a span node is encountered, `None` is returned later in this function.
    debug_assert!(
        matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly),
        "Spans not yet implemented in step function"
    );
    // PHF can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly),
        "PHF not yet implemented in step function"
    );
    // Extended Capacity can be easily implemented but the code is not yet reachable
    debug_assert!(
        matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal),
        "Extended capacity not yet implemented in step function"
    );
    let (mut b, x, search);
    // Skip over value nodes until we hit an ASCII node (only index 0 is
    // valid: a single edge) or a branch node.
    loop {
        (b, *trie) = match trie.split_first() {
            Some(v) => v,
            None => {
                // Empty trie or only a value node
                return None;
            }
        };
        match byte_type(*b) {
            NodeType::Ascii => {
                if index > 0 {
                    *trie = &[];
                    return None;
                }
                return Some(AsciiProbeResult {
                    byte: *b,
                    total_siblings: 1,
                });
            }
            NodeType::Branch => {
                // Proceed to the branch node logic below
                (x, *trie) = read_varint_meta2(*b, trie);
                break;
            }
            NodeType::Span => {
                // Question: Should we put the trie back into a valid state?
                // Currently this code is unreachable so let's not worry about it.
                debug_assert!(false, "Span node found in ASCII trie!");
                return None;
            }
            NodeType::Value => {
                // Skip the value node and go to the next node
                (_, *trie) = read_varint_meta3(*b, trie);
                continue;
            }
        };
    }
    // Branch node: varint packs N (low 8 bits) and W (high bits).
    let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
    // Note: 256 edges are reported as total_siblings = 0 (the raw N byte).
    debug_assert!(u8::try_from(x).is_ok());
    let total_siblings = x as u8;
    // See comment above regarding this assertion
    debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3");
    let w = w & 0x3;
    let x = if x == 0 { 256 } else { x };
    if index >= x {
        *trie = &[];
        return None;
    }
    (search, *trie) = trie.debug_split_at(x);
    // Narrow to the index-th sub-trie.
    *trie = if w == 0 {
        get_branch_w0(trie, index, x)
    } else {
        get_branch(trie, index, x, w)
    };
    Some(AsciiProbeResult {
        #[allow(clippy::indexing_slicing)] // index < x, the length of search
        byte: search[index],
        total_siblings,
    })
}
/// Steps one node into the trie if the head node is a value node, returning the value.
/// If the head node is not a value node, no change is made.
///
/// The input-output argument `trie` starts at the original trie and ends pointing to
/// the sub-trie with the value node removed.
pub(crate) fn take_value(trie: &mut &[u8]) -> Option<usize> {
    let (lead, rest) = trie.split_first()?;
    if matches!(byte_type(*lead), NodeType::Value) {
        // Consume the value node: advance past its varint and return the value.
        let (value, remainder) = read_varint_meta3(*lead, rest);
        *trie = remainder;
        Some(value)
    } else {
        // Any other node type: leave `trie` untouched.
        None
    }
}
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
/// Iterator type for walking the byte sequences contained in a ZeroTrie.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
#[derive(Debug)]
pub struct ZeroTrieIterator<'a> {
    /// Whether the PHF is enabled on this trie.
    use_phf: bool,
    /// Intermediate state during iteration:
    /// 1. A trie (usually a slice of the original, bigger trie)
    /// 2. The string that leads to the trie
    /// 3. If the trie's lead node is a branch node, the current index being evaluated
    // This acts as an explicit depth-first traversal stack; an empty stack
    // means iteration is finished.
    state: Vec<(&'a [u8], Vec<u8>, usize)>,
}
#[cfg(feature = "alloc")]
impl<'a> ZeroTrieIterator<'a> {
    /// Creates an iterator over `store`, starting at the root with an empty key.
    pub(crate) fn new<S: AsRef<[u8]> + ?Sized>(store: &'a S, use_phf: bool) -> Self {
        // Seed the traversal stack with the whole trie, an empty key prefix,
        // and branch index 0.
        let root = (store.as_ref(), Vec::new(), 0);
        ZeroTrieIterator {
            use_phf,
            state: alloc::vec![root],
        }
    }
}
#[cfg(feature = "alloc")]
impl Iterator for ZeroTrieIterator<'_> {
    type Item = (Vec<u8>, usize);
    /// Depth-first traversal driven by the explicit `state` stack; each entry
    /// is (remaining trie, key prefix so far, branch index to resume at).
    fn next(&mut self) -> Option<Self::Item> {
        let (mut trie, mut string, mut branch_idx);
        (trie, string, branch_idx) = self.state.pop()?;
        loop {
            let (b, x, span, search);
            // Saved so a branch node can be revisited at the next edge index.
            let return_trie = trie;
            (b, trie) = match trie.split_first() {
                Some(tpl) => tpl,
                None => {
                    // At end of current branch; step back to the branch node.
                    // If there are no more branches, we are finished.
                    (trie, string, branch_idx) = self.state.pop()?;
                    continue;
                }
            };
            let byte_type = byte_type(*b);
            if matches!(byte_type, NodeType::Ascii) {
                string.push(*b);
                continue;
            }
            // Read the node's varint (spans/values: meta3; branches: meta2).
            (x, trie) = match byte_type {
                NodeType::Ascii => (0, trie),
                NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie),
                NodeType::Branch => read_varint_meta2(*b, trie),
            };
            if matches!(byte_type, NodeType::Span) {
                // Append the x span bytes to the key and keep walking.
                (span, trie) = trie.debug_split_at(x);
                string.extend(span);
                continue;
            }
            if matches!(byte_type, NodeType::Value) {
                let retval = string.clone();
                // Return to this position on the next step
                self.state.push((trie, string, 0));
                return Some((retval, x));
            }
            // Match node
            let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) };
            let x = if x == 0 { 256 } else { x };
            if branch_idx + 1 < x {
                // Return to this branch node at the next index
                self.state
                    .push((return_trie, string.clone(), branch_idx + 1));
            }
            // Pick the byte for the current edge out of the search table.
            let byte = if x < 16 || !self.use_phf {
                // binary search
                (search, trie) = trie.debug_split_at(x);
                debug_unwrap!(search.get(branch_idx), return None)
            } else {
                // phf: keys live after the 1 + x parameter bytes.
                (search, trie) = trie.debug_split_at(x * 2 + 1);
                debug_unwrap!(search.get(branch_idx + x + 1), return None)
            };
            string.push(*byte);
            // Descend into the branch_idx-th sub-trie.
            trie = if w == 0 {
                get_branch_w0(trie, branch_idx, x)
            } else {
                get_branch(trie, branch_idx, x, w)
            };
            branch_idx = 0;
        }
    }
}
/// Returns an iterator over the trie's contents, handling PHF branch nodes
/// if present.
#[cfg(feature = "alloc")]
pub(crate) fn get_iter_phf<S: AsRef<[u8]> + ?Sized>(store: &S) -> ZeroTrieIterator<'_> {
    ZeroTrieIterator::new(store, true)
}
/// Returns an iterator over the trie's contents with keys converted to
/// `String`s, assuming binary-search-only branches.
///
/// # Panics
/// Panics if the trie contains non-ASCII items.
#[cfg(feature = "alloc")]
#[expect(clippy::type_complexity)]
pub(crate) fn get_iter_ascii_or_panic<S: AsRef<[u8]> + ?Sized>(
    store: &S,
) -> core::iter::Map<ZeroTrieIterator<'_>, fn((Vec<u8>, usize)) -> (String, usize)> {
    ZeroTrieIterator::new(store, false).map(|(key_bytes, value)| {
        #[expect(clippy::unwrap_used)] // in signature of function
        let key_str = String::from_utf8(key_bytes).unwrap();
        (key_str, value)
    })
}

644
vendor/zerotrie/src/serde.rs vendored Normal file
View File

@@ -0,0 +1,644 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::builder::bytestr::ByteStr;
use crate::options::ZeroTrieWithOptions;
use crate::zerotrie::ZeroTrieFlavor;
use crate::ZeroAsciiIgnoreCaseTrie;
use crate::ZeroTrie;
use crate::ZeroTrieExtendedCapacity;
use crate::ZeroTriePerfectHash;
use crate::ZeroTrieSimpleAscii;
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::fmt;
use litemap::LiteMap;
use serde_core::de::Error;
use serde_core::de::Visitor;
use serde_core::Deserialize;
use serde_core::Deserializer;
use serde_core::Serialize;
use serde_core::Serializer;
/// Serde visitor that accepts borrowed bytes, a string, or a sequence of
/// `u8`, producing an owned boxed byte slice.
struct ByteStrVisitor;
impl<'de> Visitor<'de> for ByteStrVisitor {
    type Value = Box<[u8]>;
    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("a slice of borrowed bytes or a string")
    }
    fn visit_bytes<E>(self, v: &[u8]) -> Result<Self::Value, E> {
        Ok(v.into())
    }
    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
        Ok(v.as_bytes().into())
    }
    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
    where
        A: serde_core::de::SeqAccess<'de>,
    {
        // Collect the element-by-element byte sequence, pre-sizing from the
        // hint when one is available.
        let mut buffer = Vec::with_capacity(seq.size_hint().unwrap_or(0));
        while let Some(byte) = seq.next_element::<u8>()? {
            buffer.push(byte);
        }
        Ok(buffer.into_boxed_slice())
    }
}
// Zero-copy deserialization: borrows the byte slice directly from the input.
impl<'data, 'de: 'data> Deserialize<'de> for &'data ByteStr {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        let s = <&'data [u8]>::deserialize(deserializer)?;
        Ok(ByteStr::from_bytes(s))
    }
}
// Owned deserialization. Human-readable formats may present bytes, a string,
// or a sequence (handled by ByteStrVisitor); binary formats use a plain Vec.
impl<'de> Deserialize<'de> for Box<ByteStr> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let s = deserializer.deserialize_any(ByteStrVisitor)?;
            Ok(ByteStr::from_boxed_bytes(s))
        } else {
            let s = Vec::<u8>::deserialize(deserializer)?;
            Ok(ByteStr::from_boxed_bytes(s.into_boxed_slice()))
        }
    }
}
impl Serialize for &ByteStr {
    /// Human-readable formats get a string when the bytes are valid UTF-8
    /// (falling back to raw bytes otherwise); binary formats always get bytes.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let bytes = self.as_bytes();
        if !serializer.is_human_readable() {
            return serializer.serialize_bytes(bytes);
        }
        if let Ok(utf8) = core::str::from_utf8(bytes) {
            serializer.serialize_str(utf8)
        } else {
            serializer.serialize_bytes(bytes)
        }
    }
}
// Wire format for all flavors: human-readable formats round-trip through a
// LiteMap of entries; binary formats use a (flags_byte, trie_bytes) tuple,
// with the flags byte validated against the flavor's OPTIONS on read.
impl<'data, 'de: 'data, Store> Deserialize<'de> for ZeroTrieSimpleAscii<Store>
where
    // DISCUSS: There are several possibilities for the bounds here that would
    // get the job done. I could look for Deserialize, but this would require
    // creating a custom Deserializer for the map case. I also considered
    // introducing a new trait instead of relying on From.
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Rebuild the trie from its map representation.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroTrieSimpleAscii::try_from_serde_litemap(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let (flags, trie_bytes) = <(u8, &[u8])>::deserialize(deserializer)?;
            if Self::OPTIONS.to_u8_flags() != flags {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            };
            Ok(ZeroTrieSimpleAscii::from_store(Store::from(trie_bytes)))
        }
    }
}
impl<Store> Serialize for ZeroTrieSimpleAscii<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Map form is serialized directly here (no ByteStr wrapping,
            // unlike the non-ASCII flavors — presumably because keys are
            // guaranteed ASCII for this flavor).
            let lm = self.to_litemap();
            lm.serialize(serializer)
        } else {
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            (Self::FLAGS, ByteStr::from_bytes(self.as_bytes())).serialize(serializer)
        }
    }
}
// Same wire format as the other flavors: LiteMap for human-readable,
// (flags_byte, trie_bytes) tuple for binary.
impl<'de, 'data, Store> Deserialize<'de> for ZeroAsciiIgnoreCaseTrie<Store>
where
    'de: 'data,
    // DISCUSS: There are several possibilities for the bounds here that would
    // get the job done. I could look for Deserialize, but this would require
    // creating a custom Deserializer for the map case. I also considered
    // introducing a new trait instead of relying on From.
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Rebuild the trie from its map representation.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroAsciiIgnoreCaseTrie::try_from_serde_litemap(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let (flags, trie_bytes) = <(u8, &[u8])>::deserialize(deserializer)?;
            if Self::OPTIONS.to_u8_flags() != flags {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            }
            Ok(ZeroAsciiIgnoreCaseTrie::from_store(Store::from(trie_bytes)))
        }
    }
}
impl<Store> Serialize for ZeroAsciiIgnoreCaseTrie<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            let lm = self.to_litemap();
            lm.serialize(serializer)
        } else {
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            )
                .serialize(serializer)
        }
    }
}
// Same wire format as the other flavors: LiteMap for human-readable,
// (flags_byte, trie_bytes) tuple for binary.
impl<'de, 'data, Store> Deserialize<'de> for ZeroTriePerfectHash<Store>
where
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Rebuild the trie from its map representation.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroTriePerfectHash::try_from_serde_litemap(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let (flags, trie_bytes) = <(u8, &[u8])>::deserialize(deserializer)?;
            if Self::OPTIONS.to_u8_flags() != flags {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            }
            Ok(ZeroTriePerfectHash::from_store(Store::from(trie_bytes)))
        }
    }
}
impl<Store> Serialize for ZeroTriePerfectHash<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Keys may contain non-ASCII bytes, so wrap them in ByteStr to
            // get the bytes-or-string serialization behavior.
            let lm = self.to_litemap();
            let lm = lm
                .iter()
                .map(|(k, v)| (ByteStr::from_bytes(k), v))
                .collect::<LiteMap<_, _>>();
            lm.serialize(serializer)
        } else {
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            )
                .serialize(serializer)
        }
    }
}
// Same wire format as the other flavors: LiteMap for human-readable,
// (flags_byte, trie_bytes) tuple for binary.
impl<'de, 'data, Store> Deserialize<'de> for ZeroTrieExtendedCapacity<Store>
where
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Rebuild the trie from its map representation.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroTrieExtendedCapacity::try_from_serde_litemap(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let (flags, trie_bytes) = <(u8, &[u8])>::deserialize(deserializer)?;
            if Self::OPTIONS.to_u8_flags() != flags {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            }
            Ok(ZeroTrieExtendedCapacity::from_store(Store::from(
                trie_bytes,
            )))
        }
    }
}
impl<Store> Serialize for ZeroTrieExtendedCapacity<Store>
where
    Store: AsRef<[u8]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Keys may contain non-ASCII bytes, so wrap them in ByteStr to
            // get the bytes-or-string serialization behavior.
            let lm = self.to_litemap();
            let lm = lm
                .iter()
                .map(|(k, v)| (ByteStr::from_bytes(k), v))
                .collect::<LiteMap<_, _>>();
            lm.serialize(serializer)
        } else {
            // Note: `impl Serialize for ByteStr` uses `serialize_bytes`
            (
                Self::OPTIONS.to_u8_flags(),
                ByteStr::from_bytes(self.as_bytes()),
            )
                .serialize(serializer)
        }
    }
}
impl<'de, 'data, Store> Deserialize<'de> for ZeroTrie<Store>
where
    // The deserializer's input must outlive the borrowed store data.
    'de: 'data,
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
    /// Deserializes any `ZeroTrie` flavor: from a key/value map in human-readable
    /// formats, or from a leading flavor-tag byte followed by trie bytes in
    /// binary formats.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            // Human-readable path: rebuild from the map; the builder picks the
            // most appropriate flavor, then the store is converted to `Store`.
            let lm = LiteMap::<Box<ByteStr>, usize>::deserialize(deserializer)?;
            ZeroTrie::<Vec<u8>>::try_from(&lm)
                .map_err(D::Error::custom)
                .map(|trie| trie.convert_store())
        } else {
            // Binary path: the first byte is the flavor tag; the rest is the trie.
            // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes
            let bytes = <&[u8]>::deserialize(deserializer)?;
            let (tag, trie_bytes) = bytes
                .split_first()
                .ok_or(D::Error::custom("expected at least 1 byte for ZeroTrie"))?;
            let store = Store::from(trie_bytes);
            // Dispatch on the tag byte to wrap the store in the matching flavor.
            let zerotrie = if *tag == ZeroTrieSimpleAscii::<u8>::OPTIONS.to_u8_flags() {
                ZeroTrieSimpleAscii::from_store(store).into_zerotrie()
            } else if *tag == ZeroTriePerfectHash::<u8>::OPTIONS.to_u8_flags() {
                ZeroTriePerfectHash::from_store(store).into_zerotrie()
            } else if *tag == ZeroTrieExtendedCapacity::<u8>::OPTIONS.to_u8_flags() {
                ZeroTrieExtendedCapacity::from_store(store).into_zerotrie()
            } else {
                return Err(D::Error::custom("invalid ZeroTrie tag"));
            };
            Ok(zerotrie)
        }
    }
}
impl<Store> Serialize for ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Serializes any `ZeroTrie` flavor: as a key/value map for human-readable
    /// formats, or as a flavor-tag byte followed by the trie bytes for binary
    /// formats.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        if serializer.is_human_readable() {
            // Human-readable form: expand the trie into a key/value map.
            // The owned map must outlive the borrowed `ByteStr` keys below.
            let owned_map = self.to_litemap();
            let byte_str_map: LiteMap<_, _> = owned_map
                .iter()
                .map(|(key, value)| (ByteStr::from_bytes(key), value))
                .collect();
            byte_str_map.serialize(serializer)
        } else {
            // Binary form: pick the flags byte for this flavor and borrow its bytes.
            let (tag, payload) = match &self.0 {
                ZeroTrieFlavor::SimpleAscii(inner) => {
                    (ZeroTrieSimpleAscii::<u8>::OPTIONS.to_u8_flags(), inner.as_bytes())
                }
                ZeroTrieFlavor::PerfectHash(inner) => {
                    (ZeroTriePerfectHash::<u8>::OPTIONS.to_u8_flags(), inner.as_bytes())
                }
                ZeroTrieFlavor::ExtendedCapacity(inner) => {
                    (ZeroTrieExtendedCapacity::<u8>::OPTIONS.to_u8_flags(), inner.as_bytes())
                }
            };
            // Frame as a single byte buffer: tag byte followed by the trie bytes.
            let mut framed = Vec::with_capacity(payload.len() + 1);
            framed.push(tag);
            framed.extend_from_slice(payload);
            serializer.serialize_bytes(&framed)
        }
    }
}
#[cfg(test)]
// Shared fixtures (pre-built trie bytes and their expected serialized forms)
// pulled in from the crate's integration-test data file.
mod testdata {
    include!("../tests/data/data.rs");
}
#[cfg(test)]
// Round-trip serde tests over `Cow<[u8]>`-backed tries. Each test checks:
// the human-readable (JSON) form, the binary (bincode/rmp) form including the
// leading tag/length bytes, round-trip equality, and that JSON deserialization
// yields an owned store while binary deserialization borrows zero-copy.
mod tests {
    use super::*;
    use alloc::borrow::Cow;
    use serde::{Deserialize, Serialize};
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieSimpleAsciiCow<'a> {
        #[serde(borrow)]
        trie: ZeroTrieSimpleAscii<Cow<'a, [u8]>>,
    }
    #[test]
    pub fn test_serde_simpleascii_cow() {
        let trie = ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroTrieSimpleAsciiCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        let rmp_bytes = rmp_serde::to_vec(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // bincode header: flags byte (0 = SimpleAscii) + u64 LE length (26)
        assert_eq!(&bincode_bytes[0..9], &[0, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        assert_eq!(&rmp_bytes[0..5], &[145, 146, 0, 196, 26]);
        assert_eq!(&rmp_bytes[5..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieSimpleAsciiCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieSimpleAsciiCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        let rmp_recovered: ZeroTrieSimpleAsciiCow = rmp_serde::from_slice(&rmp_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert_eq!(original.trie, rmp_recovered.trie);
        // JSON rebuilds the trie (owned); bincode borrows the input buffer.
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroAsciiIgnoreCaseTrieCow<'a> {
        #[serde(borrow)]
        trie: ZeroAsciiIgnoreCaseTrie<Cow<'a, [u8]>>,
    }
    #[test]
    pub fn test_serde_asciiignorecase_cow() {
        let trie = ZeroAsciiIgnoreCaseTrie::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroAsciiIgnoreCaseTrieCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // flags byte 8 = IgnoreCase variant tag
        assert_eq!(&bincode_bytes[0..9], &[8, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroAsciiIgnoreCaseTrieCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroAsciiIgnoreCaseTrieCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTriePerfectHashCow<'a> {
        #[serde(borrow)]
        trie: ZeroTriePerfectHash<Cow<'a, [u8]>>,
    }
    #[test]
    pub fn test_serde_perfecthash_cow() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_ASCII));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        // flags byte 3 = PerfectHash variant tag
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[test]
    pub fn test_serde_perfecthash_cow_u() {
        // Same as above but with non-ASCII (Unicode) keys in the trie.
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE);
        assert_eq!(&bincode_bytes[0..9], &[3, 39, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_UNICODE);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[test]
    pub fn test_serde_perfecthash_cow_bin() {
        // Same as above but with arbitrary binary (non-UTF-8) keys.
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_BINARY));
        let original = ZeroTriePerfectHashCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_BINARY);
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_BINARY);
        let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashCow =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieAnyCow<'a> {
        #[serde(borrow)]
        trie: ZeroTrie<Cow<'a, [u8]>>,
    }
    #[test]
    pub fn test_serde_any_cow() {
        // The untyped `ZeroTrie` wrapper prepends the flavor tag to the byte buffer,
        // so the bincode length (27) includes the tag byte.
        let trie =
            ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII)).into_zerotrie();
        let original = ZeroTrieAnyCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[27, 0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
    #[test]
    pub fn test_serde_any_cow_u() {
        let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE))
            .into_zerotrie();
        let original = ZeroTrieAnyCow { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE);
        assert_eq!(&bincode_bytes[0..9], &[40, 0, 0, 0, 0, 0, 0, 0, 3]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_UNICODE);
        let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(matches!(json_recovered.trie.into_store(), Cow::Owned(_)));
        assert!(matches!(
            bincode_recovered.trie.into_store(),
            Cow::Borrowed(_)
        ));
    }
}
#[cfg(test)]
#[cfg(feature = "zerovec")]
// Same round-trip checks as `tests` above, but with `ZeroVec<u8>` stores:
// JSON deserialization must yield an owned ZeroVec, bincode a borrowed one.
mod tests_zerovec {
    use super::*;
    use serde::{Deserialize, Serialize};
    use zerovec::ZeroVec;
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTrieSimpleAsciiZeroVec<'a> {
        #[serde(borrow)]
        trie: ZeroTrieSimpleAscii<ZeroVec<'a, u8>>,
    }
    #[test]
    pub fn test_serde_simpleascii_zerovec() {
        let trie =
            ZeroTrieSimpleAscii::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII));
        let original = ZeroTrieSimpleAsciiZeroVec { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[0, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTrieSimpleAsciiZeroVec = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTrieSimpleAsciiZeroVec =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(json_recovered.trie.into_store().is_owned());
        assert!(!bincode_recovered.trie.into_store().is_owned());
    }
    #[derive(Serialize, Deserialize)]
    pub struct ZeroTriePerfectHashZeroVec<'a> {
        #[serde(borrow)]
        trie: ZeroTriePerfectHash<ZeroVec<'a, u8>>,
    }
    #[test]
    pub fn test_serde_perfecthash_zerovec() {
        let trie =
            ZeroTriePerfectHash::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII));
        let original = ZeroTriePerfectHashZeroVec { trie };
        let json_str = serde_json::to_string(&original).unwrap();
        let bincode_bytes = bincode::serialize(&original).unwrap();
        assert_eq!(json_str, testdata::basic::JSON_STR_ASCII);
        assert_eq!(&bincode_bytes[0..9], &[3, 26, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(&bincode_bytes[9..], testdata::basic::BINCODE_BYTES_ASCII);
        let json_recovered: ZeroTriePerfectHashZeroVec = serde_json::from_str(&json_str).unwrap();
        let bincode_recovered: ZeroTriePerfectHashZeroVec =
            bincode::deserialize(&bincode_bytes).unwrap();
        assert_eq!(original.trie, json_recovered.trie);
        assert_eq!(original.trie, bincode_recovered.trie);
        assert!(json_recovered.trie.into_store().is_owned());
        assert!(!bincode_recovered.trie.into_store().is_owned());
    }
}

520
vendor/zerotrie/src/varint.rs vendored Normal file
View File

@@ -0,0 +1,520 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Varint spec for ZeroTrie:
//!
//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
//! - Trail bytes: top bit is varint extender; rest are low bits of value
//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
//!
//! This is best shown by examples.
//!
//! ```txt
//! xxx0'1010 = 10
//! xxx0'1111 = 15 (largest single-byte value with M=3)
//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
//! xxx1'0000 0000'0001 = 17
//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
//! xxx1'0000 1000'0000 0000'0001 = 2065
//! ```
//!
//! The latent values by number of bytes for M=3 are:
//!
//! - 1 byte: 0
//! - 2 bytes: 16 = 0x10 = 0b10000
//! - 3 bytes: 2064 = 0x810 = 0b100000010000
//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
//! - …
//!
//! For M=2, the latent values are:
//!
//! - 1 byte: 0
//! - 2 bytes: 32 = 0x20 = 0b100000
//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
//! - …
use crate::builder::konst::ConstArrayBuilder;
#[cfg(feature = "alloc")]
use crate::builder::nonconst::TrieBuilderStore;
/// Reads a varint with 2 bits of metadata in the lead byte.
///
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
///
/// If the varint spills off the end of the slice, a debug assertion will fail,
/// and the function will return the value up to that point.
pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
    // With M=2: 2 metadata bits + 1 extender bit leaves the low 5 bits for data.
    let mut value = (start & 0b00011111) as usize;
    let mut remainder = remainder;
    // Extender bit set: one or more trail bytes follow.
    if (start & 0b00100000) != 0 {
        loop {
            let next;
            // In release builds a truncated varint breaks out with the partial value.
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
            // Note: value << 7 could drop high bits. The first addition can't overflow.
            // The second addition could overflow; in such a case we just inform the
            // developer via the debug assertion.
            // `+ 32` is the per-trail-byte latent value (see module docs) that
            // makes every value have exactly one encoding.
            value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
            // Top bit of a trail byte signals whether another trail byte follows.
            if (*next & 0b10000000) == 0 {
                break;
            }
        }
    }
    (value, remainder)
}
/// Reads a varint with 3 bits of metadata in the lead byte.
///
/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
///
/// If the varint spills off the end of the slice, a debug assertion will fail,
/// and the function will return the value up to that point.
pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
    // With M=3: 3 metadata bits + 1 extender bit leaves the low 4 bits for data.
    let mut value = (start & 0b00001111) as usize;
    let mut remainder = remainder;
    // Extender bit set: one or more trail bytes follow.
    if (start & 0b00010000) != 0 {
        loop {
            let next;
            // In release builds a truncated varint breaks out with the partial value.
            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
            // Note: value << 7 could drop high bits. The first addition can't overflow.
            // The second addition could overflow; in such a case we just inform the
            // developer via the debug assertion.
            // `+ 16` is the per-trail-byte latent value for M=3 (see module docs).
            value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
            // Top bit of a trail byte signals whether another trail byte follows.
            if (*next & 0b10000000) == 0 {
                break;
            }
        }
    }
    (value, remainder)
}
/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
///
/// Returns the varint value.
///
/// Returns `None` if the store runs out of bytes in the middle of the varint.
#[cfg(feature = "alloc")]
pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
    start: u8,
    remainder: &mut S,
) -> Option<usize> {
    // Low 4 bits of the lead byte hold the initial value (M=3 layout).
    let mut value = (start & 0b00001111) as usize;
    // Extender bit set: consume trail bytes from the front of the store.
    if (start & 0b00010000) != 0 {
        loop {
            let next = remainder.atbs_pop_front()?;
            // Note: value << 7 could drop high bits. The first addition can't overflow.
            // The second addition could overflow; in such a case we just inform the
            // developer via the debug assertion.
            // `+ 16` is the per-trail-byte latent value for M=3 (see module docs).
            value = (value << 7) + ((next & 0b01111111) as usize) + 16;
            // Top bit of a trail byte signals whether another trail byte follows.
            if (next & 0b10000000) == 0 {
                break;
            }
        }
    }
    Some(value)
}
// Largest value exercised by the round-trip tests below.
#[cfg(test)]
const MAX_VARINT: usize = usize::MAX;
// Maximum number of bytes a varint encoding of a `usize` can occupy.
// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
// Add an extra 1 since the lead byte holds only 5 bits of data.
const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
///
/// Fills the buffer from the back: the last (least significant) byte is written
/// first, subtracting the latent value (32) at each step, until the residue fits
/// in the 5 data bits of the lead byte.
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    let mut result = [0; MAX_VARINT_LENGTH];
    let mut i = MAX_VARINT_LENGTH - 1;
    let mut value = value;
    // `last` is true only while writing the final (rightmost) byte.
    let mut last = true;
    loop {
        if value < 32 {
            // Residue fits in the lead byte's 5 data bits; this byte is the lead.
            result[i] = value as u8;
            if !last {
                // Set the lead byte's extender bit since trail bytes follow.
                result[i] |= 0b00100000;
            }
            break;
        }
        // Peel off the latent value, then the low 7 bits, working backwards.
        value -= 32;
        result[i] = (value as u8) & 0b01111111;
        if !last {
            // Interior trail byte: set its continuation bit.
            result[i] |= 0b10000000;
        } else {
            last = false;
        }
        value >>= 7;
        i -= 1;
    }
    // The bytes are from i to the end.
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
}
/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
///
/// Same back-to-front algorithm as [`write_varint_meta2`], but the lead byte has
/// only 4 data bits and the per-byte latent value is 16.
#[allow(clippy::indexing_slicing)] // Okay so long as MAX_VARINT_LENGTH is correct
pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    let mut result = [0; MAX_VARINT_LENGTH];
    let mut i = MAX_VARINT_LENGTH - 1;
    let mut value = value;
    // `last` is true only while writing the final (rightmost) byte.
    let mut last = true;
    loop {
        if value < 16 {
            // Residue fits in the lead byte's 4 data bits; this byte is the lead.
            result[i] = value as u8;
            if !last {
                // Set the lead byte's extender bit since trail bytes follow.
                result[i] |= 0b00010000;
            }
            break;
        }
        // Peel off the latent value, then the low 7 bits, working backwards.
        value -= 16;
        result[i] = (value as u8) & 0b01111111;
        if !last {
            // Interior trail byte: set its continuation bit.
            result[i] |= 0b10000000;
        } else {
            last = false;
        }
        value >>= 7;
        i -= 1;
    }
    // The bytes are from i to the end.
    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
}
/// A secondary implementation that separates the latent value while computing the varint.
///
/// Test-only oracle: first computes the encoding length by walking the latent
/// values, then fills the data bits front-to-back. Used to cross-check
/// [`write_varint_meta2`].
#[cfg(test)]
pub(crate) const fn write_varint_reference(
    value: usize,
) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
    let mut result = [0; MAX_VARINT_LENGTH];
    if value < 32 {
        // Fits entirely in the lead byte's 5 data bits.
        result[0] = value as u8;
        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
    }
    // Multi-byte: set the lead byte's extender bit up front.
    result[0] = 32;
    // Find the number of bytes by advancing through the latent values
    // (32, 4128, 528416, ...) until the next one would exceed `value`.
    let mut latent = 32;
    let mut steps = 2;
    loop {
        let next_latent = (latent << 7) + 32;
        // `next_latent == latent` guards against shift saturation at usize::MAX.
        if value < next_latent || next_latent == latent {
            break;
        }
        latent = next_latent;
        steps += 1;
    }
    // Distribute the remaining bits over the bytes, last byte first.
    let mut value = value - latent;
    let mut i = steps;
    while i > 0 {
        i -= 1;
        result[i] |= (value as u8) & 0b01111111;
        value >>= 7;
        // Continuation bit on interior trail bytes only (not lead, not last).
        if i > 0 && i < steps - 1 {
            result[i] |= 0b10000000;
        }
    }
    // The bytes are from 0 to `steps`.
    ConstArrayBuilder::from_manual_slice(result, 0, steps)
}
#[cfg(test)]
// Unit tests for the varint codec: fixed test vectors, cross-checks of the two
// writers against each other, round-trips up to usize::MAX, and the latent
// values documented at the top of this module.
mod tests {
    use super::*;
    // One decode vector: input bytes, expected leftover slice, expected value.
    #[derive(Debug)]
    struct TestCase<'a> {
        bytes: &'a [u8],
        remainder: &'a [u8],
        value: usize,
    }
    // M=2 decode vectors, ordered by value, covering 1- through 5-byte encodings
    // and the boundaries around each latent value.
    static CASES: &[TestCase] = &[
        TestCase {
            bytes: &[0b00000000],
            remainder: &[],
            value: 0,
        },
        TestCase {
            bytes: &[0b00001010],
            remainder: &[],
            value: 10,
        },
        TestCase {
            bytes: &[0b00011111],
            remainder: &[],
            value: 31,
        },
        TestCase {
            bytes: &[0b00011111, 0b10101010],
            remainder: &[0b10101010],
            value: 31,
        },
        TestCase {
            bytes: &[0b00100000, 0b00000000],
            remainder: &[],
            value: 32,
        },
        TestCase {
            bytes: &[0b00100000, 0b00000001],
            remainder: &[],
            value: 33,
        },
        TestCase {
            bytes: &[0b00100000, 0b00100000],
            remainder: &[],
            value: 64,
        },
        TestCase {
            bytes: &[0x20, 0x44],
            remainder: &[],
            value: 100,
        },
        TestCase {
            bytes: &[0b00100000, 0b01111111],
            remainder: &[],
            value: 159,
        },
        TestCase {
            bytes: &[0b00100001, 0b00000000],
            remainder: &[],
            value: 160,
        },
        TestCase {
            bytes: &[0b00100001, 0b00000001],
            remainder: &[],
            value: 161,
        },
        TestCase {
            bytes: &[0x23, 0x54],
            remainder: &[],
            value: 500,
        },
        TestCase {
            bytes: &[0b00111111, 0b01111111],
            remainder: &[],
            value: 4127, // 32 + (1 << 12) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 4128, // 32 + (1 << 12)
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b00000001],
            remainder: &[],
            value: 4129, // 32 + (1 << 12) + 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b01111111],
            remainder: &[],
            value: 4255, // 32 + (1 << 12) + 127
        },
        TestCase {
            bytes: &[0b00100000, 0b10000001, 0b00000000],
            remainder: &[],
            value: 4256, // 32 + (1 << 12) + 128
        },
        TestCase {
            bytes: &[0b00100000, 0b10000001, 0b00000001],
            remainder: &[],
            value: 4257, // 32 + (1 << 12) + 129
        },
        TestCase {
            bytes: &[0x20, 0x86, 0x68],
            remainder: &[],
            value: 5000,
        },
        TestCase {
            bytes: &[0b00100000, 0b11111111, 0b01111111],
            remainder: &[],
            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
        },
        TestCase {
            bytes: &[0b00100001, 0b10000000, 0b00000000],
            remainder: &[],
            value: 20512, // 32 + (1 << 12) + (1 << 14)
        },
        TestCase {
            bytes: &[0b00111111, 0b11111111, 0b01111111],
            remainder: &[],
            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 528416, // 32 + (1 << 12) + (1 << 19)
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
            remainder: &[],
            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
        },
        TestCase {
            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
            remainder: &[],
            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
        },
        TestCase {
            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
            remainder: &[],
            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
        },
    ];
    // Decoder alone against the fixed vectors.
    #[test]
    fn test_read() {
        for cas in CASES {
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
        }
    }
    // Both writers must agree with each other and with the vectors.
    #[test]
    fn test_read_write() {
        for cas in CASES {
            let reference_bytes = write_varint_reference(cas.value);
            assert_eq!(
                reference_bytes.len(),
                cas.bytes.len() - cas.remainder.len(),
                "{cas:?}"
            );
            assert_eq!(
                reference_bytes.as_slice(),
                &cas.bytes[0..reference_bytes.len()],
                "{cas:?}"
            );
            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
            assert_eq!(recovered, (cas.value, cas.remainder), "{cas:?}");
            let write_bytes = write_varint_meta2(cas.value);
            assert_eq!(
                reference_bytes.as_slice(),
                write_bytes.as_slice(),
                "{cas:?}"
            );
        }
    }
    // Round-trip M=2 over values 0, 1, 3, 7, ... (doubling keeps the test fast
    // while hitting every encoding length).
    #[test]
    fn test_roundtrip() {
        let mut i = 0usize;
        while i < MAX_VARINT {
            let bytes = write_varint_meta2(i);
            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
            i <<= 1;
            i += 1;
        }
    }
    // Same round-trip, M=3 flavor.
    #[test]
    fn test_extended_roundtrip() {
        let mut i = 0usize;
        while i < MAX_VARINT {
            let bytes = write_varint_meta3(i);
            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
            i <<= 1;
            i += 1;
        }
    }
    // usize::MAX must need exactly MAX_VARINT_LENGTH bytes (M=2) and round-trip.
    #[test]
    fn test_max() {
        let reference_bytes = write_varint_reference(MAX_VARINT);
        let write_bytes = write_varint_meta2(MAX_VARINT);
        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
        let subarray = write_bytes
            .as_const_slice()
            .get_subslice_or_panic(1, write_bytes.len());
        let (recovered_value, remainder) = read_varint_meta2(
            *write_bytes.as_const_slice().first().unwrap(),
            subarray.as_slice(),
        );
        assert!(remainder.is_empty());
        assert_eq!(recovered_value, MAX_VARINT);
        #[cfg(target_pointer_width = "64")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00100001, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b01011111, //
            ]
        );
        #[cfg(target_pointer_width = "32")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00101111, //
                0b11011111, //
                0b11011111, //
                0b11011111, //
                0b01011111, //
            ]
        );
    }
    // Same check for the M=3 writer/reader pair.
    #[test]
    fn text_extended_max() {
        let write_bytes = write_varint_meta3(MAX_VARINT);
        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
        assert!(remainder.is_empty());
        assert_eq!(recovered_value, MAX_VARINT);
        #[cfg(target_pointer_width = "64")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00010001, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b01101111, //
            ]
        );
        #[cfg(target_pointer_width = "32")]
        assert_eq!(
            write_bytes.as_slice(),
            &[
                0b00011111, //
                0b11101111, //
                0b11101111, //
                0b11101111, //
                0b01101111, //
            ]
        );
    }
    // The latent values listed in the module docs must decode exactly.
    #[test]
    fn test_latent_values() {
        // Same values documented in the module docs: M=2
        let m2 = read_varint_meta2;
        assert_eq!(m2(0, &[]).0, 0);
        assert_eq!(m2(0x20, &[0x00]).0, 32);
        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
        // Same values documented in the module docs: M=3
        let m3 = read_varint_meta3;
        assert_eq!(m3(0, &[]).0, 0);
        assert_eq!(m3(0x10, &[0x00]).0, 16);
        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
    }
}

888
vendor/zerotrie/src/zerotrie.rs vendored Normal file
View File

@@ -0,0 +1,888 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use crate::reader;
use core::borrow::Borrow;
#[cfg(feature = "alloc")]
use crate::{
builder::bytestr::ByteStr, builder::nonconst::ZeroTrieBuilder, error::ZeroTrieBuildError,
};
#[cfg(feature = "alloc")]
use alloc::{boxed::Box, collections::BTreeMap, collections::VecDeque, string::String, vec::Vec};
#[cfg(feature = "litemap")]
use litemap::LiteMap;
/// A data structure that compactly maps from byte sequences to integers.
///
/// There are several variants of `ZeroTrie` which are very similar but are optimized
/// for different use cases:
///
/// - [`ZeroTrieSimpleAscii`] is the most compact structure. Very fast for small data.
///   Only stores ASCII-encoded strings. Can be const-constructed!
/// - [`ZeroTriePerfectHash`] is also compact, but it also supports arbitrary binary
///   strings. It also scales better to large data. Cannot be const-constructed.
/// - [`ZeroTrieExtendedCapacity`] can be used if more than 2^32 bytes are required.
///
/// You can create a `ZeroTrie` directly, in which case the most appropriate
/// backing implementation will be chosen.
///
/// # Backing Store
///
/// The data structure has a flexible backing data store. The only requirement for most
/// functionality is that it implement `AsRef<[u8]>`. All of the following are valid
/// ZeroTrie types:
///
/// - `ZeroTrie<[u8]>` (dynamically sized type: must be stored in a reference or Box)
/// - `ZeroTrie<&[u8]>` (borrows its data from a u8 buffer)
/// - `ZeroTrie<Vec<u8>>` (fully owned data)
/// - `ZeroTrie<ZeroVec<u8>>` (the recommended borrowed-or-owned signature)
/// - `Cow<ZeroTrie<[u8]>>` (another borrowed-or-owned signature)
/// - `ZeroTrie<Cow<[u8]>>` (another borrowed-or-owned signature)
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTrie;
///
/// let mut map = LiteMap::<&[u8], usize>::new_vec();
/// map.insert("foo".as_bytes(), 1);
/// map.insert("bar".as_bytes(), 2);
/// map.insert("bazzoo".as_bytes(), 3);
///
/// let trie = ZeroTrie::try_from(&map)?;
///
/// assert_eq!(trie.get("foo"), Some(1));
/// assert_eq!(trie.get("bar"), Some(2));
/// assert_eq!(trie.get("bazzoo"), Some(3));
/// assert_eq!(trie.get("unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Note: The absence of the following derive does not cause any test failures in this crate
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
// Newtype over the flavor enum so the concrete variant stays private.
pub struct ZeroTrie<Store>(pub(crate) ZeroTrieFlavor<Store>);
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Internal tagged union of the three concrete trie variants backing [`ZeroTrie`].
pub(crate) enum ZeroTrieFlavor<Store> {
    /// ASCII-only, most compact variant.
    SimpleAscii(ZeroTrieSimpleAscii<Store>),
    /// Arbitrary byte strings via a perfect hash function.
    PerfectHash(ZeroTriePerfectHash<Store>),
    /// Variant supporting data larger than 2^32 bytes.
    ExtendedCapacity(ZeroTrieExtendedCapacity<Store>),
}
/// A data structure that compactly maps from ASCII strings to integers.
///
/// For more information, see [`ZeroTrie`].
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"foo"[..], 1);
/// map.insert(b"bar", 2);
/// map.insert(b"bazzoo", 3);
///
/// let trie = ZeroTrieSimpleAscii::try_from(&map)?;
///
/// assert_eq!(trie.get(b"foo"), Some(1));
/// assert_eq!(trie.get(b"bar"), Some(2));
/// assert_eq!(trie.get(b"bazzoo"), Some(3));
/// assert_eq!(trie.get(b"unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
///
/// The trie can only store ASCII bytes; a string with non-ASCII always returns None:
///
/// ```
/// use zerotrie::ZeroTrieSimpleAscii;
///
/// // A trie with two values: "abc" and "abcdef"
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81");
///
/// assert!(trie.get(b"ab\xFF").is_none());
/// ```
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTrieSimpleAscii<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTrieSimpleAscii<Store> {
    /// Zero-cost cast from a store reference to a trie reference.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTrieSimpleAscii<Store> {
    /// Wrap this specific ZeroTrie variant into a ZeroTrie.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        ZeroTrie(ZeroTrieFlavor::SimpleAscii(self))
    }
}
/// A data structure that compactly maps from ASCII strings to integers
/// in a case-insensitive way.
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroAsciiIgnoreCaseTrie;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"foo"[..], 1);
/// map.insert(b"Bar", 2);
/// map.insert(b"Bazzoo", 3);
///
/// let trie = ZeroAsciiIgnoreCaseTrie::try_from(&map)?;
///
/// assert_eq!(trie.get(b"foo"), Some(1));
/// assert_eq!(trie.get(b"bar"), Some(2));
/// assert_eq!(trie.get(b"BAR"), Some(2));
/// assert_eq!(trie.get(b"bazzoo"), Some(3));
/// assert_eq!(trie.get(b"unknown"), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
///
/// Strings with different cases of the same character at the same offset are not allowed:
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroAsciiIgnoreCaseTrie;
///
/// let mut map = LiteMap::new_vec();
/// map.insert(&b"bar"[..], 1);
/// // OK: 'r' and 'Z' are different letters
/// map.insert(b"baZ", 2);
/// // Bad: we already inserted 'r' so we cannot also insert 'R' at the same position
/// map.insert(b"baR", 2);
///
/// ZeroAsciiIgnoreCaseTrie::try_from(&map).expect_err("mixed-case strings!");
/// ```
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroAsciiIgnoreCaseTrie<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroAsciiIgnoreCaseTrie<Store> {
    /// Zero-cost cast from a store reference to a trie reference.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
// Note: ZeroAsciiIgnoreCaseTrie is not a variant of ZeroTrie so there is no `into_zerotrie`
/// A data structure that compactly maps from byte strings to integers.
///
/// For more information, see [`ZeroTrie`].
///
/// # Examples
///
/// ```
/// use litemap::LiteMap;
/// use zerotrie::ZeroTriePerfectHash;
///
/// let mut map = LiteMap::<&[u8], usize>::new_vec();
/// map.insert("foo".as_bytes(), 1);
/// map.insert("bår".as_bytes(), 2);
/// map.insert("båzzøø".as_bytes(), 3);
///
/// let trie = ZeroTriePerfectHash::try_from(&map)?;
///
/// assert_eq!(trie.get("foo".as_bytes()), Some(1));
/// assert_eq!(trie.get("bår".as_bytes()), Some(2));
/// assert_eq!(trie.get("båzzøø".as_bytes()), Some(3));
/// assert_eq!(trie.get("bazzoo".as_bytes()), None);
///
/// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
/// ```
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTriePerfectHash<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTriePerfectHash<Store> {
    /// Zero-cost cast from a store reference to a trie reference.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTriePerfectHash<Store> {
    /// Wrap this specific ZeroTrie variant into a ZeroTrie.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        ZeroTrie(ZeroTrieFlavor::PerfectHash(self))
    }
}
/// A data structure that maps from a large number of byte strings to integers.
///
/// For more information, see [`ZeroTrie`].
#[repr(transparent)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = zerotrie))]
#[allow(clippy::exhaustive_structs)] // databake hidden fields
pub struct ZeroTrieExtendedCapacity<Store: ?Sized> {
    #[doc(hidden)] // for databake, but there are no invariants
    pub store: Store,
}
impl<Store: ?Sized> ZeroTrieExtendedCapacity<Store> {
    /// Reinterprets a reference to the store as a reference to the trie.
    ///
    /// Sound because `Self` is `repr(transparent)` over `Store`.
    fn transparent_ref_from_store(s: &Store) -> &Self {
        unsafe {
            // Safety: Self is transparent over Store
            core::mem::transmute(s)
        }
    }
}
impl<Store> ZeroTrieExtendedCapacity<Store> {
    /// Wrap this specific ZeroTrie variant into a ZeroTrie.
    #[inline]
    pub const fn into_zerotrie(self) -> ZeroTrie<Store> {
        ZeroTrie(ZeroTrieFlavor::ExtendedCapacity(self))
    }
}
/// Generates the shared inherent and trait API for one concrete `ZeroTrie` variant.
///
/// Macro arguments:
/// - `$name`: the variant type (e.g. `ZeroTrieSimpleAscii`)
/// - `$iter_element`: key type yielded by `iter()` (`String` or `Vec<u8>`)
/// - `$iter_fn`: reader function that constructs the iterator from trie bytes
/// - `$iter_ty`: the concrete iterator type returned by `iter()`
/// - `$cnv_fn`: converts `$iter_element` to `Box<[u8]>` for the byte-keyed exports
macro_rules! impl_zerotrie_subtype {
    ($name:ident, $iter_element:ty, $iter_fn:path, $iter_ty:ty, $cnv_fn:path) => {
        // Store-agnostic constructors and conversions.
        impl<Store> $name<Store> {
            /// Create a trie directly from a store.
            ///
            /// If the store does not contain valid bytes, unexpected behavior may occur.
            #[inline]
            pub const fn from_store(store: Store) -> Self {
                Self { store }
            }
            /// Takes the byte store from this trie.
            #[inline]
            pub fn into_store(self) -> Store {
                self.store
            }
            /// Converts this trie's store to a different store implementing the `From` trait.
            ///
            #[doc = concat!("For example, use this to change `", stringify!($name), "<Vec<u8>>` to `", stringify!($name), "<Cow<[u8]>>`.")]
            ///
            /// # Examples
            ///
            /// ```
            /// use std::borrow::Cow;
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: ", stringify!($name), "<Vec<u8>> = ", stringify!($name), "::from_bytes(b\"abc\\x85\").to_owned();")]
            #[doc = concat!("let cow: ", stringify!($name), "<Cow<[u8]>> = trie.convert_store();")]
            ///
            /// assert_eq!(cow.get(b"abc"), Some(5));
            /// ```
            pub fn convert_store<X: From<Store>>(self) -> $name<X> {
                $name::<X>::from_store(X::from(self.store))
            }
        }
        // Read-only accessors for any byte-slice-like store.
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Queries the trie for a string.
            // Note: We do not need the Borrow trait's guarantees, so we use
            // the more general AsRef trait.
            pub fn get<K>(&self, key: K) -> Option<usize> where K: AsRef<[u8]> {
                reader::get_parameterized::<Self>(self.store.as_ref(), key.as_ref())
            }
            /// Returns `true` if the trie is empty.
            #[inline]
            pub fn is_empty(&self) -> bool {
                self.store.as_ref().is_empty()
            }
            /// Returns the size of the trie in number of bytes.
            ///
            /// To get the number of keys in the trie, use `.iter().count()`:
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            /// // A trie with two values: "abc" and "abcdef"
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x80def\\x81\");")]
            ///
            /// assert_eq!(8, trie.byte_len());
            /// assert_eq!(2, trie.iter().count());
            /// ```
            #[inline]
            pub fn byte_len(&self) -> usize {
                self.store.as_ref().len()
            }
            /// Returns the bytes contained in the underlying store.
            #[inline]
            pub fn as_bytes(&self) -> &[u8] {
                self.store.as_ref()
            }
            /// Returns this trie as a reference transparent over a byte slice.
            #[inline]
            pub fn as_borrowed(&self) -> &$name<[u8]> {
                $name::from_bytes(self.store.as_ref())
            }
            /// Returns a trie with a store borrowing from this trie.
            #[inline]
            pub fn as_borrowed_slice(&self) -> $name<&[u8]> {
                $name::from_store(self.store.as_ref())
            }
        }
        impl<Store> AsRef<$name<[u8]>> for $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn as_ref(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        #[cfg(feature = "alloc")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Converts a possibly-borrowed $name to an owned one.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x85\");")]
            #[doc = concat!("let owned: ", stringify!($name), "<Vec<u8>> = trie.to_owned();")]
            ///
            /// assert_eq!(trie.get(b"abc"), Some(5));
            /// assert_eq!(owned.get(b"abc"), Some(5));
            /// ```
            #[inline]
            pub fn to_owned(&self) -> $name<Vec<u8>> {
                $name::from_store(
                    Vec::from(self.store.as_ref()),
                )
            }
            /// Returns an iterator over the key/value pairs in this trie.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            /// // A trie with two values: "abc" and "abcdef"
            #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x80def\\x81\");")]
            ///
            /// let mut it = trie.iter();
            /// assert_eq!(it.next(), Some(("abc".into(), 0)));
            /// assert_eq!(it.next(), Some(("abcdef".into(), 1)));
            /// assert_eq!(it.next(), None);
            /// ```
            #[inline]
            pub fn iter(&self) -> $iter_ty {
                $iter_fn(self.as_bytes())
            }
        }
        impl $name<[u8]> {
            /// Casts from a byte slice to a reference to a trie with the same lifetime.
            ///
            /// If the bytes are not a valid trie, unexpected behavior may occur.
            #[inline]
            pub fn from_bytes(trie: &[u8]) -> &Self {
                Self::transparent_ref_from_store(trie)
            }
        }
        #[cfg(feature = "alloc")]
        impl $name<Vec<u8>> {
            // Builds this variant from pre-sorted (key, value) tuples via the
            // runtime (non-const) builder, using the variant's own options.
            pub(crate) fn try_from_tuple_slice(items: &[(&ByteStr, usize)]) -> Result<Self, ZeroTrieBuildError> {
                use crate::options::ZeroTrieWithOptions;
                ZeroTrieBuilder::<VecDeque<u8>>::from_sorted_tuple_slice(
                    items,
                    Self::OPTIONS,
                )
                .map(|s| Self {
                    store: s.to_bytes(),
                })
            }
        }
        #[cfg(feature = "alloc")]
        impl<'a, K> FromIterator<(K, usize)> for $name<Vec<u8>>
        where
            K: AsRef<[u8]>
        {
            // Panics on build failure, as is conventional for FromIterator.
            fn from_iter<T: IntoIterator<Item = (K, usize)>>(iter: T) -> Self {
                use crate::options::ZeroTrieWithOptions;
                use crate::builder::nonconst::ZeroTrieBuilder;
                ZeroTrieBuilder::<VecDeque<u8>>::from_bytes_iter(
                    iter,
                    Self::OPTIONS
                )
                .map(|s| Self {
                    store: s.to_bytes(),
                })
                .unwrap()
            }
        }
        #[cfg(feature = "alloc")]
        impl<'a, K> TryFrom<&'a BTreeMap<K, usize>> for $name<Vec<u8>>
        where
            K: Borrow<[u8]>
        {
            type Error = crate::error::ZeroTrieBuildError;
            fn try_from(map: &'a BTreeMap<K, usize>) -> Result<Self, Self::Error> {
                // BTreeMap iteration is already sorted by key, as the builder requires.
                let tuples: Vec<(&[u8], usize)> = map
                    .iter()
                    .map(|(k, v)| (k.borrow(), *v))
                    .collect();
                let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples);
                Self::try_from_tuple_slice(byte_str_slice)
            }
        }
        #[cfg(feature = "alloc")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized
        {
            /// Exports the data from this ZeroTrie type into a BTreeMap.
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            /// use std::collections::BTreeMap;
            ///
            #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")]
            /// let items = trie.to_btreemap();
            ///
            /// assert_eq!(items.len(), 2);
            ///
            #[doc = concat!("let recovered_trie: ", stringify!($name), "<Vec<u8>> = items")]
            ///     .into_iter()
            ///     .collect();
            /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes());
            /// ```
            pub fn to_btreemap(&self) -> BTreeMap<$iter_element, usize> {
                self.iter().collect()
            }
            #[allow(dead_code)] // not needed for ZeroAsciiIgnoreCaseTrie
            pub(crate) fn to_btreemap_bytes(&self) -> BTreeMap<Box<[u8]>, usize> {
                self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect()
            }
        }
        #[cfg(feature = "alloc")]
        impl<Store> From<&$name<Store>> for BTreeMap<$iter_element, usize>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn from(other: &$name<Store>) -> Self {
                other.to_btreemap()
            }
        }
        #[cfg(feature = "litemap")]
        impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for $name<Vec<u8>>
        where
            K: Borrow<[u8]>,
            S: litemap::store::StoreIterable<'a, K, usize>,
        {
            type Error = crate::error::ZeroTrieBuildError;
            fn try_from(map: &'a LiteMap<K, usize, S>) -> Result<Self, Self::Error> {
                // LiteMap iteration is sorted by key, as the builder requires.
                let tuples: Vec<(&[u8], usize)> = map
                    .iter()
                    .map(|(k, v)| (k.borrow(), *v))
                    .collect();
                let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples);
                Self::try_from_tuple_slice(byte_str_slice)
            }
        }
        #[cfg(feature = "litemap")]
        impl<Store> $name<Store>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            /// Exports the data from this ZeroTrie type into a LiteMap.
            ///
            /// ✨ *Enabled with the `litemap` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            /// use litemap::LiteMap;
            ///
            #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")]
            ///
            /// let items = trie.to_litemap();
            /// assert_eq!(items.len(), 2);
            ///
            #[doc = concat!("let recovered_trie: ", stringify!($name), "<Vec<u8>> = items")]
            ///     .iter()
            ///     .map(|(k, v)| (k, *v))
            ///     .collect();
            /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes());
            /// ```
            pub fn to_litemap(&self) -> LiteMap<$iter_element, usize> {
                self.iter().collect()
            }
            #[allow(dead_code)] // not needed for ZeroAsciiIgnoreCaseTrie
            pub(crate) fn to_litemap_bytes(&self) -> LiteMap<Box<[u8]>, usize> {
                self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect()
            }
        }
        #[cfg(feature = "litemap")]
        impl<Store> From<&$name<Store>> for LiteMap<$iter_element, usize>
        where
            Store: AsRef<[u8]> + ?Sized,
        {
            #[inline]
            fn from(other: &$name<Store>) -> Self {
                other.to_litemap()
            }
        }
        #[cfg(feature = "litemap")]
        impl $name<Vec<u8>>
        {
            #[cfg(feature = "serde")]
            // Serde deserialization helper: borrow the keys, then defer to the
            // tuple-slice builder.
            pub(crate) fn try_from_serde_litemap(items: &LiteMap<Box<ByteStr>, usize>) -> Result<Self, ZeroTrieBuildError> {
                let lm_borrowed: LiteMap<&ByteStr, usize> = items.to_borrowed_keys();
                Self::try_from_tuple_slice(lm_borrowed.as_slice())
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        impl Borrow<$name<[u8]>> for $name<&[u8]> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        #[cfg(feature = "alloc")]
        impl Borrow<$name<[u8]>> for $name<Box<[u8]>> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl.
        #[cfg(feature = "alloc")]
        impl Borrow<$name<[u8]>> for $name<Vec<u8>> {
            #[inline]
            fn borrow(&self) -> &$name<[u8]> {
                self.as_borrowed()
            }
        }
        #[cfg(feature = "alloc")]
        impl alloc::borrow::ToOwned for $name<[u8]> {
            type Owned = $name<Box<[u8]>>;
            #[doc = concat!("This impl allows [`", stringify!($name), "`] to be used inside of a [`Cow`](alloc::borrow::Cow).")]
            ///
            #[doc = concat!("Note that it is also possible to use `", stringify!($name), "<ZeroVec<u8>>` for a similar result.")]
            ///
            /// ✨ *Enabled with the `alloc` Cargo feature.*
            ///
            /// # Examples
            ///
            /// ```
            /// use std::borrow::Cow;
            #[doc = concat!("use zerotrie::", stringify!($name), ";")]
            ///
            #[doc = concat!("let trie: Cow<", stringify!($name), "<[u8]>> = Cow::Borrowed(", stringify!($name), "::from_bytes(b\"abc\\x85\"));")]
            /// assert_eq!(trie.get(b"abc"), Some(5));
            /// ```
            fn to_owned(&self) -> Self::Owned {
                let bytes: &[u8] = self.store.as_ref();
                $name::from_store(
                    Vec::from(bytes).into_boxed_slice(),
                )
            }
        }
        // TODO(#2778): Auto-derive these impls based on the repr(transparent).
        //
        // Safety (based on the safety checklist on the VarULE trait):
        // 1. `$name` does not include any uninitialized or padding bytes as it is `repr(transparent)`
        // over a `VarULE` type, `Store`, as evidenced by the existence of `transparent_ref_from_store()`
        // 2. `$name` is aligned to 1 byte for the same reason
        // 3. The impl of `validate_bytes()` returns an error if any byte is not valid (passed down to `VarULE` impl of `Store`)
        // 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (passed down to `VarULE` impl of `Store`)
        // 5. The impl of `from_bytes_unchecked()` returns a reference to the same data.
        // 6. `parse_bytes()` is left to its default impl
        // 7. byte equality is semantic equality
        #[cfg(feature = "zerovec")]
        unsafe impl<Store> zerovec::ule::VarULE for $name<Store>
        where
            Store: zerovec::ule::VarULE,
        {
            #[inline]
            fn validate_bytes(bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
                Store::validate_bytes(bytes)
            }
            #[inline]
            unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
                // Safety: we can pass down the validity invariant to Store
                Self::transparent_ref_from_store(Store::from_bytes_unchecked(bytes))
            }
        }
        #[cfg(feature = "zerofrom")]
        impl<'zf, Store1, Store2> zerofrom::ZeroFrom<'zf, $name<Store1>> for $name<Store2>
        where
            Store2: zerofrom::ZeroFrom<'zf, Store1>,
        {
            #[inline]
            fn zero_from(other: &'zf $name<Store1>) -> Self {
                $name::from_store(zerofrom::ZeroFrom::zero_from(&other.store))
            }
        }
    };
}
/// Converts an owned `String` into its UTF-8 bytes as a boxed slice.
#[cfg(feature = "alloc")]
fn string_to_box_u8(input: String) -> Box<[u8]> {
    input.into_bytes().into_boxed_slice()
}
#[doc(hidden)] // subject to change
#[cfg(feature = "alloc")]
// Adapts the raw byte-key iterator from `reader` into one yielding `String` keys.
pub type ZeroTrieStringIterator<'a> =
    core::iter::Map<reader::ZeroTrieIterator<'a>, fn((Vec<u8>, usize)) -> (String, usize)>;
// Instantiate the shared API for each trie variant. The ASCII-oriented variants
// iterate over `String` keys; the byte-oriented variants iterate over `Vec<u8>`.
impl_zerotrie_subtype!(
    ZeroTrieSimpleAscii,
    String,
    reader::get_iter_ascii_or_panic,
    ZeroTrieStringIterator<'_>,
    string_to_box_u8
);
impl_zerotrie_subtype!(
    ZeroAsciiIgnoreCaseTrie,
    String,
    reader::get_iter_ascii_or_panic,
    ZeroTrieStringIterator<'_>,
    string_to_box_u8
);
impl_zerotrie_subtype!(
    ZeroTriePerfectHash,
    Vec<u8>,
    reader::get_iter_phf,
    reader::ZeroTrieIterator<'_>,
    Vec::into_boxed_slice
);
impl_zerotrie_subtype!(
    ZeroTrieExtendedCapacity,
    Vec<u8>,
    reader::get_iter_phf,
    reader::ZeroTrieIterator<'_>,
    Vec::into_boxed_slice
);
/// Dispatches a method call to whichever subtype a `ZeroTrie`'s flavor enum holds.
macro_rules! impl_dispatch {
    // Owned receiver, no arguments.
    ($self:ident, $inner_fn:ident()) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(),
        }
    };
    // Owned receiver, no arguments, re-wrapping the result into a `ZeroTrie`.
    ($self:ident, $inner_fn:ident().into_zerotrie()) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn().into_zerotrie(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn().into_zerotrie(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn().into_zerotrie(),
        }
    };
    // Borrowed receiver, no arguments.
    (&$self:ident, $inner_fn:ident()) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(),
        }
    };
    // Owned receiver, one argument.
    ($self:ident, $inner_fn:ident($arg:ident)) => {
        match $self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg),
        }
    };
    // Borrowed receiver, one argument.
    (&$self:ident, $inner_fn:ident($arg:ident)) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg),
            ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg),
        }
    };
    // Borrowed receiver, calling an associated trait function on the subtype and
    // re-wrapping the result in the same flavor.
    (&$self:ident, $trait:ident::$inner_fn:ident()) => {
        match &$self.0 {
            ZeroTrieFlavor::SimpleAscii(subtype) => {
                ZeroTrie(ZeroTrieFlavor::SimpleAscii($trait::$inner_fn(subtype)))
            }
            ZeroTrieFlavor::PerfectHash(subtype) => {
                ZeroTrie(ZeroTrieFlavor::PerfectHash($trait::$inner_fn(subtype)))
            }
            ZeroTrieFlavor::ExtendedCapacity(subtype) => {
                ZeroTrie(ZeroTrieFlavor::ExtendedCapacity($trait::$inner_fn(subtype)))
            }
        }
    };
}
impl<Store> ZeroTrie<Store> {
    // All methods delegate to the concrete flavor via `impl_dispatch!`.
    /// Takes the byte store from this trie.
    pub fn into_store(self) -> Store {
        impl_dispatch!(self, into_store())
    }
    /// Converts this trie's store to a different store implementing the `From` trait.
    ///
    /// For example, use this to change `ZeroTrie<Vec<u8>>` to `ZeroTrie<Cow<[u8]>>`.
    pub fn convert_store<NewStore>(self) -> ZeroTrie<NewStore>
    where
        NewStore: From<Store>,
    {
        // Convert the inner flavor's store, then re-wrap it in the same flavor.
        impl_dispatch!(self, convert_store().into_zerotrie())
    }
}
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Queries the trie for a string.
    pub fn get<K>(&self, key: K) -> Option<usize>
    where
        K: AsRef<[u8]>,
    {
        impl_dispatch!(&self, get(key))
    }
    /// Returns `true` if the trie is empty.
    pub fn is_empty(&self) -> bool {
        impl_dispatch!(&self, is_empty())
    }
    /// Returns the size of the trie in number of bytes.
    ///
    /// To get the number of keys in the trie, use `.iter().count()`.
    pub fn byte_len(&self) -> usize {
        impl_dispatch!(&self, byte_len())
    }
}
#[cfg(feature = "alloc")]
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Exports the data from this ZeroTrie into a BTreeMap.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    pub fn to_btreemap(&self) -> BTreeMap<Box<[u8]>, usize> {
        // Keys are exported as raw bytes regardless of the underlying flavor.
        impl_dispatch!(&self, to_btreemap_bytes())
    }
}
#[cfg(feature = "litemap")]
impl<Store> ZeroTrie<Store>
where
    Store: AsRef<[u8]>,
{
    /// Exports the data from this ZeroTrie into a LiteMap.
    ///
    /// ✨ *Enabled with the `litemap` Cargo feature.*
    pub fn to_litemap(&self) -> LiteMap<Box<[u8]>, usize> {
        // Keys are exported as raw bytes regardless of the underlying flavor.
        impl_dispatch!(&self, to_litemap_bytes())
    }
}
#[cfg(feature = "alloc")]
impl ZeroTrie<Vec<u8>> {
    /// Builds a trie from sorted (key, value) tuples, choosing a flavor:
    /// `SimpleAscii` for small all-ASCII data, otherwise `PerfectHash`.
    pub(crate) fn try_from_tuple_slice(
        items: &[(&ByteStr, usize)],
    ) -> Result<Self, ZeroTrieBuildError> {
        let is_all_ascii = items.iter().all(|(s, _)| s.is_all_ascii());
        // NOTE(review): 512 looks like a size heuristic above which PerfectHash
        // is preferred even for ASCII data — confirm against builder benchmarks.
        if is_all_ascii && items.len() < 512 {
            ZeroTrieSimpleAscii::try_from_tuple_slice(items).map(|x| x.into_zerotrie())
        } else {
            ZeroTriePerfectHash::try_from_tuple_slice(items).map(|x| x.into_zerotrie())
        }
    }
}
#[cfg(feature = "alloc")]
impl<K> FromIterator<(K, usize)> for ZeroTrie<Vec<u8>>
where
    K: AsRef<[u8]>,
{
    /// Collects (key, value) pairs into a trie; panics if building fails.
    fn from_iter<T: IntoIterator<Item = (K, usize)>>(iter: T) -> Self {
        // We need two Vecs because the first one anchors the `K`s that the second one borrows.
        let items = Vec::from_iter(iter);
        let mut items: Vec<(&[u8], usize)> = items.iter().map(|(k, v)| (k.as_ref(), *v)).collect();
        // The tuple-slice builder requires sorted input.
        items.sort();
        let byte_str_slice = ByteStr::from_byte_slice_with_value(&items);
        #[expect(clippy::unwrap_used)] // FromIterator is panicky
        Self::try_from_tuple_slice(byte_str_slice).unwrap()
    }
}
#[cfg(feature = "databake")]
impl<Store> databake::Bake for ZeroTrie<Store>
where
    Store: databake::Bake,
{
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        use databake::*;
        // Bake the inner subtype, then append `.into_zerotrie()` in the generated
        // code so the baked expression evaluates to a `ZeroTrie`.
        let inner = impl_dispatch!(&self, bake(env));
        quote! { #inner.into_zerotrie() }
    }
}
#[cfg(feature = "databake")]
impl<Store> databake::BakeSize for ZeroTrie<Store>
where
    Store: databake::BakeSize,
{
    fn borrows_size(&self) -> usize {
        // The borrowed size is that of the inner subtype's store.
        impl_dispatch!(&self, borrows_size())
    }
}
#[cfg(feature = "zerofrom")]
impl<'zf, Store1, Store2> zerofrom::ZeroFrom<'zf, ZeroTrie<Store1>> for ZeroTrie<Store2>
where
    Store2: zerofrom::ZeroFrom<'zf, Store1>,
{
    fn zero_from(other: &'zf ZeroTrie<Store1>) -> Self {
        use zerofrom::ZeroFrom;
        // Zero-copy-convert the inner store while preserving the trie flavor.
        impl_dispatch!(&other, ZeroFrom::zero_from())
    }
}

73
vendor/zerotrie/tests/asciitrie_test.rs vendored Normal file
View File

@@ -0,0 +1,73 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use postcard::ser_flavors::{AllocVec, Flavor};
use serde::Serialize;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroMap;
// Shared test data (precomputed trie bytes plus the key/value pairs they were
// built from), included textually from tests/data/data.rs.
mod testdata {
    include!("data/data.rs");
}
/// Verifies lookups against precomputed trie bytes for ASCII, Unicode, and
/// binary key sets, then compares the ASCII trie's size against a postcard
/// serialized `ZeroMap` of the same data.
#[test]
fn test_basic() {
    // Load precomputed trie bytes and the key/value data they were built from.
    let bytes_ascii = testdata::basic::TRIE_ASCII;
    let data_ascii = testdata::basic::DATA_ASCII;
    let trie_ascii = ZeroTrieSimpleAscii::from_bytes(bytes_ascii);
    let trie_phf_ascii = ZeroTriePerfectHash::from_bytes(bytes_ascii);
    let bytes_unicode = testdata::basic::TRIE_UNICODE;
    let data_unicode = testdata::basic::DATA_UNICODE;
    let trie_phf_unicode = ZeroTriePerfectHash::from_bytes(bytes_unicode);
    let bytes_binary = testdata::basic::TRIE_BINARY;
    let data_binary = testdata::basic::DATA_BINARY;
    let trie_phf_binary = ZeroTriePerfectHash::from_bytes(bytes_binary);
    // Check that the getter works.
    // Note: the panic messages name the specific trie so a failure is
    // attributable (previously they all said "trie6", a stale name).
    for (key, expected) in data_ascii {
        let actual = match trie_ascii.get(key) {
            Some(v) => v,
            None => panic!("value should be in ascii trie: {key:?} => {expected}"),
        };
        assert_eq!(*expected, actual);
        let actual = match trie_phf_ascii.get(key) {
            Some(v) => v,
            None => panic!("value should be in phf ascii trie: {key:?} => {expected}"),
        };
        assert_eq!(*expected, actual);
    }
    for (key, expected) in data_unicode {
        let actual_unicode = match trie_phf_unicode.get(key) {
            Some(v) => v,
            None => panic!("value should be in phf unicode trie: {key:?} => {expected}"),
        };
        assert_eq!(*expected, actual_unicode);
    }
    for (key, expected) in data_binary {
        let actual_binary = match trie_phf_binary.get(key) {
            Some(v) => v,
            None => panic!("value should be in phf binary trie: {key:?} => {expected}"),
        };
        assert_eq!(*expected, actual_binary);
    }
    // Compare the size to a postcard ZeroMap
    let zm: ZeroMap<[u8], u32> = data_ascii.iter().map(|(a, b)| (*a, *b as u32)).collect();
    let mut serializer = postcard::Serializer {
        output: AllocVec::new(),
    };
    Serialize::serialize(&zm, &mut serializer).unwrap();
    let zeromap_bytes = serializer
        .output
        .finalize()
        .expect("Failed to finalize serializer output");
    assert_eq!(26, bytes_ascii.len());
    assert_eq!(77, zeromap_bytes.len());
}

855
vendor/zerotrie/tests/builder_test.rs vendored Normal file
View File

@@ -0,0 +1,855 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use litemap::LiteMap;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
// Shared test data (precomputed trie bytes plus the key/value pairs they were
// built from), included textually from tests/data/data.rs.
mod testdata {
    include!("data/data.rs");
}
use testdata::strings_to_litemap;
/// Keys expected to be absent from every trie built in these tests.
const NON_EXISTENT_STRINGS: &[&str] = &[
    "a9PS", "ahsY", "ahBO", "a8IN", "xk8o", "xv1l", "xI2S", "618y", "d6My", "uszy",
];
/// Asserts that `$a` has length `$len` and equals `$b`.
///
/// Note: `$a` is evaluated twice, so pass a cheap, side-effect-free expression.
macro_rules! assert_bytes_eq {
    ($len:literal, $a:expr, $b:expr) => {
        assert_eq!($len, $a.len());
        assert_eq!($a, $b);
    };
}
/// Verifies that `trie` faithfully represents `items`: every key maps to its
/// value, known-absent keys return `None`, iteration order matches the map,
/// and the const builder produces byte-identical output.
fn check_simple_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTrieSimpleAscii<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Every entry of the map must be retrievable from the trie.
    items
        .iter()
        .for_each(|(key, value)| assert_eq!(trie.get(key), Some(*value)));
    // Keys known to be absent must not be found.
    for missing in NON_EXISTENT_STRINGS {
        assert_eq!(trie.get(missing.as_bytes()), None);
    }
    // The trie iterator must yield the same (key, value) sequence as the map.
    let expected = items
        .iter()
        .map(|(key, value)| (String::from_utf8(key.to_vec()).unwrap(), *value));
    assert!(expected.eq(trie.iter()));
    // The const builder must agree byte-for-byte with the runtime builder.
    let built = ZeroTrieSimpleAscii::try_from_litemap_with_const_builder(items).unwrap();
    assert_eq!(trie.as_bytes(), built.as_bytes());
}
/// Verifies that `trie` matches `items` for ASCII keys: lookups hit, absent
/// keys miss, and (unordered) iteration recovers the full map contents.
fn check_phf_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Every entry of the map must be retrievable from the trie.
    items
        .iter()
        .for_each(|(key, value)| assert_eq!(trie.get(key), Some(*value)));
    // Keys known to be absent must not be found.
    for missing in NON_EXISTENT_STRINGS {
        assert_eq!(trie.get(missing.as_bytes()), None);
    }
    // Iteration may be unordered, so compare as maps rather than sequences.
    let recovered: LiteMap<_, _> = trie.iter().collect();
    assert_eq!(
        items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(),
        recovered.to_borrowed_keys_values()
    );
}
/// Verifies that `trie` matches `items` for arbitrary byte keys: lookups hit,
/// known-absent keys miss, and iteration recovers the full map contents.
fn check_phf_bytes_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Every entry of the map must be retrievable from the trie.
    // (`k`/`s` names are kept because the assert messages capture them.)
    for (k, v) in items.iter() {
        assert_eq!(trie.get(k), Some(*v), "{k:?}");
    }
    // Keys known to be absent must not be found.
    for s in NON_EXISTENT_STRINGS {
        assert_eq!(trie.get(s.as_bytes()), None, "{s:?}");
    }
    // Iteration may be unordered, so compare as maps rather than sequences.
    let recovered: LiteMap<_, _> = trie.iter().collect();
    assert_eq!(
        items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(),
        recovered.to_borrowed_keys_values()
    );
}
#[test]
fn test_basic() {
    // Build maps from the three shared datasets: ASCII, Unicode, and binary keys.
    let lm1a: LiteMap<&[u8], usize> = testdata::basic::DATA_ASCII.iter().copied().collect();
    let lm1b: LiteMap<&[u8], usize> = lm1a.to_borrowed_keys();
    let lm2: LiteMap<&[u8], usize> = testdata::basic::DATA_UNICODE.iter().copied().collect();
    let lm3: LiteMap<&[u8], usize> = testdata::basic::DATA_BINARY.iter().copied().collect();
    // For the ASCII data, SimpleAscii and PerfectHash produce identical bytes.
    let expected_bytes = testdata::basic::TRIE_ASCII;
    let trie = ZeroTrieSimpleAscii::try_from(&lm1a).unwrap();
    assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
    check_simple_ascii_trie(&lm1a, &trie);
    let trie = ZeroTriePerfectHash::try_from(&lm1b).unwrap();
    assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&lm1a, &trie);
    // Unicode and binary key sets are exercised through the PerfectHash flavor.
    let expected_bytes = testdata::basic::TRIE_UNICODE;
    let trie = ZeroTriePerfectHash::try_from(&lm2).unwrap();
    assert_bytes_eq!(39, trie.as_bytes(), expected_bytes);
    check_phf_bytes_trie(&lm2, &trie);
    let expected_bytes = testdata::basic::TRIE_BINARY;
    let trie = ZeroTriePerfectHash::try_from(&lm3).unwrap();
    assert_bytes_eq!(26, trie.as_bytes(), expected_bytes);
    check_phf_bytes_trie(&lm3, &trie);
}
/// Building from an empty map must yield an empty trie with no matches.
#[test]
fn test_empty() {
    let empty_map = LiteMap::<&[u8], usize>::new_vec();
    let trie = ZeroTrieSimpleAscii::try_from(&empty_map).unwrap();
    assert!(trie.is_empty());
    assert_eq!(0, trie.byte_len());
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.as_bytes(), &[]);
}
#[test]
fn test_single_empty_value() {
    // A single entry whose key is the empty string.
    let litemap: LiteMap<&[u8], usize> = [
        (&b""[..], 10), //
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), Some(10));
    assert_eq!(trie.get(b"x"), None);
    // Expected encoding: a single value byte (high bit set, payload 10).
    let expected_bytes = &[0b10001010];
    assert_eq!(trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(1, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_single_byte_string() {
    // A single entry with a one-byte key.
    let litemap: LiteMap<&[u8], usize> = [
        (&b"x"[..], 10), //
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"xy"), None);
    check_simple_ascii_trie(&litemap, &trie);
    // Expected encoding: the key byte followed by a value byte (payload 10).
    let expected_bytes = &[b'x', 0b10001010];
    assert_bytes_eq!(2, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(2, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_single_string() {
    // A single entry with a multi-byte key; no prefix of it should match.
    let litemap: LiteMap<&[u8], usize> = [
        (&b"xyz"[..], 10), //
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"x"), None);
    assert_eq!(trie.get(b"xy"), None);
    assert_eq!(trie.get(b"xyzz"), None);
    check_simple_ascii_trie(&litemap, &trie);
    // Expected encoding: the key bytes followed by a value byte (payload 10).
    let expected_bytes = &[b'x', b'y', b'z', 0b10001010];
    assert_bytes_eq!(4, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_prefix_strings() {
    // "x" is a strict prefix of "xy"; each carries its own value byte.
    let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"xy", 1)].into_iter().collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"xyz"), None);
    check_simple_ascii_trie(&litemap, &trie);
    // Expected encoding: key byte, value 0, key byte, value 1.
    let expected_bytes = &[b'x', 0b10000000, b'y', 0b10000001];
    assert_bytes_eq!(4, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_single_byte_branch() {
    // Two sibling one-byte keys produce a branch node over 'x' and 'y'.
    let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"y", 1)].into_iter().collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"xy"), None);
    check_simple_ascii_trie(&litemap, &trie);
    // Expected encoding: branch head (0b11000010), search bytes, offset, values.
    let expected_bytes = &[0b11000010, b'x', b'y', 1, 0b10000000, 0b10000001];
    assert_bytes_eq!(6, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(6, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_multi_byte_branch() {
    // Shared prefix 'a' followed by a two-way branch on 'x'/'y', each with a
    // trailing key byte and value.
    let litemap: LiteMap<&[u8], usize> = [(&b"axb"[..], 0), (b"ayc", 1)].into_iter().collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"a"), None);
    assert_eq!(trie.get(b"ax"), None);
    assert_eq!(trie.get(b"ay"), None);
    check_simple_ascii_trie(&litemap, &trie);
    let expected_bytes = &[
        b'a', 0b11000010, b'x', b'y', 2, b'b', 0b10000000, b'c', 0b10000001,
    ];
    assert_bytes_eq!(9, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(9, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_linear_varint_values() {
    // Values 100, 500, and 5000 exercise the multi-byte (varint) value encoding
    // along a linear (non-branching) key chain.
    let litemap: LiteMap<&[u8], usize> = [(&b""[..], 100), (b"x", 500), (b"xyz", 5000)]
        .into_iter()
        .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b"xy"), None);
    assert_eq!(trie.get(b"xz"), None);
    assert_eq!(trie.get(b"xyzz"), None);
    check_simple_ascii_trie(&litemap, &trie);
    let expected_bytes = &[0x90, 0x54, b'x', 0x93, 0x64, b'y', b'z', 0x90, 0x96, 0x78];
    assert_bytes_eq!(10, trie.as_bytes(), expected_bytes);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(10, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
#[test]
fn test_bug() {
    // Regression test: nested prefix keys ("abc" < "abcd" < "abcde") combined
    // with multi-byte (varint) values.
    let litemap: LiteMap<&[u8], usize> = [(&b"abc"[..], 100), (b"abcd", 500), (b"abcde", 5000)]
        .into_iter()
        .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b"ab"), None);
    assert_eq!(trie.get(b"abd"), None);
    assert_eq!(trie.get(b"abCD"), None);
    check_simple_ascii_trie(&litemap, &trie);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    check_phf_ascii_trie(&litemap, &trie_phf);
}
// A 52-way branch (one child per ASCII letter) forces the branch head to be
// encoded as a varint (lead + trail byte), and values 16..52 to spill into
// two-byte encodings. Checks the exact serialized form of both trie flavors.
#[test]
fn test_varint_branch() {
    let chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    // Map each single-letter key to its index in `chars`.
    let litemap: LiteMap<&[u8], usize> = (0..chars.len())
        .map(|i| (chars.get(i..i + 1).unwrap().as_bytes(), i))
        .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"ax"), None);
    assert_eq!(trie.get(b"ay"), None);
    check_simple_ascii_trie(&litemap, &trie);
    #[rustfmt::skip]
    let expected_bytes = &[
        0b11100000, // branch varint lead
        0x14,       // branch varint trail
        // search array:
        b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J',
        b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T',
        b'U', b'V', b'W', b'X', b'Y', b'Z',
        b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
        b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
        b'u', b'v', b'w', b'x', b'y', b'z',
        // offset array:
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20,
        22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52,
        54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
        86,
        // single-byte values:
        0x80, (0x80 | 1), (0x80 | 2), (0x80 | 3), (0x80 | 4),
        (0x80 | 5), (0x80 | 6), (0x80 | 7), (0x80 | 8), (0x80 | 9),
        (0x80 | 10), (0x80 | 11), (0x80 | 12), (0x80 | 13), (0x80 | 14),
        (0x80 | 15),
        // multi-byte values:
        0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
        0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x90, 10, 0x90, 11,
        0x90, 12, 0x90, 13, 0x90, 14, 0x90, 15, 0x90, 16, 0x90, 17,
        0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
        0x90, 24, 0x90, 25, 0x90, 26, 0x90, 27, 0x90, 28, 0x90, 29,
        0x90, 30, 0x90, 31, 0x90, 32, 0x90, 33, 0x90, 34, 0x90, 35,
    ];
    assert_bytes_eq!(193, trie.as_bytes(), expected_bytes);
    // The perfect-hash flavor reorders the search array per its hash function
    // and prepends PHF metadata; expected layout differs accordingly.
    #[rustfmt::skip]
    let expected_bytes = &[
        0b11100000, // branch varint lead
        0x14,       // branch varint trail
        // PHF metadata:
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8,
        4, 4, 4, 16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 7,
        // search array:
        b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
        b'p', b'u', b'v', b'w', b'D', b'E', b'F', b'q',
        b'r', b'A', b'B', b'C', b'x', b'y', b'z', b's',
        b'H', b'I', b'J', b'G', b'P', b'Q', b'R', b'S',
        b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'K',
        b'L', b'M', b'N', b'O', b'g', b'a', b'b', b'c',
        b't', b'd', b'f', b'e',
        // offset array:
        2, 4, 6, 8, 10, 12, 14,
        16, 18, 20, 22, 24, 25, 26, 27,
        29, 31, 32, 33, 34, 36, 38, 40,
        42, 43, 44, 45, 46, 47, 49, 51,
        53, 55, 57, 59, 61, 63, 65, 67,
        68, 69, 70, 71, 72, 74, 76, 78,
        80, 82, 84, 86,
        // values:
        0x90, 17, 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
        0x90, 24, 0x90, 25, 0x90, 30, 0x90, 31, 0x90, 32, 0x80 | 3, 0x80 | 4,
        0x80 | 5, 0x90, 26, 0x90, 27, 0x80, 0x80 | 1, 0x80 | 2, 0x90, 33,
        0x90, 34, 0x90, 35, 0x90, 28, 0x80 | 7, 0x80 | 8, 0x80 | 9, 0x80 | 6,
        0x80 | 15, 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
        0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x80 | 10, 0x80 | 11, 0x80 | 12,
        0x80 | 13, 0x80 | 14, 0x90, 16, 0x90, 10, 0x90, 11, 0x90, 12, 0x90, 29,
        0x90, 13, 0x90, 15, 0x90, 14,
    ];
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(246, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
}
// Keys sized so that the largest branch offset stays at 255 or below,
// keeping the branch in narrow (one-byte) offset mode. Companion to
// test_at_wide, where one extra byte tips it into wide mode.
#[test]
fn test_below_wide() {
    let litemap: LiteMap<&[u8], usize> = [
        (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
        (b"bcdefghijklmnopqrstuvwxyza", 2),
        (b"cdefghijklmnopqrstuvwxyzab", 3),
        (b"defghijklmnopqrstuvwxyzabc", 4),
        (b"efghijklmnopqrstuvwxyzabcd", 5),
        (b"fghijklmnopqrstuvwxyzabcde", 6),
        (b"ghijklmnopqrstuvwxyzabcdef", 7),
        (b"hijklmnopqrstuvwxyzabcdefg", 8),
        (b"ijklmnopqrstuvwxyzabcdefgh", 9),
        (b"jklmnopqrstuvwxyzabcd", 10),
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"abc"), None);
    check_simple_ascii_trie(&litemap, &trie);
    #[rustfmt::skip]
    let expected_bytes = &[
        0b11001010, // branch
        // search array:
        b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
        // offset array:
        26, 52, 78, 104, 130, 156, 182, 208, 234,
        // offset data:
        b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
        b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
        0x81,
        b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
        b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
        0x82,
        b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
        b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
        0x83,
        b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
        b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
        0x84,
        b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
        b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
        0x85,
        b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
        b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
        0x86,
        b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
        b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
        0x87,
        b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
        b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
        0x88,
        b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
        b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
        0x89,
        b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
        b'x', b'y', b'z', b'a', b'b', b'c', b'd',
        0x8A,
    ];
    assert_bytes_eq!(275, trie.as_bytes(), expected_bytes);
}
// Identical to test_below_wide except the last key is one byte longer,
// pushing the largest branch offset past 255, so the branch must switch to
// wide (two-byte) offsets: note the extra row of leading zero high bytes.
#[test]
fn test_at_wide() {
    let litemap: LiteMap<&[u8], usize> = [
        (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
        (b"bcdefghijklmnopqrstuvwxyza", 2),
        (b"cdefghijklmnopqrstuvwxyzab", 3),
        (b"defghijklmnopqrstuvwxyzabc", 4),
        (b"efghijklmnopqrstuvwxyzabcd", 5),
        (b"fghijklmnopqrstuvwxyzabcde", 6),
        (b"ghijklmnopqrstuvwxyzabcdef", 7),
        (b"hijklmnopqrstuvwxyzabcdefg", 8),
        (b"ijklmnopqrstuvwxyzabcdefgh", 9),
        (b"jklmnopqrstuvwxyzabcde", 10),
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"abc"), None);
    check_simple_ascii_trie(&litemap, &trie);
    #[rustfmt::skip]
    let expected_bytes = &[
        0b11100001, // branch lead
        0x6A,       // branch trail
        // search array:
        b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
        // offset array (wide):
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        26, 52, 78, 104, 130, 156, 182, 208, 234,
        // offset data:
        b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
        b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
        0x81,
        b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
        b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
        0x82,
        b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
        b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
        0x83,
        b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
        b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
        0x84,
        b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
        b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
        0x85,
        b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
        b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
        0x86,
        b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
        b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
        0x87,
        b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
        b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
        0x88,
        b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
        b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
        0x89,
        b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
        b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
        0x8A,
    ];
    assert_bytes_eq!(286, trie.as_bytes(), expected_bytes);
}
// One byte longer again than test_at_wide: confirms the wide-offset encoding
// remains stable just past the narrow/wide boundary (total size 287).
#[test]
fn test_at_wide_plus() {
    let litemap: LiteMap<&[u8], usize> = [
        (&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
        (b"bcdefghijklmnopqrstuvwxyza", 2),
        (b"cdefghijklmnopqrstuvwxyzab", 3),
        (b"defghijklmnopqrstuvwxyzabc", 4),
        (b"efghijklmnopqrstuvwxyzabcd", 5),
        (b"fghijklmnopqrstuvwxyzabcde", 6),
        (b"ghijklmnopqrstuvwxyzabcdef", 7),
        (b"hijklmnopqrstuvwxyzabcdefg", 8),
        (b"ijklmnopqrstuvwxyzabcdefgh", 9),
        (b"jklmnopqrstuvwxyzabcdef", 10),
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    assert_eq!(trie.get(b""), None);
    assert_eq!(trie.get(b"abc"), None);
    check_simple_ascii_trie(&litemap, &trie);
    #[rustfmt::skip]
    let expected_bytes = &[
        0b11100001, // branch lead
        0x6A,       // branch trail
        // search array:
        b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
        // offset array (wide):
        0, 0, 0, 0, 0, 0, 0, 0, 0,
        26, 52, 78, 104, 130, 156, 182, 208, 234,
        // offset data:
        b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
        b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
        0x81,
        b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
        b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
        0x82,
        b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
        b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
        0x83,
        b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
        b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
        0x84,
        b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
        b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
        0x85,
        b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
        b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
        0x86,
        b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
        b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
        0x87,
        b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
        b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
        0x88,
        b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
        b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
        0x89,
        b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
        b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
        0x8A,
    ];
    assert_bytes_eq!(287, trie.as_bytes(), expected_bytes);
}
// Exercises every node type at once: an empty-string value, branches,
// multi-byte (varint) values, and nested branches. Also compares serialized
// sizes against zerovec's ZeroMap/ZeroHashMap as a baseline.
#[test]
fn test_everything() {
    let litemap: LiteMap<&[u8], usize> = [
        (&b""[..], 0),
        (b"axb", 100),
        (b"ayc", 2),
        (b"azd", 3),
        (b"bxe", 4),
        (b"bxefg", 500),
        (b"bxefh", 6),
        (b"bxei", 7),
        (b"bxeikl", 8),
    ]
    .into_iter()
    .collect();
    let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
    // The empty string is a valid key with value 0.
    assert_eq!(trie.get(b""), Some(0));
    assert_eq!(trie.get(b"a"), None);
    assert_eq!(trie.get(b"ax"), None);
    assert_eq!(trie.get(b"ay"), None);
    check_simple_ascii_trie(&litemap, &trie);
    let expected_bytes = &[
        0b10000000, // value 0
        0b11000010, // branch of 2
        b'a',       //
        b'b',       //
        13,         //
        0b11000011, // branch of 3
        b'x',       //
        b'y',       //
        b'z',       //
        3,          //
        5,          //
        b'b',       //
        0b10010000, // value 100 (lead)
        0x54,       // value 100 (trail)
        b'c',       //
        0b10000010, // value 2
        b'd',       //
        0b10000011, // value 3
        b'x',       //
        b'e',       //
        0b10000100, // value 4
        0b11000010, // branch of 2
        b'f',       //
        b'i',       //
        7,          //
        0b11000010, // branch of 2
        b'g',       //
        b'h',       //
        2,          //
        0b10010011, // value 500 (lead)
        0x64,       // value 500 (trail)
        0b10000110, // value 6
        0b10000111, // value 7
        b'k',       //
        b'l',       //
        0b10001000, // value 8
    ];
    assert_bytes_eq!(36, trie.as_bytes(), expected_bytes);
    // For this data the perfect-hash encoding is byte-identical to the
    // SimpleAscii one; only the annotations below differ.
    #[rustfmt::skip]
    let expected_bytes = &[
        0b10000000, // value 0
        0b11000010, // branch of 2
        b'a',       //
        b'b',       //
        13,         //
        0b11000011, // start of 'a' subtree: branch of 3
        b'x',       //
        b'y',       //
        b'z',       //
        3,          //
        5,          //
        b'b',       //
        0b10010000, // value 100 (lead)
        0x54,       // value 100 (trail)
        b'c',       //
        0b10000010, // value 2
        b'd',       //
        0b10000011, // value 3
        b'x',       // start of 'b' subtree
        b'e',       //
        0b10000100, // value 4
        0b11000010, // branch of 2
        b'f',       //
        b'i',       //
        7,          //
        0b11000010, // branch of 2
        b'g',       //
        b'h',       //
        2,          //
        0b10010011, // value 500 (lead)
        0x64,       // value 500 (trail)
        0b10000110, // value 6
        0b10000111, // value 7
        b'k',       //
        b'l',       //
        0b10001000, // value 8
    ];
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_bytes_eq!(36, trie_phf.as_bytes(), expected_bytes);
    check_phf_ascii_trie(&litemap, &trie_phf);
    // Size baselines: the same data as zerovec maps via postcard.
    let zhm: zerovec::ZeroMap<[u8], u32> = litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
    let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
    assert_eq!(zhm_buf.len(), 88);
    let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
    let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
    assert_eq!(zhm_buf.len(), 61);
    let zhm: zerovec::ZeroHashMap<[u8], u32> =
        litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
    let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
    assert_eq!(zhm_buf.len(), 161);
    let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
    let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
    assert_eq!(zhm_buf.len(), 134);
}
/// Returns byte `$i` (0-based) of the UTF-8 encoding of char `$ch`.
macro_rules! utf8_byte {
    ($ch:expr, $i:literal) => {{
        // 4 bytes is the maximum length of a UTF-8 encoded scalar value.
        let mut utf8_encoder_buf = [0u8; 4];
        $ch.encode_utf8(&mut utf8_encoder_buf);
        utf8_encoder_buf[$i]
    }};
}
// Non-ASCII keys force span nodes in the ZeroTriePerfectHash encoding:
// runs of non-ASCII UTF-8 bytes are stored as spans, and branches may pivot
// on UTF-8 continuation bytes (see the 'Κ'/'α'/'η' branch below, which
// relies on those three letters sharing the same UTF-8 lead byte).
#[test]
fn test_non_ascii() {
    let litemap: LiteMap<&[u8], usize> = [
        ("".as_bytes(), 0),
        ("axb".as_bytes(), 100),
        ("ayc".as_bytes(), 2),
        ("azd".as_bytes(), 3),
        ("bxe".as_bytes(), 4),
        ("bxefg".as_bytes(), 500),
        ("bxefh".as_bytes(), 6),
        ("bxei".as_bytes(), 7),
        ("bxeikl".as_bytes(), 8),
        ("bxeiklmΚαλημέρααα".as_bytes(), 9),
        ("bxeiklmαnλo".as_bytes(), 10),
        ("bxeiklmη".as_bytes(), 11),
    ]
    .into_iter()
    .collect();
    #[rustfmt::skip]
    let expected_bytes = &[
        0b10000000, // value 0
        0b11000010, // branch of 2
        b'a',       //
        b'b',       //
        13,         //
        0b11000011, // start of 'a' subtree: branch of 3
        b'x',       //
        b'y',       //
        b'z',       //
        3,          //
        5,          //
        b'b',       //
        0b10010000, // value 100 (lead)
        0x54,       // value 100 (trail)
        b'c',       //
        0b10000010, // value 2
        b'd',       //
        0b10000011, // value 3
        b'x',       // start of 'b' subtree
        b'e',       //
        0b10000100, // value 4
        0b11000010, // branch of 2
        b'f',       //
        b'i',       //
        7,          //
        0b11000010, // branch of 2
        b'g',       //
        b'h',       //
        2,          //
        0b10010011, // value 500 (lead)
        0x64,       // value 500 (trail)
        0b10000110, // value 6
        0b10000111, // value 7
        b'k',       //
        b'l',       //
        0b10001000, // value 8
        b'm',       //
        0b10100001, // span of length 1
        utf8_byte!('Κ', 0), // NOTE: all three letters have the same lead byte
        0b11000011, // branch of 3
        utf8_byte!('Κ', 1),
        utf8_byte!('α', 1),
        utf8_byte!('η', 1),
        21,
        27,
        0b10110000, // span of length 18 (lead)
        0b00000010, // span of length 18 (trail)
        utf8_byte!('α', 0),
        utf8_byte!('α', 1),
        utf8_byte!('λ', 0),
        utf8_byte!('λ', 1),
        utf8_byte!('η', 0),
        utf8_byte!('η', 1),
        utf8_byte!('μ', 0),
        utf8_byte!('μ', 1),
        utf8_byte!('έ', 0),
        utf8_byte!('έ', 1),
        utf8_byte!('ρ', 0),
        utf8_byte!('ρ', 1),
        utf8_byte!('α', 0),
        utf8_byte!('α', 1),
        utf8_byte!('α', 0),
        utf8_byte!('α', 1),
        utf8_byte!('α', 0),
        utf8_byte!('α', 1),
        0b10001001, // value 9
        b'n',
        0b10100010, // span of length 2
        utf8_byte!('λ', 0),
        utf8_byte!('λ', 1),
        b'o',
        0b10001010, // value 10
        0b10001011, // value 11
    ];
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap();
    assert_bytes_eq!(73, trie_phf.as_bytes(), expected_bytes);
    check_phf_bytes_trie(&litemap, &trie_phf);
}
#[test]
fn test_max_branch() {
    // Exercise the widest possible branch node: one child per distinct byte
    // value (0..=255), plus the same 256 bytes again behind a NUL prefix.
    let singles: Vec<u8> = (u8::MIN..=u8::MAX).collect();
    assert_eq!(singles.len(), 256);
    let prefixed: Vec<[u8; 2]> = (u8::MIN..=u8::MAX).map(|x| [b'\0', x]).collect();
    let mut litemap: LiteMap<&[u8], usize> = LiteMap::new_vec();
    for byte in &singles {
        litemap.insert(core::slice::from_ref(byte), usize::from(*byte));
    }
    for pair in &prefixed {
        litemap.insert(pair, usize::from(pair[1]));
    }
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap();
    // Pin the serialized size to catch accidental encoding changes.
    assert_eq!(trie_phf.byte_len(), 3042);
    check_phf_bytes_trie(&litemap, &trie_phf);
}
#[test]
fn test_short_subtags_10pct() {
    // Compare serialized sizes of the two trie flavors against several
    // zerovec map encodings for a 10% sample of the short-subtag data.
    let litemap = strings_to_litemap(testdata::short_subtags_10pct::STRINGS);
    let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
    assert_eq!(trie.byte_len(), 1050);
    check_simple_ascii_trie(&litemap, &trie);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_eq!(trie_phf.byte_len(), 1100);
    check_phf_ascii_trie(&litemap, &trie_phf);
    // Baseline: ZeroMap with u32 and u8 values, serialized via postcard.
    let zm_u32: zerovec::ZeroMap<[u8], u32> =
        litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
    assert_eq!(postcard::to_allocvec(&zm_u32).unwrap().len(), 1890);
    let zm_u8: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
    assert_eq!(postcard::to_allocvec(&zm_u8).unwrap().len(), 1326);
    // Baseline: ZeroHashMap with u32 and u8 values.
    let zhm_u32: zerovec::ZeroHashMap<[u8], u32> =
        litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
    assert_eq!(postcard::to_allocvec(&zhm_u32).unwrap().len(), 3396);
    let zhm_u8: zerovec::ZeroHashMap<[u8], u8> =
        litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
    assert_eq!(postcard::to_allocvec(&zhm_u8).unwrap().len(), 2832);
}
#[test]
fn test_short_subtags() {
    // Same size comparison as test_short_subtags_10pct, but over the full
    // short-subtag data set.
    let litemap = strings_to_litemap(testdata::short_subtags::STRINGS);
    let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
    assert_eq!(trie.byte_len(), 8793);
    check_simple_ascii_trie(&litemap, &trie);
    let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
    let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
    assert_eq!(trie_phf.byte_len(), 9400);
    check_phf_ascii_trie(&litemap, &trie_phf);
    // Baseline: ZeroMap with u32 and u8 values, serialized via postcard.
    let zm_u32: zerovec::ZeroMap<[u8], u32> =
        litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
    assert_eq!(postcard::to_allocvec(&zm_u32).unwrap().len(), 18931);
    let zm_u8: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
    assert_eq!(postcard::to_allocvec(&zm_u8).unwrap().len(), 13300);
    // Baseline: ZeroHashMap with u32 and u8 values.
    let zhm_u32: zerovec::ZeroHashMap<[u8], u32> =
        litemap.iter().map(|(k, v)| (*k, *v as u32)).collect();
    assert_eq!(postcard::to_allocvec(&zhm_u32).unwrap().len(), 33949);
    let zhm_u8: zerovec::ZeroHashMap<[u8], u8> =
        litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
    assert_eq!(postcard::to_allocvec(&zhm_u8).unwrap().len(), 28318);
}

2210
vendor/zerotrie/tests/data/data.rs vendored Normal file

File diff suppressed because one or more lines are too long

138
vendor/zerotrie/tests/derive_test.rs vendored Normal file
View File

@@ -0,0 +1,138 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
#![allow(non_camel_case_types, non_snake_case)]
use zerotrie::ZeroAsciiIgnoreCaseTrie;
use zerotrie::ZeroTrie;
use zerotrie::ZeroTrieExtendedCapacity;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroVec;
// Wrapper struct holding a `ZeroTrie` over a `ZeroVec` store; exists solely
// to verify that the optional derive integrations (yoke, zerofrom, serde,
// databake) compile against the trie types.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
#[cfg_attr(feature = "zerofrom", derive(zerofrom::ZeroFrom))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = crate))]
struct DeriveTest_ZeroTrie_ZeroVec<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub _data: ZeroTrie<ZeroVec<'data, u8>>,
}
// Verifies the databake `Bake` impl via `test_bake!` (bakes the value and
// checks the emitted tokens round-trip). `ZeroTrie` has no literal
// constructor, so the baked expression goes through `.into_zerotrie()`.
#[test]
#[cfg(all(feature = "databake", feature = "alloc"))]
fn bake_ZeroTrie_ZeroVec() {
    use databake::*;
    extern crate std;
    test_bake!(
        DeriveTest_ZeroTrie_ZeroVec<'static>,
        crate::DeriveTest_ZeroTrie_ZeroVec {
            _data: zerotrie::ZeroTrieSimpleAscii {
                store: zerovec::ZeroVec::new(),
            }
            .into_zerotrie()
        },
    );
}
// Same derive-compilation check as above, for `ZeroTrieSimpleAscii`.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
#[cfg_attr(feature = "zerofrom", derive(zerofrom::ZeroFrom))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = crate))]
struct DeriveTest_ZeroTrieSimpleAscii_ZeroVec<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub _data: ZeroTrieSimpleAscii<ZeroVec<'data, u8>>,
}
// Verifies the databake `Bake` impl for `ZeroTrieSimpleAscii`.
#[test]
#[cfg(all(feature = "databake", feature = "alloc"))]
fn bake_ZeroTrieSimpleAscii_ZeroVec() {
    use databake::*;
    extern crate std;
    test_bake!(
        DeriveTest_ZeroTrieSimpleAscii_ZeroVec<'static>,
        crate::DeriveTest_ZeroTrieSimpleAscii_ZeroVec {
            _data: zerotrie::ZeroTrieSimpleAscii {
                store: zerovec::ZeroVec::new(),
            }
        },
    );
}
// Same derive-compilation check, for `ZeroAsciiIgnoreCaseTrie`.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
#[cfg_attr(feature = "zerofrom", derive(zerofrom::ZeroFrom))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = crate))]
struct DeriveTest_ZeroAsciiIgnoreCaseTrie_ZeroVec<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub _data: ZeroAsciiIgnoreCaseTrie<ZeroVec<'data, u8>>,
}
// Verifies the databake `Bake` impl for `ZeroAsciiIgnoreCaseTrie`.
#[test]
#[cfg(all(feature = "databake", feature = "alloc"))]
fn bake_ZeroAsciiIgnoreCaseTrie_ZeroVec() {
    use databake::*;
    extern crate std;
    test_bake!(
        DeriveTest_ZeroAsciiIgnoreCaseTrie_ZeroVec<'static>,
        crate::DeriveTest_ZeroAsciiIgnoreCaseTrie_ZeroVec {
            _data: zerotrie::ZeroAsciiIgnoreCaseTrie {
                store: zerovec::ZeroVec::new(),
            }
        },
    );
}
// Same derive-compilation check, for `ZeroTriePerfectHash`.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
#[cfg_attr(feature = "zerofrom", derive(zerofrom::ZeroFrom))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = crate))]
struct DeriveTest_ZeroTriePerfectHash_ZeroVec<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub _data: ZeroTriePerfectHash<ZeroVec<'data, u8>>,
}
// Verifies the databake `Bake` impl for `ZeroTriePerfectHash`.
#[test]
#[cfg(all(feature = "databake", feature = "alloc"))]
fn bake_ZeroTriePerfectHash_ZeroVec() {
    use databake::*;
    extern crate std;
    test_bake!(
        DeriveTest_ZeroTriePerfectHash_ZeroVec<'static>,
        crate::DeriveTest_ZeroTriePerfectHash_ZeroVec {
            _data: zerotrie::ZeroTriePerfectHash {
                store: zerovec::ZeroVec::new(),
            }
        },
    );
}
// Same derive-compilation check, for `ZeroTrieExtendedCapacity`.
#[cfg_attr(feature = "yoke", derive(yoke::Yokeable))]
#[cfg_attr(feature = "zerofrom", derive(zerofrom::ZeroFrom))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[cfg_attr(feature = "databake", derive(databake::Bake))]
#[cfg_attr(feature = "databake", databake(path = crate))]
struct DeriveTest_ZeroTrieExtendedCapacity_ZeroVec<'data> {
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub _data: ZeroTrieExtendedCapacity<ZeroVec<'data, u8>>,
}
// Verifies the databake `Bake` impl for `ZeroTrieExtendedCapacity`.
#[test]
#[cfg(all(feature = "databake", feature = "alloc"))]
fn bake_ZeroTrieExtendedCapacity_ZeroVec() {
    use databake::*;
    extern crate std;
    test_bake!(
        DeriveTest_ZeroTrieExtendedCapacity_ZeroVec<'static>,
        crate::DeriveTest_ZeroTrieExtendedCapacity_ZeroVec {
            _data: zerotrie::ZeroTrieExtendedCapacity {
                store: zerovec::ZeroVec::new(),
            }
        },
    );
}

View File

@@ -0,0 +1,46 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use zerotrie::ZeroAsciiIgnoreCaseTrie;
mod testdata {
include!("data/data.rs");
}
use testdata::strings_to_litemap;
#[test]
fn test_ignore_case_coverage() {
    let litemap = strings_to_litemap(&["", "aBc", "aBcD", "aBce", "aBcF", "aBcghi"]);
    // Both construction paths must accept this case-unambiguous key set.
    ZeroAsciiIgnoreCaseTrie::try_from(&litemap).unwrap();
    let trie: ZeroAsciiIgnoreCaseTrie<Vec<u8>> = litemap.iter().map(|(k, v)| (*k, *v)).collect();
    // Every key must be found as stored, fully uppercased, and fully lowercased.
    for (k, v) in litemap.iter() {
        assert_eq!(trie.get(k), Some(*v), "normal: {k:?}");
        let k_upper: Vec<u8> = k.iter().map(u8::to_ascii_uppercase).collect();
        assert_eq!(trie.get(k_upper), Some(*v), "upper: {k:?}");
        let k_lower: Vec<u8> = k.iter().map(u8::to_ascii_lowercase).collect();
        assert_eq!(trie.get(k_lower), Some(*v), "lower: {k:?}");
    }
    // Adding a key that collides case-insensitively with an existing key
    // must make construction fail.
    for problematic_str in &["A", "ab", "abc", "aBcd", "aBcgHi"] {
        let mut augmented = litemap.clone();
        augmented.insert(problematic_str.as_bytes(), 100);
        ZeroAsciiIgnoreCaseTrie::try_from(&augmented).expect_err(problematic_str);
    }
}

170
vendor/zerotrie/tests/locale_aux_test.rs vendored Normal file
View File

@@ -0,0 +1,170 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use icu_locale_core::extensions::private::Private;
use icu_locale_core::Locale;
use litemap::LiteMap;
use std::collections::BTreeSet;
use writeable::Writeable;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::VarZeroVec;
mod testdata {
include!("data/data.rs");
}
use testdata::locales_with_aux::{NUM_UNIQUE_BLOBS, STRINGS};
use testdata::strings_to_litemap;
// Size comparison for locale+aux-key strings stored in one combined data
// structure: VarZeroVec vs. both trie flavors vs. raw string bytes.
// Pinned sizes are pointer-width-dependent, hence the 64-bit cfg gate.
#[test]
#[cfg(target_pointer_width = "64")]
fn test_combined() {
    let litemap = strings_to_litemap(STRINGS);
    let vzv: VarZeroVec<str> = STRINGS.into();
    // Lookup table size:
    assert_eq!(vzv.as_bytes().len(), 10219);
    // Size including pointer array:
    assert_eq!(
        vzv.as_bytes().len() + STRINGS.len() * core::mem::size_of::<usize>(),
        18635
    );
    let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
    // Lookup table size:
    assert_eq!(trie.byte_len(), 5104);
    // Size including pointer array:
    assert_eq!(
        trie.byte_len() + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        8392
    );
    let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap();
    // Lookup table size:
    assert_eq!(trie.byte_len(), 5157);
    // Size including pointer array:
    assert_eq!(
        trie.byte_len() + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        8445
    );
    // Raw-bytes baseline: total length of all keys with no structure.
    let total_str_len = litemap.keys().map(|k| k.len()).sum::<usize>();
    assert_eq!(total_str_len, 8115);
    // Lookup table size:
    assert_eq!(
        total_str_len + STRINGS.len() * core::mem::size_of::<usize>(),
        16531
    );
    // Size including pointer array: (2x for the lookup array and value array)
    assert_eq!(
        total_str_len + 2 * STRINGS.len() * core::mem::size_of::<usize>(),
        24947
    );
    // Size including u16 pointer array:
    assert_eq!(
        total_str_len
            + STRINGS.len() * core::mem::size_of::<usize>()
            + STRINGS.len() * core::mem::size_of::<u16>()
            + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        21923
    );
}
// Alternative layout experiment: split the data per aux (private-use) key,
// building one small lookup structure per aux key, and compare total sizes
// against the combined layout measured in test_combined.
#[test]
#[cfg(target_pointer_width = "64")]
fn test_aux_split() {
    let locales: Vec<Locale> = STRINGS.iter().map(|s| s.parse().unwrap()).collect();
    // Group by the private-use extension (the "aux key").
    let aux_keys: BTreeSet<&Private> = locales.iter().map(|l| &l.extensions.private).collect();
    assert_eq!(aux_keys.len(), 6);
    let mut cumulative_index = 0;
    let mut total_simpleascii_len = 0;
    let mut total_perfecthash_len = 0;
    let mut total_vzv_len = 0;
    let mut unique_locales = BTreeSet::new();
    for private in aux_keys.iter() {
        // All locales carrying this aux key, with the aux key stripped.
        let current_locales: Vec<Locale> = locales
            .iter()
            .filter(|l| l.extensions.private == **private)
            .map(|l| {
                let mut l = l.clone();
                l.extensions.private = Private::default();
                l
            })
            .collect();
        // Assign each entry a globally increasing value via cumulative_index.
        let litemap: LiteMap<Vec<u8>, usize> = current_locales
            .iter()
            .map(|l| {
                (l.write_to_string().into_owned().into_bytes(), {
                    cumulative_index += 1;
                    cumulative_index - 1
                })
            })
            .collect();
        let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
        total_simpleascii_len += trie.byte_len();
        let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap();
        total_perfecthash_len += trie.byte_len();
        for k in litemap.keys() {
            unique_locales.insert(k.clone());
        }
        let strs: Vec<String> = current_locales
            .iter()
            .map(|l| l.write_to_string().into_owned())
            .collect();
        let vzv: VarZeroVec<str> = strs.as_slice().into();
        total_vzv_len += vzv.as_bytes().len();
    }
    // Every locale must have been assigned exactly one value.
    assert_eq!(cumulative_index, locales.len());
    assert_eq!(total_simpleascii_len, 5098);
    assert_eq!(total_perfecthash_len, 5302);
    assert_eq!(total_vzv_len, 5486);
    let total_unique_locale_str_len = unique_locales.iter().map(|v| v.len()).sum::<usize>();
    assert_eq!(total_unique_locale_str_len, 945);
    // Size including pointer array:
    assert_eq!(
        total_simpleascii_len + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        8386
    );
    assert_eq!(
        total_perfecthash_len + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        8590
    );
    assert_eq!(
        total_vzv_len + STRINGS.len() * core::mem::size_of::<usize>(),
        13902
    );
    // 2x for the lookup arrays and value arrays
    assert_eq!(
        total_unique_locale_str_len + 2 * STRINGS.len() * core::mem::size_of::<usize>(),
        17777
    );
    // Size including u16 pointer array:
    assert_eq!(
        total_unique_locale_str_len
            + STRINGS.len() * core::mem::size_of::<usize>()
            + STRINGS.len() * core::mem::size_of::<u16>()
            + NUM_UNIQUE_BLOBS * core::mem::size_of::<usize>(),
        14753
    );
}