Files
cli/vendor/zerotrie/tests/builder_test.rs

856 lines
32 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use litemap::LiteMap;
use zerotrie::ZeroTriePerfectHash;
use zerotrie::ZeroTrieSimpleAscii;
// Shared fixtures, textually included from `data/data.rs`. The tests below use
// `testdata::basic`, `testdata::short_subtags_10pct`, `testdata::short_subtags`
// and the `strings_to_litemap` helper from it.
mod testdata {
include!("data/data.rs");
}
use testdata::strings_to_litemap;
// Keys that the `check_*` helpers look up and expect to be absent (`get` must
// return `None`) from every trie they verify.
const NON_EXISTENT_STRINGS: &[&str] = &[
"a9PS", "ahsY", "ahBO", "a8IN", "xk8o", "xv1l", "xI2S", "618y", "d6My", "uszy",
];
/// Asserts that `$actual` has exactly `$expected_len` elements and that it
/// compares equal to `$expected`.
///
/// The length is checked first so that a size regression is reported on its
/// own before the full contents are compared.
macro_rules! assert_bytes_eq {
    ($expected_len:literal, $actual:expr, $expected:expr) => {
        assert_eq!($expected_len, $actual.len());
        assert_eq!($actual, $expected);
    };
}
/// Verifies a `ZeroTrieSimpleAscii` against the `LiteMap` it was built from.
///
/// Panics if a key of `items` is missing or maps to another value, if any of
/// `NON_EXISTENT_STRINGS` is found, if iterating the trie does not yield the
/// entries of `items` in the same order, or if the const builder produces
/// different bytes for the same input.
fn check_simple_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTrieSimpleAscii<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Every entry of the source map must be retrievable.
    for (key, expected) in items.iter() {
        assert_eq!(trie.get(key), Some(*expected));
    }

    // Keys that were never inserted must miss.
    for &absent in NON_EXISTENT_STRINGS {
        assert_eq!(trie.get(absent.as_bytes()), None);
    }

    // Iteration must yield the LiteMap's entries, in the LiteMap's order.
    assert!(items
        .iter()
        .map(|(s, v)| (String::from_utf8(s.to_vec()).unwrap(), *v))
        .eq(trie.iter()));

    // The const builder must serialize to the very same bytes.
    let rebuilt = ZeroTrieSimpleAscii::try_from_litemap_with_const_builder(items).unwrap();
    assert_eq!(trie.as_bytes(), rebuilt.as_bytes());
}
/// Asserts that a `ZeroTriePerfectHash` built from ASCII keys agrees with `items`.
///
/// Three properties are checked:
/// 1. every key of `items` resolves to its value through `trie.get`,
/// 2. none of `NON_EXISTENT_STRINGS` resolves to a value,
/// 3. collecting `trie.iter()` into a `LiteMap` reproduces the contents of `items`.
///
/// The lookup assertions report the offending key on failure, in the same way
/// as `check_phf_bytes_trie`.
///
/// # Panics
///
/// Panics (failing the calling test) if any of the checks does not hold.
fn check_phf_ascii_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Check that each item is in the trie
    for (k, v) in items.iter() {
        assert_eq!(trie.get(k), Some(*v), "{k:?}");
    }
    // Check that some items are not in the trie
    for s in NON_EXISTENT_STRINGS.iter() {
        assert_eq!(trie.get(s.as_bytes()), None, "{s:?}");
    }
    // Check that the iterator returns the contents of the LiteMap.
    // The items might not come back in order, so they are collected into a new
    // LiteMap before comparing.
    let recovered_items: LiteMap<_, _> = trie.iter().collect();
    assert_eq!(
        items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(),
        recovered_items.to_borrowed_keys_values()
    );
}
/// Verifies a `ZeroTriePerfectHash` over arbitrary byte keys against `items`.
///
/// Panics, naming the offending key, if a key of `items` is missing or maps to
/// another value, or if any of `NON_EXISTENT_STRINGS` is found. Also panics if
/// the entries produced by `trie.iter()` differ from the contents of `items`.
fn check_phf_bytes_trie<S>(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash<S>)
where
    S: AsRef<[u8]> + ?Sized,
{
    // Present keys must hit with the stored value.
    for (k, v) in items.iter() {
        let found = trie.get(k);
        assert_eq!(found, Some(*v), "{k:?}");
    }

    // Absent keys must miss.
    for s in NON_EXISTENT_STRINGS {
        let found = trie.get(s.as_bytes());
        assert_eq!(found, None, "{s:?}");
    }

    // Iteration order might differ from the LiteMap's, so the iterator output
    // is re-collected into a LiteMap and the two maps are compared as borrowed views.
    let recovered_items: LiteMap<_, _> = trie.iter().collect();
    let expected_view = items.to_borrowed_keys_values::<[u8], usize, Vec<_>>();
    let recovered_view = recovered_items.to_borrowed_keys_values::<[u8], usize, Vec<_>>();
    assert_eq!(expected_view, recovered_view);
}
/// Builds tries from the shared `basic` fixtures and compares each one with the
/// pre-computed byte string stored alongside the data.
#[test]
fn test_basic() {
    let ascii_items: LiteMap<&[u8], usize> =
        testdata::basic::DATA_ASCII.iter().copied().collect();
    let ascii_items_borrowed: LiteMap<&[u8], usize> = ascii_items.to_borrowed_keys();
    let unicode_items: LiteMap<&[u8], usize> =
        testdata::basic::DATA_UNICODE.iter().copied().collect();
    let binary_items: LiteMap<&[u8], usize> =
        testdata::basic::DATA_BINARY.iter().copied().collect();

    // ASCII data: both trie types are compared against the same expected bytes.
    let ascii_golden = testdata::basic::TRIE_ASCII;
    let simple_trie = ZeroTrieSimpleAscii::try_from(&ascii_items).unwrap();
    assert_bytes_eq!(26, simple_trie.as_bytes(), ascii_golden);
    check_simple_ascii_trie(&ascii_items, &simple_trie);

    let phf_ascii_trie = ZeroTriePerfectHash::try_from(&ascii_items_borrowed).unwrap();
    assert_bytes_eq!(26, phf_ascii_trie.as_bytes(), ascii_golden);
    check_phf_ascii_trie(&ascii_items, &phf_ascii_trie);

    // Unicode data: perfect-hash trie only.
    let unicode_golden = testdata::basic::TRIE_UNICODE;
    let phf_unicode_trie = ZeroTriePerfectHash::try_from(&unicode_items).unwrap();
    assert_bytes_eq!(39, phf_unicode_trie.as_bytes(), unicode_golden);
    check_phf_bytes_trie(&unicode_items, &phf_unicode_trie);

    // Binary data: perfect-hash trie only.
    let binary_golden = testdata::basic::TRIE_BINARY;
    let phf_binary_trie = ZeroTriePerfectHash::try_from(&binary_items).unwrap();
    assert_bytes_eq!(26, phf_binary_trie.as_bytes(), binary_golden);
    check_phf_bytes_trie(&binary_items, &phf_binary_trie);
}
/// A trie built from an empty map has no bytes and finds nothing.
#[test]
fn test_empty() {
    let no_entries = LiteMap::<&[u8], usize>::new_vec();
    let trie = ZeroTrieSimpleAscii::try_from(&no_entries).unwrap();

    assert_eq!(trie.byte_len(), 0);
    assert!(trie.is_empty());
    assert_eq!(trie.get(b""), None);

    let no_bytes: &[u8] = &[];
    assert_eq!(trie.as_bytes(), no_bytes);
}
/// A map holding only the empty key serializes to a single byte in both trie types.
#[test]
fn test_single_empty_value() {
    let entries: LiteMap<&[u8], usize> = [(&b""[..], 10)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), Some(10));
    assert_eq!(ascii_trie.get(b"x"), None);

    let golden = &[0b10001010];
    assert_eq!(ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same byte.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(1, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// A single one-byte key: the key byte followed by one value byte.
#[test]
fn test_single_byte_string() {
    let entries: LiteMap<&[u8], usize> = [(&b"x"[..], 10)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), None);
    assert_eq!(ascii_trie.get(b"xy"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    let golden = &[b'x', 0b10001010];
    assert_bytes_eq!(2, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(2, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// A single three-byte key: proper prefixes and extensions of it must not match.
#[test]
fn test_single_string() {
    let entries: LiteMap<&[u8], usize> = [(&b"xyz"[..], 10)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), None);
    assert_eq!(ascii_trie.get(b"x"), None);
    assert_eq!(ascii_trie.get(b"xy"), None);
    assert_eq!(ascii_trie.get(b"xyzz"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    let golden = &[b'x', b'y', b'z', 0b10001010];
    assert_bytes_eq!(4, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(4, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// Two keys where one ("x") is a prefix of the other ("xy").
#[test]
fn test_prefix_strings() {
    let entries: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"xy", 1)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), None);
    assert_eq!(ascii_trie.get(b"xyz"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    let golden = &[b'x', 0b10000000, b'y', 0b10000001];
    assert_bytes_eq!(4, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(4, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// Two sibling one-byte keys ("x" and "y"), which the expected bytes encode as a branch.
#[test]
fn test_single_byte_branch() {
    let entries: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"y", 1)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), None);
    assert_eq!(ascii_trie.get(b"xy"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    let golden = &[0b11000010, b'x', b'y', 1, 0b10000000, 0b10000001];
    assert_bytes_eq!(6, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(6, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// Two three-byte keys sharing the first byte and diverging at the second.
#[test]
fn test_multi_byte_branch() {
    let entries: LiteMap<&[u8], usize> =
        [(&b"axb"[..], 0), (b"ayc", 1)].into_iter().collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b""), None);
    assert_eq!(ascii_trie.get(b"a"), None);
    assert_eq!(ascii_trie.get(b"ax"), None);
    assert_eq!(ascii_trie.get(b"ay"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    #[rustfmt::skip]
    let golden = &[
        b'a',
        0b11000010, b'x', b'y', 2,
        b'b', 0b10000000,
        b'c', 0b10000001,
    ];
    assert_bytes_eq!(9, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(9, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// A chain of nested keys ("", "x", "xyz") whose values (100, 500, 5000) each
/// take more than one byte in the expected serialization.
#[test]
fn test_linear_varint_values() {
    let entries: LiteMap<&[u8], usize> = [(&b""[..], 100), (b"x", 500), (b"xyz", 5000)]
        .into_iter()
        .collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b"xy"), None);
    assert_eq!(ascii_trie.get(b"xz"), None);
    assert_eq!(ascii_trie.get(b"xyzz"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    #[rustfmt::skip]
    let golden = &[
        0x90, 0x54,
        b'x', 0x93, 0x64,
        b'y', b'z', 0x90, 0x96, 0x78,
    ];
    assert_bytes_eq!(10, ascii_trie.as_bytes(), golden);

    // The perfect-hash trie must produce the same bytes.
    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    assert_bytes_eq!(10, phf_trie.as_bytes(), golden);
    check_phf_ascii_trie(&entries, &phf_trie);
}
/// Three keys that each extend the previous one by a byte ("abc", "abcd",
/// "abcde"). Only lookups and iteration are verified; no serialized bytes are pinned.
#[test]
fn test_bug() {
    let entries: LiteMap<&[u8], usize> =
        [(&b"abc"[..], 100), (b"abcd", 500), (b"abcde", 5000)]
            .into_iter()
            .collect();

    let sliced = entries.as_sliced();
    let ascii_trie = ZeroTrieSimpleAscii::try_from(&sliced).unwrap();
    assert_eq!(ascii_trie.get(b"ab"), None);
    assert_eq!(ascii_trie.get(b"abd"), None);
    assert_eq!(ascii_trie.get(b"abCD"), None);
    check_simple_ascii_trie(&entries, &ascii_trie);

    let byte_keyed = entries.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&byte_keyed).unwrap();
    check_phf_ascii_trie(&entries, &phf_trie);
}
// Builds tries from 52 single-byte keys ("A".."Z" followed by "a".."z"), each
// mapped to its index in `chars`, and pins the exact serialized bytes of both
// trie types.
#[test]
fn test_varint_branch() {
let chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
// `chars` is pure ASCII, so every one-byte range is a valid string slice and
// the `unwrap` below cannot fail.
let litemap: LiteMap<&[u8], usize> = (0..chars.len())
.map(|i| (chars.get(i..i + 1).unwrap().as_bytes(), i))
.collect();
let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
assert_eq!(trie.get(b""), None);
assert_eq!(trie.get(b"ax"), None);
assert_eq!(trie.get(b"ay"), None);
check_simple_ascii_trie(&litemap, &trie);
// Expected ZeroTrieSimpleAscii serialization (193 bytes).
#[rustfmt::skip]
let expected_bytes = &[
0b11100000, // branch varint lead
0x14, // branch varint trail
// search array:
b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J',
b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T',
b'U', b'V', b'W', b'X', b'Y', b'Z',
b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
b'u', b'v', b'w', b'x', b'y', b'z',
// offset array:
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20,
22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52,
54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84,
86,
// single-byte values:
0x80, (0x80 | 1), (0x80 | 2), (0x80 | 3), (0x80 | 4),
(0x80 | 5), (0x80 | 6), (0x80 | 7), (0x80 | 8), (0x80 | 9),
(0x80 | 10), (0x80 | 11), (0x80 | 12), (0x80 | 13), (0x80 | 14),
(0x80 | 15),
// multi-byte values:
0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x90, 10, 0x90, 11,
0x90, 12, 0x90, 13, 0x90, 14, 0x90, 15, 0x90, 16, 0x90, 17,
0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
0x90, 24, 0x90, 25, 0x90, 26, 0x90, 27, 0x90, 28, 0x90, 29,
0x90, 30, 0x90, 31, 0x90, 32, 0x90, 33, 0x90, 34, 0x90, 35,
];
assert_bytes_eq!(193, trie.as_bytes(), expected_bytes);
// Expected ZeroTriePerfectHash serialization (246 bytes); this binding shadows
// the table above. Here the search array is not in alphabetical order and is
// preceded by a block of PHF metadata.
#[rustfmt::skip]
let expected_bytes = &[
0b11100000, // branch varint lead
0x14, // branch varint trail
// PHF metadata:
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8,
4, 4, 4, 16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 7,
// search array:
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'u', b'v', b'w', b'D', b'E', b'F', b'q',
b'r', b'A', b'B', b'C', b'x', b'y', b'z', b's',
b'H', b'I', b'J', b'G', b'P', b'Q', b'R', b'S',
b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'K',
b'L', b'M', b'N', b'O', b'g', b'a', b'b', b'c',
b't', b'd', b'f', b'e',
// offset array:
2, 4, 6, 8, 10, 12, 14,
16, 18, 20, 22, 24, 25, 26, 27,
29, 31, 32, 33, 34, 36, 38, 40,
42, 43, 44, 45, 46, 47, 49, 51,
53, 55, 57, 59, 61, 63, 65, 67,
68, 69, 70, 71, 72, 74, 76, 78,
80, 82, 84, 86,
// values:
0x90, 17, 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23,
0x90, 24, 0x90, 25, 0x90, 30, 0x90, 31, 0x90, 32, 0x80 | 3, 0x80 | 4,
0x80 | 5, 0x90, 26, 0x90, 27, 0x80, 0x80 | 1, 0x80 | 2, 0x90, 33,
0x90, 34, 0x90, 35, 0x90, 28, 0x80 | 7, 0x80 | 8, 0x80 | 9, 0x80 | 6,
0x80 | 15, 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5,
0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x80 | 10, 0x80 | 11, 0x80 | 12,
0x80 | 13, 0x80 | 14, 0x90, 16, 0x90, 10, 0x90, 11, 0x90, 12, 0x90, 29,
0x90, 13, 0x90, 15, 0x90, 14,
];
let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
assert_bytes_eq!(246, trie_phf.as_bytes(), expected_bytes);
check_phf_ascii_trie(&litemap, &trie_phf);
}
// Ten keys with distinct first bytes: nine 26-byte rotations of the alphabet
// and a final 21-byte key. Only ZeroTrieSimpleAscii is built; its 275 expected
// bytes use a single-byte branch head and one offset byte per child.
// NOTE(review): presumably this is the largest input that still fits one-byte
// offsets (compare `test_at_wide`, whose last key is one byte longer) — confirm
// against the builder.
#[test]
fn test_below_wide() {
let litemap: LiteMap<&[u8], usize> = [
(&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
(b"bcdefghijklmnopqrstuvwxyza", 2),
(b"cdefghijklmnopqrstuvwxyzab", 3),
(b"defghijklmnopqrstuvwxyzabc", 4),
(b"efghijklmnopqrstuvwxyzabcd", 5),
(b"fghijklmnopqrstuvwxyzabcde", 6),
(b"ghijklmnopqrstuvwxyzabcdef", 7),
(b"hijklmnopqrstuvwxyzabcdefg", 8),
(b"ijklmnopqrstuvwxyzabcdefgh", 9),
(b"jklmnopqrstuvwxyzabcd", 10),
]
.into_iter()
.collect();
let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
assert_eq!(trie.get(b""), None);
assert_eq!(trie.get(b"abc"), None);
check_simple_ascii_trie(&litemap, &trie);
// Layout: branch head, 10 search bytes, 9 offsets, then for each key its
// remaining bytes followed by its value byte (0x80 | value).
#[rustfmt::skip]
let expected_bytes = &[
0b11001010, // branch
// search array:
b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
// offset array:
26, 52, 78, 104, 130, 156, 182, 208, 234,
// offset data:
b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
0x81,
b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
0x82,
b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
0x83,
b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
0x84,
b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
0x85,
b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
0x86,
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
0x87,
b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
0x88,
b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
0x89,
b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
b'x', b'y', b'z', b'a', b'b', b'c', b'd',
0x8A,
];
assert_bytes_eq!(275, trie.as_bytes(), expected_bytes);
}
// Same keys as `test_below_wide`, except that the last key is 22 bytes long
// instead of 21. The 286 expected bytes now start with a two-byte branch head
// and carry an offset array with an extra leading row of nine zero bytes.
// NOTE(review): the zero row is presumably the high byte of each offset, needed
// once the branch payload reaches 256 bytes — confirm against the builder.
#[test]
fn test_at_wide() {
let litemap: LiteMap<&[u8], usize> = [
(&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
(b"bcdefghijklmnopqrstuvwxyza", 2),
(b"cdefghijklmnopqrstuvwxyzab", 3),
(b"defghijklmnopqrstuvwxyzabc", 4),
(b"efghijklmnopqrstuvwxyzabcd", 5),
(b"fghijklmnopqrstuvwxyzabcde", 6),
(b"ghijklmnopqrstuvwxyzabcdef", 7),
(b"hijklmnopqrstuvwxyzabcdefg", 8),
(b"ijklmnopqrstuvwxyzabcdefgh", 9),
(b"jklmnopqrstuvwxyzabcde", 10),
]
.into_iter()
.collect();
let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
assert_eq!(trie.get(b""), None);
assert_eq!(trie.get(b"abc"), None);
check_simple_ascii_trie(&litemap, &trie);
// Layout: 2-byte branch head, 10 search bytes, 2 x 9 offset bytes, then for
// each key its remaining bytes followed by its value byte (0x80 | value).
#[rustfmt::skip]
let expected_bytes = &[
0b11100001, // branch lead
0x6A, // branch trail
// search array:
b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
// offset array (wide):
0, 0, 0, 0, 0, 0, 0, 0, 0,
26, 52, 78, 104, 130, 156, 182, 208, 234,
// offset data:
b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
0x81,
b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
0x82,
b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
0x83,
b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
0x84,
b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
0x85,
b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
0x86,
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
0x87,
b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
0x88,
b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
0x89,
b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
0x8A,
];
assert_bytes_eq!(286, trie.as_bytes(), expected_bytes);
}
// Same keys as `test_at_wide`, except that the last key is 23 bytes long
// instead of 22. The expected bytes keep the same two-byte branch head and
// two-row offset array and grow by one byte, to 287.
#[test]
fn test_at_wide_plus() {
let litemap: LiteMap<&[u8], usize> = [
(&b"abcdefghijklmnopqrstuvwxyz"[..], 1),
(b"bcdefghijklmnopqrstuvwxyza", 2),
(b"cdefghijklmnopqrstuvwxyzab", 3),
(b"defghijklmnopqrstuvwxyzabc", 4),
(b"efghijklmnopqrstuvwxyzabcd", 5),
(b"fghijklmnopqrstuvwxyzabcde", 6),
(b"ghijklmnopqrstuvwxyzabcdef", 7),
(b"hijklmnopqrstuvwxyzabcdefg", 8),
(b"ijklmnopqrstuvwxyzabcdefgh", 9),
(b"jklmnopqrstuvwxyzabcdef", 10),
]
.into_iter()
.collect();
let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
assert_eq!(trie.get(b""), None);
assert_eq!(trie.get(b"abc"), None);
check_simple_ascii_trie(&litemap, &trie);
// Layout: 2-byte branch head, 10 search bytes, 2 x 9 offset bytes, then for
// each key its remaining bytes followed by its value byte (0x80 | value).
#[rustfmt::skip]
let expected_bytes = &[
0b11100001, // branch lead
0x6A, // branch trail
// search array:
b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j',
// offset array (wide):
0, 0, 0, 0, 0, 0, 0, 0, 0,
26, 52, 78, 104, 130, 156, 182, 208, 234,
// offset data:
b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n',
b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z',
0x81,
b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a',
0x82,
b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b',
0x83,
b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q',
b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c',
0x84,
b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r',
b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd',
0x85,
b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's',
b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e',
0x86,
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't',
b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
0x87,
b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u',
b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
0x88,
b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v',
b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h',
0x89,
b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f',
0x8A,
];
assert_bytes_eq!(287, trie.as_bytes(), expected_bytes);
}
// Combined scenario: an empty key, nested branches, keys that are prefixes of
// other keys, and values needing one byte (e.g. 2) or two bytes (100, 500).
// Both trie types are expected to serialize to the same 36 bytes. Afterwards
// the postcard-serialized sizes of zerovec maps holding the same entries are
// pinned as well.
#[test]
fn test_everything() {
let litemap: LiteMap<&[u8], usize> = [
(&b""[..], 0),
(b"axb", 100),
(b"ayc", 2),
(b"azd", 3),
(b"bxe", 4),
(b"bxefg", 500),
(b"bxefh", 6),
(b"bxei", 7),
(b"bxeikl", 8),
]
.into_iter()
.collect();
let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap();
assert_eq!(trie.get(b""), Some(0));
assert_eq!(trie.get(b"a"), None);
assert_eq!(trie.get(b"ax"), None);
assert_eq!(trie.get(b"ay"), None);
check_simple_ascii_trie(&litemap, &trie);
// Expected ZeroTrieSimpleAscii serialization.
let expected_bytes = &[
0b10000000, // value 0
0b11000010, // branch of 2
b'a', //
b'b', //
13, //
0b11000011, // branch of 3
b'x', //
b'y', //
b'z', //
3, //
5, //
b'b', //
0b10010000, // value 100 (lead)
0x54, // value 100 (trail)
b'c', //
0b10000010, // value 2
b'd', //
0b10000011, // value 3
b'x', //
b'e', //
0b10000100, // value 4
0b11000010, // branch of 2
b'f', //
b'i', //
7, //
0b11000010, // branch of 2
b'g', //
b'h', //
2, //
0b10010011, // value 500 (lead)
0x64, // value 500 (trail)
0b10000110, // value 6
0b10000111, // value 7
b'k', //
b'l', //
0b10001000, // value 8
];
assert_bytes_eq!(36, trie.as_bytes(), expected_bytes);
// Expected ZeroTriePerfectHash serialization; this binding shadows the table
// above and lists the same 36 byte values.
#[rustfmt::skip]
let expected_bytes = &[
0b10000000, // value 0
0b11000010, // branch of 2
b'a', //
b'b', //
13, //
0b11000011, // start of 'a' subtree: branch of 3
b'x', //
b'y', //
b'z', //
3, //
5, //
b'b', //
0b10010000, // value 100 (lead)
0x54, // value 100 (trail)
b'c', //
0b10000010, // value 2
b'd', //
0b10000011, // value 3
b'x', // start of 'b' subtree
b'e', //
0b10000100, // value 4
0b11000010, // branch of 2
b'f', //
b'i', //
7, //
0b11000010, // branch of 2
b'g', //
b'h', //
2, //
0b10010011, // value 500 (lead)
0x64, // value 500 (trail)
0b10000110, // value 6
0b10000111, // value 7
b'k', //
b'l', //
0b10001000, // value 8
];
let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>();
let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap();
assert_bytes_eq!(36, trie_phf.as_bytes(), expected_bytes);
check_phf_ascii_trie(&litemap, &trie_phf);
// Reference sizes: the same entries stored in zerovec maps and serialized with
// postcard. Only the serialized length is asserted.
// NOTE(review): the `as u8` casts wrap values above 255 (500 is present in
// `litemap`); since only lengths are compared this looks intentional — confirm.
let zhm: zerovec::ZeroMap<[u8], u32> = litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
assert_eq!(zhm_buf.len(), 88);
let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
assert_eq!(zhm_buf.len(), 61);
let zhm: zerovec::ZeroHashMap<[u8], u32> =
litemap.iter().map(|(a, b)| (*a, *b as u32)).collect();
let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
assert_eq!(zhm_buf.len(), 161);
let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect();
let zhm_buf = postcard::to_allocvec(&zhm).unwrap();
assert_eq!(zhm_buf.len(), 134);
}
/// Evaluates to byte number `$index` of the UTF-8 encoding of `$scalar`.
///
/// The character is encoded into a zero-initialized 4-byte scratch buffer, so
/// an index past the end of the encoding (but below 4) yields `0`.
macro_rules! utf8_byte {
    ($scalar:expr, $index:literal) => {{
        let mut encoded = [0u8; 4];
        let _ = $scalar.encode_utf8(&mut encoded);
        encoded[$index]
    }};
}
// Takes the nine ASCII entries of `test_everything` and adds three keys that
// continue "bxeiklm" with Greek text. Only a ZeroTriePerfectHash is built here
// (directly from `litemap`), and its 73 serialized bytes are pinned.
#[test]
fn test_non_ascii() {
let litemap: LiteMap<&[u8], usize> = [
("".as_bytes(), 0),
("axb".as_bytes(), 100),
("ayc".as_bytes(), 2),
("azd".as_bytes(), 3),
("bxe".as_bytes(), 4),
("bxefg".as_bytes(), 500),
("bxefh".as_bytes(), 6),
("bxei".as_bytes(), 7),
("bxeikl".as_bytes(), 8),
("bxeiklmΚαλημέρααα".as_bytes(), 9),
("bxeiklmαnλo".as_bytes(), 10),
("bxeiklmη".as_bytes(), 11),
]
.into_iter()
.collect();
// The first 36 bytes repeat the table from `test_everything`; the remainder
// covers the "m" suffix and the Greek continuations. `utf8_byte!(c, i)` yields
// byte `i` of the UTF-8 encoding of `c`.
#[rustfmt::skip]
let expected_bytes = &[
0b10000000, // value 0
0b11000010, // branch of 2
b'a', //
b'b', //
13, //
0b11000011, // start of 'a' subtree: branch of 3
b'x', //
b'y', //
b'z', //
3, //
5, //
b'b', //
0b10010000, // value 100 (lead)
0x54, // value 100 (trail)
b'c', //
0b10000010, // value 2
b'd', //
0b10000011, // value 3
b'x', // start of 'b' subtree
b'e', //
0b10000100, // value 4
0b11000010, // branch of 2
b'f', //
b'i', //
7, //
0b11000010, // branch of 2
b'g', //
b'h', //
2, //
0b10010011, // value 500 (lead)
0x64, // value 500 (trail)
0b10000110, // value 6
0b10000111, // value 7
b'k', //
b'l', //
0b10001000, // value 8
b'm', //
0b10100001, // span of length 1
utf8_byte!('Κ', 0), // NOTE: all three letters have the same lead byte
0b11000011, // branch of 3
utf8_byte!('Κ', 1),
utf8_byte!('α', 1),
utf8_byte!('η', 1),
21,
27,
0b10110000, // span of length 18 (lead)
0b00000010, // span of length 18 (trail)
utf8_byte!('α', 0),
utf8_byte!('α', 1),
utf8_byte!('λ', 0),
utf8_byte!('λ', 1),
utf8_byte!('η', 0),
utf8_byte!('η', 1),
utf8_byte!('μ', 0),
utf8_byte!('μ', 1),
utf8_byte!('έ', 0),
utf8_byte!('έ', 1),
utf8_byte!('ρ', 0),
utf8_byte!('ρ', 1),
utf8_byte!('α', 0),
utf8_byte!('α', 1),
utf8_byte!('α', 0),
utf8_byte!('α', 1),
utf8_byte!('α', 0),
utf8_byte!('α', 1),
0b10001001, // value 9
b'n',
0b10100010, // span of length 2
utf8_byte!('λ', 0),
utf8_byte!('λ', 1),
b'o',
0b10001010, // value 10
0b10001011, // value 11
];
let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap();
assert_bytes_eq!(73, trie_phf.as_bytes(), expected_bytes);
check_phf_bytes_trie(&litemap, &trie_phf);
}
/// Evaluates branches with all 256 possible children: once at the root
/// (every one-byte key) and once below a shared NUL prefix.
#[test]
fn test_max_branch() {
    // Backing storage for the keys; the map below only borrows from these vectors.
    let single_bytes: Vec<u8> = (u8::MIN..=u8::MAX).collect();
    assert_eq!(single_bytes.len(), 256);
    let nul_prefixed: Vec<[u8; 2]> = (u8::MIN..=u8::MAX)
        .map(|byte| [b'\0', byte])
        .collect();

    let mut entries: LiteMap<&[u8], usize> = LiteMap::new_vec();
    // Keys of length one, each mapped to its own byte value.
    for byte in &single_bytes {
        entries.insert(core::slice::from_ref(byte), usize::from(*byte));
    }
    // Keys of length two, each mapped to its second byte.
    for pair in &nul_prefixed {
        entries.insert(pair, usize::from(pair[1]));
    }

    let phf_trie = ZeroTriePerfectHash::try_from(&entries).unwrap();
    assert_eq!(phf_trie.byte_len(), 3042);
    check_phf_bytes_trie(&entries, &phf_trie);
}
/// Pins the serialized sizes for the `short_subtags_10pct` fixture: both trie
/// types, plus zerovec maps serialized with postcard for comparison.
#[test]
fn test_short_subtags_10pct() {
    let subtags = strings_to_litemap(testdata::short_subtags_10pct::STRINGS);

    let ascii_trie = ZeroTrieSimpleAscii::try_from(&subtags).unwrap();
    assert_eq!(ascii_trie.byte_len(), 1050);
    check_simple_ascii_trie(&subtags, &ascii_trie);

    let subtags_as_bytes = subtags.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&subtags_as_bytes).unwrap();
    assert_eq!(phf_trie.byte_len(), 1100);
    check_phf_ascii_trie(&subtags, &phf_trie);

    // Reference sizes of the same data in zerovec maps; only lengths are asserted.
    let zero_map_u32: zerovec::ZeroMap<[u8], u32> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u32))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_map_u32).unwrap().len(), 1890);

    let zero_map_u8: zerovec::ZeroMap<[u8], u8> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u8))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_map_u8).unwrap().len(), 1326);

    let zero_hash_map_u32: zerovec::ZeroHashMap<[u8], u32> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u32))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_hash_map_u32).unwrap().len(), 3396);

    let zero_hash_map_u8: zerovec::ZeroHashMap<[u8], u8> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u8))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_hash_map_u8).unwrap().len(), 2832);
}
/// Pins the serialized sizes for the full `short_subtags` fixture: both trie
/// types, plus zerovec maps serialized with postcard for comparison.
#[test]
fn test_short_subtags() {
    let subtags = strings_to_litemap(testdata::short_subtags::STRINGS);

    let ascii_trie = ZeroTrieSimpleAscii::try_from(&subtags).unwrap();
    assert_eq!(ascii_trie.byte_len(), 8793);
    check_simple_ascii_trie(&subtags, &ascii_trie);

    let subtags_as_bytes = subtags.to_borrowed_keys::<[u8], Vec<_>>();
    let phf_trie = ZeroTriePerfectHash::try_from(&subtags_as_bytes).unwrap();
    assert_eq!(phf_trie.byte_len(), 9400);
    check_phf_ascii_trie(&subtags, &phf_trie);

    // Reference sizes of the same data in zerovec maps; only lengths are asserted.
    let zero_map_u32: zerovec::ZeroMap<[u8], u32> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u32))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_map_u32).unwrap().len(), 18931);

    let zero_map_u8: zerovec::ZeroMap<[u8], u8> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u8))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_map_u8).unwrap().len(), 13300);

    let zero_hash_map_u32: zerovec::ZeroHashMap<[u8], u32> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u32))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_hash_map_u32).unwrap().len(), 33949);

    let zero_hash_map_u8: zerovec::ZeroHashMap<[u8], u8> = subtags
        .iter()
        .map(|(key, value)| (*key, *value as u8))
        .collect();
    assert_eq!(postcard::to_allocvec(&zero_hash_map_u8).unwrap().len(), 28318);
}