vendor/zerovec/src/hashmap/algorithms.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use alloc::vec;
use alloc::vec::Vec;
use core::hash::{Hash, Hasher};
use twox_hash::XxHash64;

/// Constant seed handed to [`XxHash64::with_seed`] by [`compute_hash`]; every hash computed
/// through that function starts from this same value.
const SEED: u64 = 0xaabbccdd;

/// Break a 64-bit `hash` into the triple `(g, f0, f1)`.
///
/// * `g` is the top 16 bits of `hash` reduced modulo `m`; it is the first level hash.
/// * `f0` is the 24 bits directly below those (bits 24..48).
/// * `f1` is the lowest 24 bits (bits 0..24).
///
/// `(f0, f1)` are the inputs used to spread keys that share the same `g` over distinct slots.
///
/// # Arguments
///
/// * `hash` - The hash to split.
/// * `m` - The modulus applied to the top 16 bits.
///
/// # Panics
///
/// Panics with a division-by-zero error if `m` is `0`.
pub const fn split_hash64(hash: u64, m: usize) -> (usize, u32, u32) {
    // Selects the low 24 bits of a value.
    const LOW_24_BITS: u64 = 0xff_ffff;

    let g = (hash >> 48) as usize % m;
    let f0 = ((hash >> 24) & LOW_24_BITS) as u32;
    let f1 = (hash & LOW_24_BITS) as u32;

    (g, f0, f1)
}

/// Hash `key` with an [`XxHash64`] hasher seeded with [`SEED`] and return the 64-bit digest.
pub fn compute_hash<K: Hash + ?Sized>(key: &K) -> u64 {
    let mut state = XxHash64::with_seed(SEED);
    Hash::hash(key, &mut state);
    state.finish()
}

/// Map the second level hash `f = (f0, f1)` to a slot using the displacement `d = (d0, d1)`.
///
/// Returns [`None`] when `d` is `(0, 0)` or when the modulus `m` is `0`.
/// Otherwise returns `(f0 + f1 * d0 + d1) mod m`, where the multiplication and additions wrap
/// around at 32 bits before the modulo is taken.
pub fn compute_index(f: (u32, u32), d: (u32, u32), m: u32) -> Option<usize> {
    let (f0, f1) = f;
    let (d0, d1) = d;

    // (0, 0) is reserved as "no displacement", and a zero modulus has no valid slot.
    if m == 0 || (d0 == 0 && d1 == 0) {
        return None;
    }

    let mixed = f1.wrapping_mul(d0).wrapping_add(f0).wrapping_add(d1);
    Some((mixed % m) as usize)
}

/// Compute the displacements that send every hash in `key_hashes` to its own slot, using a
/// two-level hashing scheme.
///
/// Returns `(displacements, reverse_mapping)`:
///
/// * `displacements[g]` is the `(d0, d1)` pair chosen for the bucket with first level hash `g`.
/// * `reverse_mapping[slot]` is the position, in the input order, of the key assigned to `slot`;
///   it is the permutation used to move keys and values into their slots.
///
/// The procedure is:
///
/// 1. Split each hash into `(g, f0, f1)` with [`split_hash64`], using the number of keys as the
///    modulus.
/// 2. Group the split hashes into buckets by `g` and order the buckets by decreasing size.
/// 3. For each bucket, largest first, try `(d0, d1)` pairs (skipping `(0, 0)`) until
///    [`compute_index`] places all keys of the bucket into distinct, still unoccupied slots.
/// 4. Mark those slots as occupied and record the reverse mapping.
/// 5. Repeat until every bucket has been processed.
///
/// If no pair with both components below the number of keys works for a bucket, that bucket is
/// left with the displacement `(0, 0)` and its keys are not written to `reverse_mapping`; no error
/// is reported.
///
/// # Arguments
///
/// * `key_hashes` - [`ExactSizeIterator`] over the hashed key values
#[expect(clippy::indexing_slicing, clippy::unwrap_used)]
pub fn compute_displacements(
    key_hashes: impl ExactSizeIterator<Item = u64>,
) -> (Vec<(u32, u32)>, Vec<usize>) {
    let len = key_hashes.len();

    // Number of keys sharing each first level hash; drives the sort below.
    let mut bucket_sizes = vec![0; len];

    // Every key as (split hash, original position), later grouped by first level hash.
    let mut bucket_flatten = Vec::with_capacity(len);

    for (position, key_hash) in key_hashes.enumerate() {
        let split = split_hash64(key_hash, len);
        // split.0 < len because of the modulo in `split_hash64`
        bucket_sizes[split.0] += 1;
        bucket_flatten.push((split, position));
    }

    // Largest buckets first. Ties are broken by the split hash itself, which keeps all keys
    // sharing a first level hash next to each other.
    bucket_flatten.sort_by(|&(lhs, _), &(rhs, _)| {
        // lhs.0, rhs.0 are always within bounds of `bucket_sizes`
        let lhs_key = (bucket_sizes[lhs.0], lhs);
        let rhs_key = (bucket_sizes[rhs.0], rhs);
        rhs_key.cmp(&lhs_key)
    });

    // Stamp identifying the current trial. Every ((d0, d1), bucket) attempt gets a fresh value,
    // so slots claimed by an abandoned attempt never need to be reset.
    let mut generation = 0;

    // Slots already taken by buckets handled earlier.
    let mut occupied = vec![false; len];

    // Generation in which each slot was last claimed. A slot is free for the current trial only
    // if it is not occupied and its stamp differs from `generation`.
    let mut assignments = vec![0; len];

    // Slots claimed during the current trial, in bucket order (avoids recomputing the index
    // once the trial succeeds).
    let mut current_displacements = Vec::with_capacity(16);

    // Chosen (d0, d1) per first level hash.
    let mut displacements = vec![(0, 0); len];

    // Slot -> original key position.
    let mut reverse_mapping = vec![0; len];

    let modulus = len as u32;
    let mut cursor = 0;
    while cursor < len {
        // cursor is always within bounds of `bucket_flatten`
        let g = bucket_flatten[cursor].0 .0;
        // g is always within bounds of `bucket_sizes`
        let chain_end = cursor + bucket_sizes[g];
        // All keys whose first level hash is `g`.
        let chain = &bucket_flatten[cursor..chain_end];

        'search: for d0 in 0..modulus {
            'candidate: for d1 in 0..modulus {
                // (0, 0) is not a valid displacement.
                if d0 == 0 && d1 == 0 {
                    continue;
                }
                generation += 1;
                current_displacements.clear();

                for &((_, f0, f1), _) in chain {
                    let slot = compute_index((f0, f1), (d0, d1), modulus).unwrap();

                    // slot < len because of the modulo in `compute_index`
                    if occupied[slot] || assignments[slot] == generation {
                        // Collision with an earlier bucket or within this one: next candidate.
                        continue 'candidate;
                    }
                    assignments[slot] = generation;
                    current_displacements.push(slot);
                }

                // Every key of the bucket got its own free slot; commit the result.
                // g < displacements.len() due to modulo operation
                displacements[g] = (d0, d1);

                // `current_displacements` has the same length and order as `chain`.
                for (&slot, &(_, position)) in current_displacements.iter().zip(chain) {
                    occupied[slot] = true;
                    reverse_mapping[slot] = position;
                }
                break 'search;
            }
        }

        cursor = chain_end;
    }

    (displacements, reverse_mapping)
}
chore: checkpoint before Python removal 2026-03-26 22:33:59 +00:00			`// This file is part of ICU4X. For terms of use, please see the file`
			`// called LICENSE at the top level of the ICU4X source tree`
			`// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).`

			`use alloc::vec;`
			`use alloc::vec::Vec;`
			`use core::hash::{Hash, Hasher};`
			`use twox_hash::XxHash64;`

			// Const seed to be used with [`XxHash64::with_seed`].
			`const SEED: u64 = 0xaabbccdd;`

			/// Split the 64bit `hash` into (g, f0, f1).
			`///`
			/// g denotes the highest 16bits of the hash modulo `m`, and is referred to as first level hash.
			`/// (f0, f1) denotes the middle, and lower 24bits of the hash respectively.`
			`/// (f0, f1) are used to distribute the keys with same g, into distinct slots.`
			`///`
			`/// # Arguments`
			`///`
			/// * `hash` - The hash to split.
			/// * `m` - The modulo used to split the hash.
			`pub const fn split_hash64(hash: u64, m: usize) -> (usize, u32, u32) {`
			`(`
			`((hash >> 48) as usize % m),`
			`((hash >> 24) as u32 & 0xffffff),`
			`((hash & 0xffffff) as u32),`
			`)`
			`}`

			/// Compute hash using [`XxHash64`].
			`pub fn compute_hash<K: Hash + ?Sized>(key: &K) -> u64 {`
			`let mut hasher = XxHash64::with_seed(SEED);`
			`key.hash(&mut hasher);`
			`hasher.finish()`
			`}`

			`/// Calculate the index using (f0, f1), (d0, d1) in modulo m.`
			/// Returns [`None`] if d is (0, 0) or modulo is 0
			`/// else returns the index computed using (f0 + f1 * d0 + d1) mod m.`
			`pub fn compute_index(f: (u32, u32), d: (u32, u32), m: u32) -> Option<usize> {`
			`if d == (0, 0) \|\| m == 0 {`
			`None`
			`} else {`
			`Some((f.1.wrapping_mul(d.0).wrapping_add(f.0).wrapping_add(d.1) % m) as usize)`
			`}`
			`}`

			/// Compute displacements for the given `key_hashes`, which split the keys into distinct slots by a
			`/// two-level hashing schema.`
			`///`
			`/// Returns a tuple of where the first item is the displacement array and the second item is the`
			`/// reverse mapping used to permute keys, values into their slots.`
			`///`
			`/// 1. Split the hashes into (g, f0, f1).`
			`/// 2. Bucket and sort the split hash on g in descending order.`
			`/// 3. In decreasing order of bucket size, try until a (d0, d1) is found that splits the keys`
			`/// in the bucket into distinct slots.`
			`/// 4. Mark the slots for current bucket as occupied and store the reverse mapping.`
			`/// 5. Repeat untill all the keys have been assigned distinct slots.`
			`///`
			`/// # Arguments`
			`///`
			/// * `key_hashes` - [`ExactSizeIterator`] over the hashed key values
			`#[expect(clippy::indexing_slicing, clippy::unwrap_used)]`
			`pub fn compute_displacements(`
			`key_hashes: impl ExactSizeIterator<Item = u64>,`
			`) -> (Vec<(u32, u32)>, Vec<usize>) {`
			`let len = key_hashes.len();`

			`// A vector to track the size of buckets for sorting.`
			`let mut bucket_sizes = vec![0; len];`

			`// A flattened representation of items in the buckets after applying first level hash function`
			`let mut bucket_flatten = Vec::with_capacity(len);`

			`// Compute initial displacement and bucket sizes`

			`key_hashes.into_iter().enumerate().for_each(\|(i, kh)\| {`
			`let h = split_hash64(kh, len);`
			`bucket_sizes[h.0] += 1;`
			`bucket_flatten.push((h, i))`
			`});`

			`// Sort by decreasing order of bucket_sizes.`
			`bucket_flatten.sort_by(\|&(ha, _), &(hb, _)\| {`
			// ha.0, hb.0 are always within bounds of `bucket_sizes`
			`(bucket_sizes[hb.0], hb).cmp(&(bucket_sizes[ha.0], ha))`
			`});`

			`// Generation count while iterating buckets.`
			`// Each trial of ((d0, d1), bucket chain) is a new generation.`
			`// We use this to track which all slots are assigned for the current bucket chain.`
			`let mut generation = 0;`

			`// Whether a slot has been occupied by previous buckets with a different first level hash (different`
			`// bucket chain).`
			`let mut occupied = vec![false; len];`

			`// Track generation count for the slots.`
			`// A slot is empty if either it is unoccupied by the previous bucket chains and the`
			`// assignment is not equal to generation.`
			`let mut assignments = vec![0; len];`

			`// Vec to store the displacements (saves us a recomputation of hash while assigning slots).`
			`let mut current_displacements = Vec::with_capacity(16);`

			`// (d0, d1) which splits the bucket into different slots`
			`let mut displacements = vec![(0, 0); len];`

			`// Vec to store mapping to the original order of keys.`
			`// This is a permutation which will be applied to keys, values at the end.`
			`let mut reverse_mapping = vec![0; len];`

			`let mut start = 0;`
			`while start < len {`
			`// Bucket span with the same first level hash`
			// start is always within bounds of `bucket_flatten`
			`let g = bucket_flatten[start].0 .0;`
			// g is always within bounds of `bucket_sizes`
			`let end = start + bucket_sizes[g];`
			// start, end - 1 are always within bounds of `bucket_sizes`
			`let buckets = &bucket_flatten[start..end];`

			`'d0: for d0 in 0..len as u32 {`
			`'d1: for d1 in 0..len as u32 {`
			`if (d0, d1) == (0, 0) {`
			`continue;`
			`}`
			`current_displacements.clear();`
			`generation += 1;`

			`for ((_, f0, f1), _) in buckets {`
			`let displacement_idx = compute_index((f0, f1), (d0, d1), len as u32).unwrap();`

			`// displacement_idx is always within bounds`
			`if occupied[displacement_idx] \|\| assignments[displacement_idx] == generation {`
			`continue 'd1;`
			`}`
			`assignments[displacement_idx] = generation;`
			`current_displacements.push(displacement_idx);`
			`}`

			`// Successfully found a (d0, d1), store it as index g.`
			`// g < displacements.len() due to modulo operation`
			`displacements[g] = (d0, d1);`

			`for (i, displacement_idx) in current_displacements.iter().enumerate() {`
			// `current_displacements` has same size as `buckets`
			`let (_, idx) = &buckets[i];`

			`// displacement_idx is always within bounds`
			`occupied[*displacement_idx] = true;`
			`reverse_mapping[displacement_idx] = idx;`
			`}`
			`break 'd0;`
			`}`
			`}`

			`start = end;`
			`}`

			`(displacements, reverse_mapping)`
			`}`