232 lines
7.2 KiB
Rust
232 lines
7.2 KiB
Rust
// Adapted from https://github.com/Alexhuszagh/rust-lexical.
|
|
|
|
//! Defines rounding schemes for floating-point numbers.
|
|
|
|
use super::float::ExtendedFloat;
|
|
use super::num::*;
|
|
use super::shift::*;
|
|
use core::mem;
|
|
|
|
// MASKS
|
|
|
|
/// Calculate a scalar factor of 2 above the halfway point.
|
|
#[inline]
|
|
pub(crate) fn nth_bit(n: u64) -> u64 {
|
|
let bits: u64 = mem::size_of::<u64>() as u64 * 8;
|
|
debug_assert!(n < bits, "nth_bit() overflow in shl.");
|
|
|
|
1 << n
|
|
}
|
|
|
|
/// Generate a bitwise mask for the lower `n` bits.
|
|
#[inline]
|
|
pub(crate) fn lower_n_mask(n: u64) -> u64 {
|
|
let bits: u64 = mem::size_of::<u64>() as u64 * 8;
|
|
debug_assert!(n <= bits, "lower_n_mask() overflow in shl.");
|
|
|
|
if n == bits {
|
|
u64::MAX
|
|
} else {
|
|
(1 << n) - 1
|
|
}
|
|
}
|
|
|
|
/// Calculate the halfway point for the lower `n` bits.
|
|
#[inline]
|
|
pub(crate) fn lower_n_halfway(n: u64) -> u64 {
|
|
let bits: u64 = mem::size_of::<u64>() as u64 * 8;
|
|
debug_assert!(n <= bits, "lower_n_halfway() overflow in shl.");
|
|
|
|
if n == 0 {
|
|
0
|
|
} else {
|
|
nth_bit(n - 1)
|
|
}
|
|
}
|
|
|
|
/// Calculate a bitwise mask with `n` 1 bits starting at the `bit` position.
|
|
#[inline]
|
|
pub(crate) fn internal_n_mask(bit: u64, n: u64) -> u64 {
|
|
let bits: u64 = mem::size_of::<u64>() as u64 * 8;
|
|
debug_assert!(bit <= bits, "internal_n_halfway() overflow in shl.");
|
|
debug_assert!(n <= bits, "internal_n_halfway() overflow in shl.");
|
|
debug_assert!(bit >= n, "internal_n_halfway() overflow in sub.");
|
|
|
|
lower_n_mask(bit) ^ lower_n_mask(bit - n)
|
|
}
|
|
|
|
// NEAREST ROUNDING
|
|
|
|
// Shift right N-bytes and round to the nearest.
|
|
//
|
|
// Return if we are above halfway and if we are halfway.
|
|
#[inline]
|
|
pub(crate) fn round_nearest(fp: &mut ExtendedFloat, shift: i32) -> (bool, bool) {
|
|
// Extract the truncated bits using mask.
|
|
// Calculate if the value of the truncated bits are either above
|
|
// the mid-way point, or equal to it.
|
|
//
|
|
// For example, for 4 truncated bytes, the mask would be b1111
|
|
// and the midway point would be b1000.
|
|
let mask: u64 = lower_n_mask(shift as u64);
|
|
let halfway: u64 = lower_n_halfway(shift as u64);
|
|
|
|
let truncated_bits = fp.mant & mask;
|
|
let is_above = truncated_bits > halfway;
|
|
let is_halfway = truncated_bits == halfway;
|
|
|
|
// Bit shift so the leading bit is in the hidden bit.
|
|
overflowing_shr(fp, shift);
|
|
|
|
(is_above, is_halfway)
|
|
}
|
|
|
|
// Tie rounded floating point to event.
|
|
#[inline]
|
|
pub(crate) fn tie_even(fp: &mut ExtendedFloat, is_above: bool, is_halfway: bool) {
|
|
// Extract the last bit after shifting (and determine if it is odd).
|
|
let is_odd = fp.mant & 1 == 1;
|
|
|
|
// Calculate if we need to roundup.
|
|
// We need to roundup if we are above halfway, or if we are odd
|
|
// and at half-way (need to tie-to-even).
|
|
if is_above || (is_odd && is_halfway) {
|
|
fp.mant += 1;
|
|
}
|
|
}
|
|
|
|
// Shift right N-bytes and round nearest, tie-to-even.
|
|
//
|
|
// Floating-point arithmetic uses round to nearest, ties to even,
|
|
// which rounds to the nearest value, if the value is halfway in between,
|
|
// round to an even value.
|
|
#[inline]
|
|
pub(crate) fn round_nearest_tie_even(fp: &mut ExtendedFloat, shift: i32) {
|
|
let (is_above, is_halfway) = round_nearest(fp, shift);
|
|
tie_even(fp, is_above, is_halfway);
|
|
}
|
|
|
|
// DIRECTED ROUNDING
|
|
|
|
// Shift right N-bytes and round towards a direction.
|
|
//
|
|
// Return if we have any truncated bytes.
|
|
#[inline]
|
|
fn round_toward(fp: &mut ExtendedFloat, shift: i32) -> bool {
|
|
let mask: u64 = lower_n_mask(shift as u64);
|
|
let truncated_bits = fp.mant & mask;
|
|
|
|
// Bit shift so the leading bit is in the hidden bit.
|
|
overflowing_shr(fp, shift);
|
|
|
|
truncated_bits != 0
|
|
}
|
|
|
|
// Round down.
|
|
#[inline]
|
|
fn downard(_: &mut ExtendedFloat, _: bool) {}
|
|
|
|
// Shift right N-bytes and round toward zero.
|
|
//
|
|
// Floating-point arithmetic defines round toward zero, which rounds
|
|
// towards positive zero.
|
|
#[inline]
|
|
pub(crate) fn round_downward(fp: &mut ExtendedFloat, shift: i32) {
|
|
// Bit shift so the leading bit is in the hidden bit.
|
|
// No rounding schemes, so we just ignore everything else.
|
|
let is_truncated = round_toward(fp, shift);
|
|
downard(fp, is_truncated);
|
|
}
|
|
|
|
// ROUND TO FLOAT
|
|
|
|
// Shift the ExtendedFloat fraction to the fraction bits in a native float.
|
|
//
|
|
// Floating-point arithmetic uses round to nearest, ties to even,
|
|
// which rounds to the nearest value, if the value is halfway in between,
|
|
// round to an even value.
|
|
#[inline]
|
|
pub(crate) fn round_to_float<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
|
|
where
|
|
F: Float,
|
|
Algorithm: FnOnce(&mut ExtendedFloat, i32),
|
|
{
|
|
// Calculate the difference to allow a single calculation
|
|
// rather than a loop, to minimize the number of ops required.
|
|
// This does underflow detection.
|
|
let final_exp = fp.exp + F::DEFAULT_SHIFT;
|
|
if final_exp < F::DENORMAL_EXPONENT {
|
|
// We would end up with a denormal exponent, try to round to more
|
|
// digits. Only shift right if we can avoid zeroing out the value,
|
|
// which requires the exponent diff to be < M::BITS. The value
|
|
// is already normalized, so we shouldn't have any issue zeroing
|
|
// out the value.
|
|
let diff = F::DENORMAL_EXPONENT - fp.exp;
|
|
if diff <= u64::FULL {
|
|
// We can avoid underflow, can get a valid representation.
|
|
algorithm(fp, diff);
|
|
} else {
|
|
// Certain underflow, assign literal 0s.
|
|
fp.mant = 0;
|
|
fp.exp = 0;
|
|
}
|
|
} else {
|
|
algorithm(fp, F::DEFAULT_SHIFT);
|
|
}
|
|
|
|
if fp.mant & F::CARRY_MASK == F::CARRY_MASK {
|
|
// Roundup carried over to 1 past the hidden bit.
|
|
shr(fp, 1);
|
|
}
|
|
}
|
|
|
|
// AVOID OVERFLOW/UNDERFLOW
|
|
|
|
// Avoid overflow for large values, shift left as needed.
|
|
//
|
|
// Shift until a 1-bit is in the hidden bit, if the mantissa is not 0.
|
|
#[inline]
|
|
pub(crate) fn avoid_overflow<F>(fp: &mut ExtendedFloat)
|
|
where
|
|
F: Float,
|
|
{
|
|
// Calculate the difference to allow a single calculation
|
|
// rather than a loop, minimizing the number of ops required.
|
|
if fp.exp >= F::MAX_EXPONENT {
|
|
let diff = fp.exp - F::MAX_EXPONENT;
|
|
if diff <= F::MANTISSA_SIZE {
|
|
// Our overflow mask needs to start at the hidden bit, or at
|
|
// `F::MANTISSA_SIZE+1`, and needs to have `diff+1` bits set,
|
|
// to see if our value overflows.
|
|
let bit = (F::MANTISSA_SIZE + 1) as u64;
|
|
let n = (diff + 1) as u64;
|
|
let mask = internal_n_mask(bit, n);
|
|
if (fp.mant & mask) == 0 {
|
|
// If we have no 1-bit in the hidden-bit position,
|
|
// which is index 0, we need to shift 1.
|
|
let shift = diff + 1;
|
|
shl(fp, shift);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ROUND TO NATIVE
|
|
|
|
// Round an extended-precision float to a native float representation.
|
|
#[inline]
|
|
pub(crate) fn round_to_native<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
|
|
where
|
|
F: Float,
|
|
Algorithm: FnOnce(&mut ExtendedFloat, i32),
|
|
{
|
|
// Shift all the way left, to ensure a consistent representation.
|
|
// The following right-shifts do not work for a non-normalized number.
|
|
fp.normalize();
|
|
|
|
// Round so the fraction is in a native mantissa representation,
|
|
// and avoid overflow/underflow.
|
|
round_to_float::<F, _>(fp, algorithm);
|
|
avoid_overflow::<F>(fp);
|
|
}
|