// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on s390x.

This architecture provides the following 128-bit atomic instructions:

- LPQ/STPQ: load/store (arch1 or later, i.e., baseline)
- CDSG: CAS (arch1 or later, i.e., baseline)

See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and
detailed description of the atomic and synchronize instructions in this architecture:
https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#s390x

LLVM's minimal supported architecture level is arch8 (z10):
https://github.com/llvm/llvm-project/blob/llvmorg-22.1.0-rc1/llvm/lib/Target/SystemZ/SystemZProcessors.td#L16-L17
This does not appear to have changed since the current s390x backend was added in LLVM 3.3:
https://github.com/llvm/llvm-project/commit/5f613dfd1f7edb0ae95d521b7107b582d9df5103#diff-cbaef692b3958312e80fd5507a7e2aff071f1acb086f10e8a96bc06a7bb289db

Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- z/Architecture Principles of Operation, Fifteenth Edition (SA22-7832-14)
  https://www.ibm.com/docs/en/module_1678991624569/pdf/SA22-7832-14.pdf
- atomic-maybe-uninit
  https://github.com/taiki-e/atomic-maybe-uninit

See tests/asm-test/asm/portable-atomic for generated assembly.
*/

include!("macros.rs");

use core::{arch::asm, sync::atomic::Ordering};

use crate::utils::{Pair, U128};

// Expands to the serialization instruction that is appended after the store
// for `Ordering::SeqCst` in `atomic_store`.
//
// bcr 14,0 requires fast-BCR-serialization facility added in arch9 (z196).
// When that facility is not enabled at compile time, bcr 15,0 is emitted instead.
#[cfg(any(
    target_feature = "fast-serialization",
    portable_atomic_target_feature = "fast-serialization",
))]
macro_rules! serialization {
    () => {
        "bcr 14, 0"
    };
}
#[cfg(not(any(
    target_feature = "fast-serialization",
    portable_atomic_target_feature = "fast-serialization",
)))]
macro_rules! serialization {
    () => {
        "bcr 15, 0"
    };
}

// Builds the asm text for `$a0 = $a1 <op> $a2`.
//
// Use distinct operands on z196 or later, otherwise split to lgr and $op.
//
// - With distinct-ops: a single three-operand instruction `${op}k $a0, $a1, $a2`.
// - Without it: `lgr $a0, $a1` followed by the two-operand `$op $a0, $a2`.
//   Because $a0 is overwritten before $a2 is read, $a0 must not be the same
//   register as $a2 in this form.
#[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))]
macro_rules! distinct_op {
    ($op:tt, $a0:tt, $a1:tt, $a2:tt) => {
        concat!($op, "k ", $a0, ", ", $a1, ", ", $a2)
    };
}
#[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))]
macro_rules! distinct_op {
    ($op:tt, $a0:tt, $a1:tt, $a2:tt) => {
        concat!("lgr ", $a0, ", ", $a1, "\n", $op, " ", $a0, ", ", $a2)
    };
}

// Builds the asm text for `$a0 = if $cond { $a1 } else { $a2 }`, where `$cond`
// is a condition-code suffix such as "h" or "l".
//
// Use selgr$cond on z15 or later, otherwise split to locgr$cond and $op.
//
// - With miscellaneous-extensions-3: a single `selgr$cond $a0, $a1, $a2`.
// - Otherwise: `lgr $a0, $a2` followed by `locgr$cond $a0, $a1`.
//   Because $a0 is overwritten before $a1 is read, $a0 must not be the same
//   register as $a1 in this form.
//
// Both variants additionally require load-store-on-cond; when that feature is
// not enabled, this macro is not defined at all.
#[cfg(any(
    target_feature = "miscellaneous-extensions-3",
    portable_atomic_target_feature = "miscellaneous-extensions-3",
))]
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
macro_rules! select_op {
    ($cond:tt, $a0:tt, $a1:tt, $a2:tt) => {
        concat!("selgr", $cond, " ", $a0, ", ", $a1, ", ", $a2)
    };
}
#[cfg(not(any(
    target_feature = "miscellaneous-extensions-3",
    portable_atomic_target_feature = "miscellaneous-extensions-3",
)))]
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
macro_rules! select_op {
    ($cond:tt, $a0:tt, $a1:tt, $a2:tt) => {
        concat!("lgr ", $a0, ", ", $a2, "\n", "locgr", $cond, " ", $a0, ", ", $a1)
    };
}

// Decodes the register written by `ipm` and reports whether the condition
// code was 0.
//
// The condition code sits in bits 28..=29 (counting from the least significant
// bit). Subtracting `1 << 28` borrows through bit 31 exactly when those two
// bits are zero, so bit 31 of the difference is set only for cc == 0. Bits
// below 28 and above 31 never influence the outcome, which is why the rest of
// the register may hold arbitrary data.
#[inline]
fn extract_cc(r: i64) -> bool {
    const CC_UNIT: i64 = 1 << 28;
    const BIT_31: i64 = 1 << 31;
    let shifted = r.wrapping_sub(CC_UNIT);
    (shifted & BIT_31) != 0
}

/// Atomically loads the 128-bit value at `src` with a single LPQ instruction.
///
/// `_order` is ignored because the load is always SeqCst (see the comment in
/// the body), which satisfies every weaker ordering.
///
/// # Safety
///
/// `src` must be 16-byte aligned; this is only checked by `debug_assert!`.
/// NOTE(review): presumably `src` must also be valid for reads; the full
/// contract is defined by the callers and is not visible in this file.
#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    let (out_hi, out_lo);

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // atomic load is always SeqCst.
        asm!(
            "lpq %r0, 0({src})", // atomic { r0:r1 = *src }
            src = in(reg) ptr_reg!(src),
            // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
            out("r0") out_hi,
            out("r1") out_lo,
            options(nostack, preserves_flags),
        );
        // r0 (even register) is the high half, r1 (odd register) is the low half.
        U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole
    }
}

/// Atomically stores `val` to `dst` with a single STPQ instruction.
///
/// For `Relaxed` and `Release` nothing follows the store; for `SeqCst` the
/// store is followed by the instruction produced by `serialization!()`.
///
/// # Panics
///
/// Reaches `unreachable!()` for any other ordering.
/// NOTE(review): presumably callers reject `Acquire`/`AcqRel` before calling.
///
/// # Safety
///
/// `dst` must be 16-byte aligned; this is only checked by `debug_assert!`.
/// NOTE(review): presumably `dst` must also be valid for writes; the full
/// contract is defined by the callers and is not visible in this file.
#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    let val = U128 { whole: val };

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // Local helper so that the fence text can be chosen per ordering while
        // still being a string literal inside `asm!`.
        macro_rules! atomic_store {
            ($acquire:expr) => {
                asm!(
                    "stpq %r0, 0({dst})", // atomic { *dst = r0:r1 }
                    $acquire,             // fence
                    dst = in(reg) ptr_reg!(dst),
                    // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
                    in("r0") val.pair.hi,
                    in("r1") val.pair.lo,
                    options(nostack, preserves_flags),
                )
            };
        }
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => atomic_store!(""),
            Ordering::SeqCst => atomic_store!(serialization!()),
            _ => unreachable!(),
        }
    }
}

/// Atomically compares the 128-bit value at `dst` with `old` and, if they are
/// equal, replaces it with `new`, using a single CDSG instruction.
///
/// Returns `Ok(previous)` when the exchange happened (condition code 0) and
/// `Err(previous)` otherwise; in both cases `previous` is the register pair
/// r0:r1 after the instruction.
///
/// `_success` and `_failure` are ignored because the CAS is always SeqCst
/// (see the comment in the body).
///
/// # Safety
///
/// `dst` must be 16-byte aligned; this is only checked by `debug_assert!`.
/// NOTE(review): presumably `dst` must also be valid for reads and writes; the
/// full contract is defined by the callers and is not visible in this file.
#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    debug_assert!(dst as usize % 16 == 0);
    let old = U128 { whole: old };
    let new = U128 { whole: new };
    let (prev_hi, prev_lo);
    let r;

    // SAFETY: the caller must uphold the safety contract.
    let prev = unsafe {
        // atomic CAS is always SeqCst.
        asm!(
            "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } }
            "ipm {r}",                  // r[:] = cc
            dst = in(reg) ptr_reg!(dst),
            // `r` is written only by the final instruction, after all inputs have been read.
            r = lateout(reg) r,
            // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
            inout("r0") old.pair.hi => prev_hi,
            inout("r1") old.pair.lo => prev_lo,
            in("r12") new.pair.hi,
            in("r13") new.pair.lo,
            // Do not use `preserves_flags` because CDSG modifies the condition code.
            options(nostack),
        );
        U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole
    };
    // extract_cc returns true when the condition code saved by `ipm` is 0 (success).
    if extract_cc(r) { Ok(prev) } else { Err(prev) }
}

// cdsg is always strong.
use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// 128-bit atomic load by two 64-bit atomic loads.
#[cfg(not(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
)))]
#[inline]
unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let (out_hi, out_lo);
        asm!(
            "lg {out_hi}, 8({src})", // atomic { out_hi = *src.byte_add(8) }
            "lg {out_lo}, 0({src})", // atomic { out_lo = *src }
            src = in(reg) src,
            out_hi = out(reg) out_hi,
            out_lo = out(reg) out_lo,
            options(pure, nostack, preserves_flags, readonly),
        );
        U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole
    }
}

#[cfg(not(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
)))]
#[inline(always)]
unsafe fn atomic_update<F>(dst: *mut u128, order: Ordering, mut f: F) -> u128
where
    F: FnMut(u128) -> u128,
{
    // Generic read-modify-write: keep applying `f` to the most recently
    // observed value until a compare-exchange installs the result, then return
    // the value that was replaced.
    //
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // The seed value is read as two separate 64-bit loads, so it is not a
        // single-copy atomic read of the whole 128 bits. That is fine because
        // the CAS below only succeeds if the value really was in memory.
        //
        // Note that the C++20 memory model does not allow mixed-sized atomic access,
        // so we must use inline assembly to implement byte_wise_atomic_load.
        // (i.e., byte-wise atomic based on the standard library's atomic types
        // cannot be used here).
        let mut observed = byte_wise_atomic_load(dst);
        loop {
            let desired = f(observed);
            let outcome =
                atomic_compare_exchange_weak(dst, observed, desired, order, Ordering::Relaxed);
            observed = match outcome {
                Ok(previous) => break previous,
                // Lost the race (or the seed was stale): retry with what CAS saw.
                Err(actual) => actual,
            };
        }
    }
}

/// Atomically replaces the 128-bit value at `dst` with `val` and returns the
/// previous value, using a CDSG retry loop.
///
/// `_order` is ignored because the swap is always SeqCst (see the comment in
/// the body).
///
/// # Safety
///
/// `dst` must be 16-byte aligned; this is only checked by `debug_assert!`.
/// NOTE(review): presumably `dst` must also be valid for reads and writes; the
/// full contract is defined by the callers and is not visible in this file.
#[inline]
unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    let val = U128 { whole: val };
    let (mut prev_hi, mut prev_lo);

    // SAFETY: the caller must uphold the safety contract.
    //
    // We could use atomic_update here, but using an inline assembly allows omitting
    // the comparison of results and the storing/comparing of condition flags.
    //
    // Do not use atomic_rmw_cas_3 because it needs extra LGR to implement swap.
    unsafe {
        // atomic swap is always SeqCst.
        asm!(
            // Optimistic preload of the expected value as two 64-bit loads.
            // It may be torn, which is fine because CDSG validates it. The
            // halves must match the layout CDSG compares against: r0 (even
            // register) holds the doubleword at offset 0 and r1 the one at
            // offset 8. Otherwise the first CDSG fails whenever the two halves
            // differ, costing an extra iteration.
            "lg %r0, 0({dst})",             // atomic { r0 = *dst }
            "lg %r1, 8({dst})",             // atomic { r1 = *dst.byte_add(8) }
            "2:", // 'retry:
                "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } }
                "jl 2b",                    // if cc == 1 { jump 'retry }
            dst = in(reg) ptr_reg!(dst),
            // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
            out("r0") prev_hi,
            out("r1") prev_lo,
            in("r12") val.pair.hi,
            in("r13") val.pair.lo,
            // Do not use `preserves_flags` because CDSG modifies the condition code.
            options(nostack),
        );
        U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole
    }
}

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// Defines `$name`, which returns the value that was stored at `dst` before
/// the update. `$op` is the asm text that computes the new value; it is placed
/// inside the retry loop and therefore runs again after every failed CDSG.
/// `$reg` is spliced into the operand list so that `$op` can request extra
/// operands (e.g. a scratch register).
///
/// `$op` can use the following registers:
/// - val_hi/val_lo pair: val argument (read-only for `$op`)
/// - r0/r1 pair: previous value loaded (read-only for `$op`)
/// - r12/r13 pair: new value that will be stored
// We could use atomic_update here, but using an inline assembly allows omitting
// the comparison of results and the storing/comparing of condition flags.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, [$($reg:tt)*], $($op:tt)*) => {
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            let val = U128 { whole: val };
            let (mut prev_hi, mut prev_lo);

            // SAFETY: the caller must uphold the safety contract.
            unsafe {
                // atomic RMW is always SeqCst.
                asm!(
                    // Optimistic preload as two 64-bit loads; a torn value is
                    // caught by CDSG. r0 (even register) must receive the
                    // doubleword at offset 0 and r1 the one at offset 8, which
                    // is the layout CDSG compares against. Otherwise the first
                    // iteration always fails unless both halves are equal.
                    "lg %r0, 0({dst})",             // atomic { r0 = *dst }
                    "lg %r1, 8({dst})",             // atomic { r1 = *dst.byte_add(8) }
                    "2:", // 'retry:
                        $($op)*
                        "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } }
                        "jl 2b",                    // if cc == 1 { jump 'retry }
                    dst = in(reg) ptr_reg!(dst),
                    val_hi = in(reg) val.pair.hi,
                    val_lo = in(reg) val.pair.lo,
                    $($reg)*
                    // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
                    out("r0") prev_hi,
                    out("r1") prev_lo,
                    // r12:r13 hold the new value computed by `$op`; they are scratch from the caller's view.
                    out("r12") _,
                    out("r13") _,
                    // Do not use `preserves_flags` because CDSG modifies the condition code.
                    options(nostack),
                );
                U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// Defines `$name`, which returns the value that was stored at `dst` before
/// the update. `$op` is the asm text that computes the new value; it is placed
/// inside the retry loop and therefore runs again after every failed CDSG.
/// `$reg` is spliced into the operand list so that `$op` can request extra
/// operands.
///
/// `$op` can use the following registers:
/// - r0/r1 pair: previous value loaded (read-only for `$op`)
/// - r12/r13 pair: new value that will be stored
// We could use atomic_update here, but using an inline assembly allows omitting
// the comparison of results and the storing/comparing of condition flags.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, [$($reg:tt)*], $($op:tt)*) => {
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            let (mut prev_hi, mut prev_lo);

            // SAFETY: the caller must uphold the safety contract.
            unsafe {
                // atomic RMW is always SeqCst.
                asm!(
                    // Optimistic preload as two 64-bit loads; a torn value is
                    // caught by CDSG. r0 (even register) must receive the
                    // doubleword at offset 0 and r1 the one at offset 8, which
                    // is the layout CDSG compares against. Otherwise the first
                    // iteration always fails unless both halves are equal.
                    "lg %r0, 0({dst})",             // atomic { r0 = *dst }
                    "lg %r1, 8({dst})",             // atomic { r1 = *dst.byte_add(8) }
                    "2:", // 'retry:
                        $($op)*
                        "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } }
                        "jl 2b",                    // if cc == 1 { jump 'retry }
                    dst = in(reg) ptr_reg!(dst),
                    $($reg)*
                    // Quadword atomic instructions work with even/odd pair of specified register and subsequent register.
                    out("r0") prev_hi,
                    out("r1") prev_lo,
                    // r12:r13 hold the new value computed by `$op`; they are scratch from the caller's view.
                    out("r12") _,
                    out("r13") _,
                    // Do not use `preserves_flags` because CDSG modifies the condition code.
                    options(nostack),
                );
                U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole
            }
        }
    };
}

// fetch_add: new = prev + val (128-bit wrapping add).
// The low halves are added first; the carry left in the condition code is then
// consumed by the add-with-carry on the high halves. The intervening `lgr` is
// relied upon not to disturb the condition code.
atomic_rmw_cas_3! {
    atomic_add, [],
    distinct_op!("algr", "%r13", "%r1", "{val_lo}"), // r13 = r1 + val_lo; cc = zero | carry
    "lgr %r12, %r0",                                 // r12 = r0
    "alcgr %r12, {val_hi}",                          // r12 += val_hi + carry
}
// fetch_sub: new = prev - val (128-bit wrapping subtract).
// The low halves are subtracted first; the borrow left in the condition code
// is then consumed by the subtract-with-borrow on the high halves. The
// intervening `lgr` is relied upon not to disturb the condition code.
atomic_rmw_cas_3! {
    atomic_sub, [],
    distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 - val_lo; cc = zero | borrow
    "lgr %r12, %r0",                                 // r12 = r0
    "slbgr %r12, {val_hi}",                          // r12 -= val_hi + borrow
}
// fetch_and: new = prev & val, computed independently on each 64-bit half.
atomic_rmw_cas_3! {
    atomic_and, [],
    distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo
    distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi
}

// fetch_nand: new = !(prev & val), computed independently on each 64-bit half.
//
// Use nngrk on z15 or later.
#[cfg(any(
    target_feature = "miscellaneous-extensions-3",
    portable_atomic_target_feature = "miscellaneous-extensions-3",
))]
atomic_rmw_cas_3! {
    atomic_nand, [],
    "nngrk %r13, %r1, {val_lo}", // r13 = !(r1 & val_lo)
    "nngrk %r12, %r0, {val_hi}", // r12 = !(r0 & val_hi)
}
// Without nngrk: AND first, then form the bitwise NOT as (-x) - 1, since
// two's-complement negation is !x + 1.
#[cfg(not(any(
    target_feature = "miscellaneous-extensions-3",
    portable_atomic_target_feature = "miscellaneous-extensions-3",
)))]
atomic_rmw_cas_3! {
    atomic_nand, [],
    distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo
    distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi
    "lcgr %r13, %r13",                              // r13 = !r13 + 1
    "aghi %r13, -1",                                // r13 -= 1
    "lcgr %r12, %r12",                              // r12 = !r12 + 1
    "aghi %r12, -1",                                // r12 -= 1
}

// fetch_or: new = prev | val, computed independently on each 64-bit half.
atomic_rmw_cas_3! {
    atomic_or, [],
    distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), // r13 = r1 | val_lo
    distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), // r12 = r0 | val_hi
}
// fetch_xor: new = prev ^ val, computed independently on each 64-bit half.
atomic_rmw_cas_3! {
    atomic_xor, [],
    distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 ^ val_lo
    distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), // r12 = r0 ^ val_hi
}

// Signed fetch_max: new = max(prev, val) as i128.
//
// The high halves are compared as signed and decide the result unless they are
// equal, in which case the unsigned comparison of the low halves decides.
// r12 is first used as a temporary for "max of the low halves" and only
// receives the selected high half in the last step. The select/move
// instructions emitted by `select_op!` are relied upon to leave the condition
// code untouched, so that several selections can follow a single compare.
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
atomic_rmw_cas_3! {
    atomic_max, [],
    "clgr %r1, {val_lo}",                       // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 }
    select_op!("h", "%r12", "%r1", "{val_lo}"), // if cc == 2 { r12 = r1 } else { r12 = val_lo }
    "cgr %r0, {val_hi}",                        // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 }
    select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo }
    "locgre %r13, %r12",                        // if cc == 0 { r13 = r12 }
    select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi }
}
// Unsigned fetch_max: new = max(prev, val) as u128.
//
// `tmp` (an extra scratch register requested through the `$reg` list) keeps
// "max of the low halves". Both halves are then selected according to the
// unsigned comparison of the high halves. Finally the high halves are compared
// once more, and if they are equal the low half is replaced by `tmp`. Only the
// "equal" outcome of that last compare is used, so its signedness does not
// matter.
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
atomic_rmw_cas_3! {
    atomic_umax, [tmp = out(reg) _,],
    "clgr %r1, {val_lo}",                        // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 }
    select_op!("h", "{tmp}", "%r1", "{val_lo}"), // if cc == 2 { tmp = r1 } else { tmp = val_lo }
    "clgr %r0, {val_hi}",                        // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 }
    select_op!("h", "%r12", "%r0", "{val_hi}"),  // if cc == 2 { r12 = r0 } else { r12 = val_hi }
    select_op!("h", "%r13", "%r1", "{val_lo}"),  // if cc == 2 { r13 = r1 } else { r13 = val_lo }
    "cgr %r0, {val_hi}",                         // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 }
    "locgre %r13, {tmp}",                        // if cc == 0 { r13 = tmp }
}
// Signed fetch_min: new = min(prev, val) as i128.
//
// Mirror image of the signed max sequence, selecting on "low" (cc == 1)
// instead of "high". The high halves are compared as signed and decide the
// result unless they are equal, in which case the unsigned comparison of the
// low halves decides. r12 is first used as a temporary for "min of the low
// halves" and only receives the selected high half in the last step.
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
atomic_rmw_cas_3! {
    atomic_min, [],
    "clgr %r1, {val_lo}",                       // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 }
    select_op!("l", "%r12", "%r1", "{val_lo}"), // if cc == 1 { r12 = r1 } else { r12 = val_lo }
    "cgr %r0, {val_hi}",                        // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 }
    select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo }
    "locgre %r13, %r12",                        // if cc == 0 { r13 = r12 }
    select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi }
}
// Unsigned fetch_min: new = min(prev, val) as u128.
//
// Mirror image of the unsigned max sequence, selecting on "low" (cc == 1)
// instead of "high". `tmp` keeps "min of the low halves" and replaces the low
// half when the final compare reports that the high halves are equal. Only the
// "equal" outcome of that last compare is used, so its signedness does not
// matter.
#[cfg(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
))]
atomic_rmw_cas_3! {
    atomic_umin, [tmp = out(reg) _,],
    "clgr %r1, {val_lo}",                        // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 }
    select_op!("l", "{tmp}", "%r1", "{val_lo}"), // if cc == 1 { tmp = r1 } else { tmp = val_lo }
    "clgr %r0, {val_hi}",                        // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 }
    select_op!("l", "%r12", "%r0", "{val_hi}"),  // if cc == 1 { r12 = r0 } else { r12 = val_hi }
    select_op!("l", "%r13", "%r1", "{val_lo}"),  // if cc == 1 { r13 = r1 } else { r13 = val_lo }
    "cgr %r0, {val_hi}",                         // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 }
    "locgre %r13, {tmp}",                        // if cc == 0 { r13 = tmp }
}
// We use atomic_update for atomic min/max on pre-z196 because
// z10 doesn't seem to have a good way to implement 128-bit min/max.
// loc{,g}r requires z196 or later.
// https://godbolt.org/z/EqoMEP8b3
//
// NOTE(review): `atomic_rmw_by_atomic_update!` is not defined in this file
// (presumably it comes from the included macros.rs); with the `cmp` argument it
// is expected to provide atomic_max/atomic_umax/atomic_min/atomic_umin on top
// of `atomic_update`. Confirm against the macro definition.
#[cfg(not(any(
    target_feature = "load-store-on-cond",
    portable_atomic_target_feature = "load-store-on-cond",
)))]
atomic_rmw_by_atomic_update!(cmp);

// fetch_not: new = !prev, computed independently on each 64-bit half.
// The bitwise NOT is formed as (-x) - 1, since two's-complement negation is !x + 1.
atomic_rmw_cas_2! {
    atomic_not, [],
    "lcgr %r13, %r1", // r13 = !r1 + 1
    "aghi %r13, -1",  // r13 -= 1
    "lcgr %r12, %r0", // r12 = !r0 + 1
    "aghi %r12, -1",  // r12 -= 1
}

// fetch_neg: new = 0 - prev (128-bit wrapping negation).
// The low half is subtracted from zero first; the borrow left in the condition
// code is then consumed by the subtract-with-borrow on the high half. The
// intervening `lghi` is relied upon not to disturb the condition code.
//
// With distinct-ops, a register holding zero is passed in so that the
// three-operand subtract can write r13 directly.
#[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))]
atomic_rmw_cas_2! {
    atomic_neg, [zero = in(reg) 0_u64,],
    "slgrk %r13, {zero}, %r1", // r13 = 0 - r1; cc = zero | borrow
    "lghi %r12, 0",            // r12 = 0
    "slbgr %r12, %r0",         // r12 -= r0 + borrow
}
// Without distinct-ops, r13 is zeroed explicitly before the two-operand subtract.
#[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))]
atomic_rmw_cas_2! {
    atomic_neg, [],
    "lghi %r13, 0",    // r13 = 0
    "slgr %r13, %r1",  // r13 -= r1; cc = zero | borrow
    "lghi %r12, 0",    // r12 = 0
    "slbgr %r12, %r0", // r12 -= r0 + borrow
}

/// Reports whether the 128-bit atomics in this module are lock-free.
///
/// Always returns `IS_ALWAYS_LOCK_FREE`, i.e. `true`; no runtime detection is
/// performed.
#[inline]
const fn is_lock_free() -> bool {
    IS_ALWAYS_LOCK_FREE
}
/// Compile-time lock-freedom flag for this implementation; unconditionally `true`.
const IS_ALWAYS_LOCK_FREE: bool = true;

// Instantiate the signed and unsigned 128-bit atomic types. The last two
// arguments name the max/min functions to use for each type: the signed
// variants for i128 and the unsigned variants for u128.
// NOTE(review): `atomic128!` is not defined in this file (presumably it comes
// from the included macros.rs); confirm how it wires up the other functions.
atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

// Unit tests, generated by test macros that are defined outside this file.
#[cfg(test)]
mod tests {
    use super::*;

    // Functional tests for both the signed and the unsigned type.
    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}