// SPDX-License-Identifier: Apache-2.0 OR MIT /* 128-bit atomic implementation on s390x. This architecture provides the following 128-bit atomic instructions: - LPQ/STPQ: load/store (arch1 or later, i.e., baseline) - CDSG: CAS (arch1 or later, i.e., baseline) See "Atomic operation overview by architecture" in atomic-maybe-uninit for a more comprehensive and detailed description of the atomic and synchronize instructions in this architecture: https://github.com/taiki-e/atomic-maybe-uninit/blob/HEAD/src/arch/README.md#s390x LLVM's minimal supported architecture level is arch8 (z10): https://github.com/llvm/llvm-project/blob/llvmorg-22.1.0-rc1/llvm/lib/Target/SystemZ/SystemZProcessors.td#L16-L17 This does not appear to have changed since the current s390x backend was added in LLVM 3.3: https://github.com/llvm/llvm-project/commit/5f613dfd1f7edb0ae95d521b7107b582d9df5103#diff-cbaef692b3958312e80fd5507a7e2aff071f1acb086f10e8a96bc06a7bb289db Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use this module and use intrinsics.rs instead. Refs: - z/Architecture Principles of Operation, Fifteenth Edition (SA22-7832-14) https://www.ibm.com/docs/en/module_1678991624569/pdf/SA22-7832-14.pdf - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit See tests/asm-test/asm/portable-atomic for generated assembly. */ include!("macros.rs"); use core::{arch::asm, sync::atomic::Ordering}; use crate::utils::{Pair, U128}; // bcr 14,0 requires fast-BCR-serialization facility added in arch9 (z196). #[cfg(any( target_feature = "fast-serialization", portable_atomic_target_feature = "fast-serialization", ))] macro_rules! serialization { () => { "bcr 14, 0" }; } #[cfg(not(any( target_feature = "fast-serialization", portable_atomic_target_feature = "fast-serialization", )))] macro_rules! serialization { () => { "bcr 15, 0" }; } // Use distinct operands on z196 or later, otherwise split to lgr and $op. #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] macro_rules! distinct_op { ($op:tt, $a0:tt, $a1:tt, $a2:tt) => { concat!($op, "k ", $a0, ", ", $a1, ", ", $a2) }; } #[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))] macro_rules! distinct_op { ($op:tt, $a0:tt, $a1:tt, $a2:tt) => { concat!("lgr ", $a0, ", ", $a1, "\n", $op, " ", $a0, ", ", $a2) }; } // Use selgr$cond on z15 or later, otherwise split to locgr$cond and $op. #[cfg(any( target_feature = "miscellaneous-extensions-3", portable_atomic_target_feature = "miscellaneous-extensions-3", ))] #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] macro_rules! select_op { ($cond:tt, $a0:tt, $a1:tt, $a2:tt) => { concat!("selgr", $cond, " ", $a0, ", ", $a1, ", ", $a2) }; } #[cfg(not(any( target_feature = "miscellaneous-extensions-3", portable_atomic_target_feature = "miscellaneous-extensions-3", )))] #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] macro_rules! select_op { ($cond:tt, $a0:tt, $a1:tt, $a2:tt) => { concat!("lgr ", $a0, ", ", $a2, "\n", "locgr", $cond, " ", $a0, ", ", $a1) }; } // Extracts and checks condition code. #[inline] fn extract_cc(r: i64) -> bool { r.wrapping_add(-268435456) & (1 << 31) != 0 } #[inline] unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. unsafe { // atomic load is always SeqCst. asm!( "lpq %r0, 0({src})", // atomic { r0:r1 = *src } src = in(reg) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") out_hi, out("r1") out_lo, options(nostack, preserves_flags), ); U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole } } #[inline] unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); let val = U128 { whole: val }; // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! atomic_store { ($acquire:expr) => { asm!( "stpq %r0, 0({dst})", // atomic { *dst = r0:r1 } $acquire, // fence dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. in("r0") val.pair.hi, in("r1") val.pair.lo, options(nostack, preserves_flags), ) }; } match order { // Relaxed and Release stores are equivalent. Ordering::Relaxed | Ordering::Release => atomic_store!(""), Ordering::SeqCst => atomic_store!(serialization!()), _ => unreachable!(), } } } #[inline] unsafe fn atomic_compare_exchange( dst: *mut u128, old: u128, new: u128, _success: Ordering, _failure: Ordering, ) -> Result { debug_assert!(dst as usize % 16 == 0); let old = U128 { whole: old }; let new = U128 { whole: new }; let (prev_hi, prev_lo); let r; // SAFETY: the caller must uphold the safety contract. let prev = unsafe { // atomic CAS is always SeqCst. asm!( "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:13 } else { cc = 1; r0:r1 = *dst } } "ipm {r}", // r[:] = cc dst = in(reg) ptr_reg!(dst), r = lateout(reg) r, // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. inout("r0") old.pair.hi => prev_hi, inout("r1") old.pair.lo => prev_lo, in("r12") new.pair.hi, in("r13") new.pair.lo, // Do not use `preserves_flags` because CDSG modifies the condition code. options(nostack), ); U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole }; if extract_cc(r) { Ok(prev) } else { Err(prev) } } // cdsg is always strong. use self::atomic_compare_exchange as atomic_compare_exchange_weak; // 128-bit atomic load by two 64-bit atomic loads. #[cfg(not(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", )))] #[inline] unsafe fn byte_wise_atomic_load(src: *const u128) -> u128 { // SAFETY: the caller must uphold the safety contract. unsafe { let (out_hi, out_lo); asm!( "lg {out_hi}, 8({src})", // atomic { out_hi = *src.byte_add(8) } "lg {out_lo}, 0({src})", // atomic { out_lo = *src } src = in(reg) src, out_hi = out(reg) out_hi, out_lo = out(reg) out_lo, options(pure, nostack, preserves_flags, readonly), ); U128 { pair: Pair { hi: out_hi, lo: out_lo } }.whole } } #[cfg(not(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", )))] #[inline(always)] unsafe fn atomic_update(dst: *mut u128, order: Ordering, mut f: F) -> u128 where F: FnMut(u128) -> u128, { // SAFETY: the caller must uphold the safety contract. unsafe { // This is not single-copy atomic reads, but this is ok because subsequent // CAS will check for consistency. // // Note that the C++20 memory model does not allow mixed-sized atomic access, // so we must use inline assembly to implement byte_wise_atomic_load. // (i.e., byte-wise atomic based on the standard library's atomic types // cannot be used here). let mut prev = byte_wise_atomic_load(dst); loop { let next = f(prev); match atomic_compare_exchange_weak(dst, prev, next, order, Ordering::Relaxed) { Ok(x) => return x, Err(x) => prev = x, } } } } #[inline] unsafe fn atomic_swap(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); let val = U128 { whole: val }; let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. // // We could use atomic_update here, but using an inline assembly allows omitting // the comparison of results and the storing/comparing of condition flags. // // Do not use atomic_rmw_cas_3 because it needs extra LGR to implement swap. unsafe { // atomic swap is always SeqCst. asm!( "lg %r0, 8({dst})", // atomic { r0 = *dst.byte_add(8) } "lg %r1, 0({dst})", // atomic { r1 = *dst } "2:", // 'retry: "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") prev_hi, out("r1") prev_lo, in("r12") val.pair.hi, in("r13") val.pair.lo, // Do not use `preserves_flags` because CDSG modifies the condition code. options(nostack), ); U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole } } /// Atomic RMW by CAS loop (3 arguments) /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;` /// /// `$op` can use the following registers: /// - val_hi/val_lo pair: val argument (read-only for `$op`) /// - r0/r1 pair: previous value loaded (read-only for `$op`) /// - r12/r13 pair: new value that will be stored // We could use atomic_update here, but using an inline assembly allows omitting // the comparison of results and the storing/comparing of condition flags. macro_rules! atomic_rmw_cas_3 { ($name:ident, [$($reg:tt)*], $($op:tt)*) => { #[inline] unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); let val = U128 { whole: val }; let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. asm!( "lg %r0, 8({dst})", // atomic { r0 = *dst.byte_add(8) } "lg %r1, 0({dst})", // atomic { r1 = *dst } "2:", // 'retry: $($op)* "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), val_hi = in(reg) val.pair.hi, val_lo = in(reg) val.pair.lo, $($reg)* // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") prev_hi, out("r1") prev_lo, out("r12") _, out("r13") _, // Do not use `preserves_flags` because CDSG modifies the condition code. options(nostack), ); U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole } } }; } /// Atomic RMW by CAS loop (2 arguments) /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;` /// /// `$op` can use the following registers: /// - r0/r1 pair: previous value loaded (read-only for `$op`) /// - r12/r13 pair: new value that will be stored // We could use atomic_update here, but using an inline assembly allows omitting // the comparison of results and the storing/comparing of condition flags. macro_rules! atomic_rmw_cas_2 { ($name:ident, [$($reg:tt)*], $($op:tt)*) => { #[inline] unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 { debug_assert!(dst as usize % 16 == 0); let (mut prev_hi, mut prev_lo); // SAFETY: the caller must uphold the safety contract. unsafe { // atomic RMW is always SeqCst. asm!( "lg %r0, 8({dst})", // atomic { r0 = *dst.byte_add(8) } "lg %r1, 0({dst})", // atomic { r1 = *dst } "2:", // 'retry: $($op)* "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:r13 } else { cc = 1; r0:r1 = *dst } } "jl 2b", // if cc == 1 { jump 'retry } dst = in(reg) ptr_reg!(dst), $($reg)* // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. out("r0") prev_hi, out("r1") prev_lo, out("r12") _, out("r13") _, // Do not use `preserves_flags` because CDSG modifies the condition code. options(nostack), ); U128 { pair: Pair { hi: prev_hi, lo: prev_lo } }.whole } } }; } atomic_rmw_cas_3! { atomic_add, [], distinct_op!("algr", "%r13", "%r1", "{val_lo}"), // r13 = r1 + val_lo; cc = zero | carry "lgr %r12, %r0", // r12 = r0 "alcgr %r12, {val_hi}", // r12 += val_hi + carry } atomic_rmw_cas_3! { atomic_sub, [], distinct_op!("slgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 - val_lo; cc = zero | borrow "lgr %r12, %r0", // r12 = r0 "slbgr %r12, {val_hi}", // r12 -= val_hi + borrow } atomic_rmw_cas_3! { atomic_and, [], distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi } // Use nngrk on z15 or later. #[cfg(any( target_feature = "miscellaneous-extensions-3", portable_atomic_target_feature = "miscellaneous-extensions-3", ))] atomic_rmw_cas_3! { atomic_nand, [], "nngrk %r13, %r1, {val_lo}", // r13 = !(r1 & val_lo) "nngrk %r12, %r0, {val_hi}", // r12 = !(r0 & val_hi) } #[cfg(not(any( target_feature = "miscellaneous-extensions-3", portable_atomic_target_feature = "miscellaneous-extensions-3", )))] atomic_rmw_cas_3! { atomic_nand, [], distinct_op!("ngr", "%r13", "%r1", "{val_lo}"), // r13 = r1 & val_lo distinct_op!("ngr", "%r12", "%r0", "{val_hi}"), // r12 = r0 & val_hi "lcgr %r13, %r13", // r13 = !r13 + 1 "aghi %r13, -1", // r13 -= 1 "lcgr %r12, %r12", // r12 = !r12 + 1 "aghi %r12, -1", // r12 -= 1 } atomic_rmw_cas_3! { atomic_or, [], distinct_op!("ogr", "%r13", "%r1", "{val_lo}"), // r13 = r1 | val_lo distinct_op!("ogr", "%r12", "%r0", "{val_hi}"), // r12 = r0 | val_hi } atomic_rmw_cas_3! { atomic_xor, [], distinct_op!("xgr", "%r13", "%r1", "{val_lo}"), // r13 = r1 ^ val_lo distinct_op!("xgr", "%r12", "%r0", "{val_hi}"), // r12 = r0 ^ val_hi } #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] atomic_rmw_cas_3! { atomic_max, [], "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } select_op!("h", "%r12", "%r1", "{val_lo}"), // if cc == 2 { r12 = r1 } else { r12 = val_lo } "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } "locgre %r13, %r12", // if cc == 0 { r13 = r12 } select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] atomic_rmw_cas_3! { atomic_umax, [tmp = out(reg) _,], "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } select_op!("h", "{tmp}", "%r1", "{val_lo}"), // if cc == 2 { tmp = r1 } else { tmp = val_lo } "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } select_op!("h", "%r12", "%r0", "{val_hi}"), // if cc == 2 { r12 = r0 } else { r12 = val_hi } select_op!("h", "%r13", "%r1", "{val_lo}"), // if cc == 2 { r13 = r1 } else { r13 = val_lo } "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] atomic_rmw_cas_3! { atomic_min, [], "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } select_op!("l", "%r12", "%r1", "{val_lo}"), // if cc == 1 { r12 = r1 } else { r12 = val_lo } "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } "locgre %r13, %r12", // if cc == 0 { r13 = r12 } select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } } #[cfg(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", ))] atomic_rmw_cas_3! { atomic_umin, [tmp = out(reg) _,], "clgr %r1, {val_lo}", // if r1(u) < val_lo(u) { cc = 1 } else if r1(u) > val_lo(u) { cc = 2 } else { cc = 0 } select_op!("l", "{tmp}", "%r1", "{val_lo}"), // if cc == 1 { tmp = r1 } else { tmp = val_lo } "clgr %r0, {val_hi}", // if r0(u) < val_hi(u) { cc = 1 } else if r0(u) > val_hi(u) { cc = 2 } else { cc = 0 } select_op!("l", "%r12", "%r0", "{val_hi}"), // if cc == 1 { r12 = r0 } else { r12 = val_hi } select_op!("l", "%r13", "%r1", "{val_lo}"), // if cc == 1 { r13 = r1 } else { r13 = val_lo } "cgr %r0, {val_hi}", // if r0(i) < val_hi(i) { cc = 1 } else if r0(i) > val_hi(i) { cc = 2 } else { cc = 0 } "locgre %r13, {tmp}", // if cc == 0 { r13 = tmp } } // We use atomic_update for atomic min/max on pre-z196 because // z10 doesn't seem to have a good way to implement 128-bit min/max. // loc{,g}r requires z196 or later. // https://godbolt.org/z/EqoMEP8b3 #[cfg(not(any( target_feature = "load-store-on-cond", portable_atomic_target_feature = "load-store-on-cond", )))] atomic_rmw_by_atomic_update!(cmp); atomic_rmw_cas_2! { atomic_not, [], "lcgr %r13, %r1", // r13 = !r1 + 1 "aghi %r13, -1", // r13 -= 1 "lcgr %r12, %r0", // r12 = !r0 + 1 "aghi %r12, -1", // r12 -= 1 } #[cfg(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops"))] atomic_rmw_cas_2! { atomic_neg, [zero = in(reg) 0_u64,], "slgrk %r13, {zero}, %r1", // r13 = 0 - r1; cc = zero | borrow "lghi %r12, 0", // r12 = 0 "slbgr %r12, %r0", // r12 -= r0 + borrow } #[cfg(not(any(target_feature = "distinct-ops", portable_atomic_target_feature = "distinct-ops")))] atomic_rmw_cas_2! { atomic_neg, [], "lghi %r13, 0", // r13 = 0 "slgr %r13, %r1", // r13 -= r1; cc = zero | borrow "lghi %r12, 0", // r12 = 0 "slbgr %r12, %r0", // r12 -= r0 + borrow } #[inline] const fn is_lock_free() -> bool { IS_ALWAYS_LOCK_FREE } const IS_ALWAYS_LOCK_FREE: bool = true; atomic128!(AtomicI128, i128, atomic_max, atomic_min); atomic128!(AtomicU128, u128, atomic_umax, atomic_umin); #[cfg(test)] mod tests { use super::*; test_atomic_int!(i128); test_atomic_int!(u128); // load/store/swap implementation is not affected by signedness, so it is // enough to test only unsigned types. stress_test!(u128); }