From 1eb3f5d6f332ebb8d84aea594bea9d4d5f0ffa72 Mon Sep 17 00:00:00 2001 From: Taiki Endo Date: Thu, 4 Aug 2022 01:49:12 +0900 Subject: [PATCH] aarch64: Use CASP instead of LDXP/STXP for load if available --- .cirrus.yml | 33 ++++++++++++++++++++++++++++++--- CHANGELOG.md | 2 ++ bench/benches/bench.rs | 24 ++++++++++++++++++++++++ src/imp/atomic128/aarch64.rs | 24 +++++++++++++++++------- 4 files changed, 73 insertions(+), 10 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 192223cd6..0431fe502 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -8,7 +8,7 @@ env: RUSTFLAGS: -D warnings RUSTUP_MAX_RETRIES: '10' -aarch64_linux_task: +aarch64_linux_test_task: name: test (aarch64-unknown-linux-gnu) env: TARGET: aarch64-unknown-linux-gnu @@ -28,7 +28,7 @@ aarch64_linux_task: - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET # TODO: lse2 is not available on Graviton2 (armv8.2-a) -aarch64_macos_task: +aarch64_macos_test_task: name: test (aarch64-apple-darwin) env: TARGET: aarch64-apple-darwin @@ -43,7 +43,7 @@ aarch64_macos_task: # Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24 - CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET -valgrind_task: +aarch64_linux_valgrind_task: name: valgrind (aarch64-unknown-linux-gnu) env: CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER: valgrind -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes @@ -80,3 +80,30 @@ valgrind_task: # Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing 
miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24 - RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET # TODO: lse2 is not available on Graviton2 (armv8.2-a) + +# aarch64_linux_bench_task: +# name: bench (aarch64-unknown-linux-gnu) +# env: +# TARGET: aarch64-unknown-linux-gnu +# arm_container: +# image: rust:latest +# cpu: 4 +# memory: 12G +# setup_script: +# - rustup toolchain add nightly && rustup default nightly +# test_script: +# - cargo bench -vv --manifest-path bench/Cargo.toml +# - RUSTFLAGS="${RUSTFLAGS} -C target-feature=+lse" cargo bench -vv --manifest-path bench/Cargo.toml + +# aarch64_macos_bench_task: +# name: bench (aarch64-apple-darwin) +# env: +# TARGET: aarch64-apple-darwin +# macos_instance: +# image: ghcr.io/cirruslabs/macos-monterey-xcode:latest +# setup_script: +# - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain nightly --component rust-src +# test_script: +# - sysctl -a | grep machdep.cpu +# - source $HOME/.cargo/env +# - cargo bench -vv --manifest-path bench/Cargo.toml diff --git a/CHANGELOG.md b/CHANGELOG.md index 54dcecceb..bcd275ee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Note: In this file, do not use the hard wrap in the middle of a sentence for com ## [Unreleased] +- Optimize aarch64 128-bit load. ([#20](https://github.com/taiki-e/portable-atomic/pull/20)) + ## [0.3.9] - 2022-08-03 - Fix build error on old Miri. 
diff --git a/bench/benches/bench.rs b/bench/benches/bench.rs index ecc67f3b8..ace1b790a 100644 --- a/bench/benches/bench.rs +++ b/bench/benches/bench.rs @@ -233,6 +233,27 @@ fn bench_concurrent_store_swap, T: Copy + From>() -> A { }); a } +fn bench_concurrent_fetch_add, T: Copy + From>() -> A { + let a = black_box(A::new(T::from(1))); + let barrier = Barrier::new(THREADS * 2); + thread::scope(|s| { + for _ in 0..THREADS { + s.spawn(|| { + barrier.wait(); + for i in 0..N { + let _ = black_box(a.fetch_add(T::from(i))); + } + }); + s.spawn(|| { + barrier.wait(); + for i in (0..N).rev() { + let _ = black_box(a.fetch_add(T::from(i))); + } + }); + } + }); + a +} macro_rules! benches { ($name:ident, $atomic_u128:path) => { @@ -273,6 +294,9 @@ macro_rules! benches { g.bench_function("u128_concurrent_store_swap", |b| { b.iter(bench_concurrent_store_swap::); }); + g.bench_function("u128_concurrent_fetch_add", |b| { + b.iter(bench_concurrent_fetch_add::); + }); } }; } diff --git a/src/imp/atomic128/aarch64.rs b/src/imp/atomic128/aarch64.rs index bfb0fb3da..f602e773c 100644 --- a/src/imp/atomic128/aarch64.rs +++ b/src/imp/atomic128/aarch64.rs @@ -25,9 +25,9 @@ // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit // // Generated asm: -// - aarch64 https://godbolt.org/z/x6qMff94K -// - aarch64 (+lse) https://godbolt.org/z/9MsnrhEKr -// - aarch64 (+lse,+lse2) https://godbolt.org/z/orhr74bqT +// - aarch64 https://godbolt.org/z/85YK7K7Ye +// - aarch64 (+lse) https://godbolt.org/z/neWhfadn4 +// - aarch64 (+lse,+lse2) https://godbolt.org/z/nGazzqx9e include!("macros.rs"); @@ -196,14 +196,16 @@ unsafe fn _casp(dst: *mut u128, old: u128, new: u128, order: Ordering) -> u128 { } } +// If the CPU supports FEAT_LSE2, LDP is a single-copy atomic read; +// otherwise, it is two single-copy atomic reads. 
+// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile #[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))] #[inline] unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 { debug_assert!(src as usize % 16 == 0); // SAFETY: the caller must guarantee that `dst` is valid for reads, - // 16-byte aligned, that there are no concurrent non-atomic operations, - // and the CPU supports FEAT_LSE2. + // 16-byte aligned, and that there are no concurrent non-atomic operations. // // Refs: // - LDP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/LDP @@ -231,14 +233,16 @@ unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 { } } +// If the CPU supports FEAT_LSE2, STP is a single-copy atomic write; +// otherwise, it is two single-copy atomic writes. +// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile #[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))] #[inline] unsafe fn _stp(dst: *mut u128, val: u128, order: Ordering) { debug_assert!(dst as usize % 16 == 0); // SAFETY: the caller must guarantee that `dst` is valid for writes, - // 16-byte aligned, that there are no concurrent non-atomic operations, - // and the CPU supports FEAT_LSE2. + // 16-byte aligned, and that there are no concurrent non-atomic operations. // // Refs: // - STP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/STP @@ -388,6 +392,12 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 { // SAFETY: the caller must uphold the safety contract for `atomic_load`. // cfg guarantee that the CPU supports FEAT_LSE2. () => unsafe { _ldp(src, order) }, + #[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))] + #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))] + // SAFETY: the caller must uphold the safety contract for `atomic_load`. 
+ // cfg guarantees that the CPU supports FEAT_LSE. + () => unsafe { _casp(src, 0, 0, order) }, + #[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))] #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))] // SAFETY: the caller must uphold the safety contract for `atomic_load`. () => unsafe { _compare_exchange_ldxp_stxp(src, 0, 0, order) },