Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ env:
RUSTFLAGS: -D warnings
RUSTUP_MAX_RETRIES: '10'

aarch64_linux_task:
aarch64_linux_test_task:
name: test (aarch64-unknown-linux-gnu)
env:
TARGET: aarch64-unknown-linux-gnu
Expand All @@ -28,7 +28,7 @@ aarch64_linux_task:
- RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET
# TODO: lse2 is not available on Graviton2 (armv8.2-a)

aarch64_macos_task:
aarch64_macos_test_task:
name: test (aarch64-apple-darwin)
env:
TARGET: aarch64-apple-darwin
Expand All @@ -43,7 +43,7 @@ aarch64_macos_task:
# Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24
- CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET

valgrind_task:
aarch64_linux_valgrind_task:
name: valgrind (aarch64-unknown-linux-gnu)
env:
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER: valgrind -v --error-exitcode=1 --error-limit=no --leak-check=full --show-leak-kinds=all --track-origins=yes
Expand Down Expand Up @@ -80,3 +80,30 @@ valgrind_task:
# Use -Z build-std because the prebuilt libtest seems to be incompatible with LTO, causing miscompilation: https://gist.github.com/taiki-e/9713f8e02e8f9f852ccee8d6f089ec24
- RUSTFLAGS="$RUSTFLAGS -C target-feature=+lse" RUSTDOCFLAGS="$RUSTDOCFLAGS -C target-feature=+lse" CARGO_PROFILE_RELEASE_CODEGEN_UNITS=1 CARGO_PROFILE_RELEASE_LTO=fat cargo -Z build-std test -vv --workspace --exclude bench --all-features --release --tests --target $TARGET
# TODO: lse2 is not available on Graviton2 (armv8.2-a)

# aarch64_linux_bench_task:
# name: bench (aarch64-unknown-linux-gnu)
# env:
# TARGET: aarch64-unknown-linux-gnu
# arm_container:
# image: rust:latest
# cpu: 4
# memory: 12G
# setup_script:
# - rustup toolchain add nightly && rustup default nightly
# test_script:
# - cargo bench -vv --manifest-path bench/Cargo.toml
# - RUSTFLAGS="${RUSTFLAGS} -C target-feature=+lse" cargo bench -vv --manifest-path bench/Cargo.toml

# aarch64_macos_bench_task:
# name: bench (aarch64-apple-darwin)
# env:
# TARGET: aarch64-apple-darwin
# macos_instance:
# image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
# setup_script:
# - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain nightly --component rust-src
# test_script:
# - sysctl -a | grep machdep.cpu
# - source $HOME/.cargo/env
# - cargo bench -vv --manifest-path bench/Cargo.toml
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Note: In this file, do not use the hard wrap in the middle of a sentence for com

## [Unreleased]

- Optimize aarch64 128-bit load. ([#20](https://github.com/taiki-e/portable-atomic/pull/20))

## [0.3.9] - 2022-08-03

- Fix build error on old Miri.
Expand Down
24 changes: 24 additions & 0 deletions bench/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,27 @@ fn bench_concurrent_store_swap<A: AtomicInt<T>, T: Copy + From<u32>>() -> A {
});
a
}
/// Benchmark body: contended `fetch_add` on a single shared atomic.
///
/// The atomic starts at `T::from(1)`. For each of the `THREADS` iterations two
/// scoped threads are spawned; every thread first waits on a shared barrier so
/// that all `THREADS * 2` workers begin together. One worker of each pair adds
/// the values `0..N` in ascending order, the other adds them in descending
/// order. The atomic is returned once all workers have finished.
fn bench_concurrent_fetch_add<A: AtomicInt<T>, T: Copy + From<u32>>() -> A {
    let counter = black_box(A::new(T::from(1)));
    // Two workers are spawned per iteration below, hence `THREADS * 2` parties.
    let start_gate = Barrier::new(THREADS * 2);
    thread::scope(|scope| {
        for _ in 0..THREADS {
            // Worker adding 0, 1, ..., N - 1.
            scope.spawn(|| {
                start_gate.wait();
                (0..N).for_each(|delta| {
                    let _ = black_box(counter.fetch_add(T::from(delta)));
                });
            });
            // Worker adding N - 1, ..., 1, 0.
            scope.spawn(|| {
                start_gate.wait();
                (0..N).rev().for_each(|delta| {
                    let _ = black_box(counter.fetch_add(T::from(delta)));
                });
            });
        }
    });
    counter
}

macro_rules! benches {
($name:ident, $atomic_u128:path) => {
Expand Down Expand Up @@ -273,6 +294,9 @@ macro_rules! benches {
g.bench_function("u128_concurrent_store_swap", |b| {
b.iter(bench_concurrent_store_swap::<A, u128>);
});
g.bench_function("u128_concurrent_fetch_add", |b| {
b.iter(bench_concurrent_fetch_add::<A, u128>);
});
}
};
}
Expand Down
24 changes: 17 additions & 7 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
// - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
//
// Generated asm:
// - aarch64 https://godbolt.org/z/x6qMff94K
// - aarch64 (+lse) https://godbolt.org/z/9MsnrhEKr
// - aarch64 (+lse,+lse2) https://godbolt.org/z/orhr74bqT
// - aarch64 https://godbolt.org/z/85YK7K7Ye
// - aarch64 (+lse) https://godbolt.org/z/neWhfadn4
// - aarch64 (+lse,+lse2) https://godbolt.org/z/nGazzqx9e

include!("macros.rs");

Expand Down Expand Up @@ -196,14 +196,16 @@ unsafe fn _casp(dst: *mut u128, old: u128, new: u128, order: Ordering) -> u128 {
}
}

// If the CPU supports FEAT_LSE2, LDP is a single-copy atomic read;
// otherwise it is two single-copy atomic reads.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))]
#[inline]
unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 {
debug_assert!(src as usize % 16 == 0);

// SAFETY: the caller must guarantee that `src` is valid for reads,
// 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE2.
// 16-byte aligned, that there are no concurrent non-atomic operations.
//
// Refs:
// - LDP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/LDP
Expand Down Expand Up @@ -231,14 +233,16 @@ unsafe fn _ldp(src: *mut u128, order: Ordering) -> u128 {
}
}

// If the CPU supports FEAT_LSE2, STP is a single-copy atomic write;
// otherwise it is two single-copy atomic writes.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))]
#[inline]
unsafe fn _stp(dst: *mut u128, val: u128, order: Ordering) {
debug_assert!(dst as usize % 16 == 0);

// SAFETY: the caller must guarantee that `dst` is valid for writes,
// 16-byte aligned, that there are no concurrent non-atomic operations,
// and the CPU supports FEAT_LSE2.
// 16-byte aligned, that there are no concurrent non-atomic operations.
//
// Refs:
// - STP: https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/STP
Expand Down Expand Up @@ -388,6 +392,12 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 {
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
// cfg guarantees that the CPU supports FEAT_LSE2.
() => unsafe { _ldp(src, order) },
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
// cfg guarantees that the CPU supports FEAT_LSE.
() => unsafe { _casp(src, 0, 0, order) },
#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract for `atomic_load`.
() => unsafe { _compare_exchange_ldxp_stxp(src, 0, 0, order) },
Expand Down