From 5a4083b27ceb11bcd260daf28a0a436d6f1872b6 Mon Sep 17 00:00:00 2001 From: beetrees Date: Sun, 13 Oct 2024 17:51:19 +0100 Subject: [PATCH 1/2] Add I128 atomic support to the `x64` backend --- cranelift/codegen/meta/src/isa/x86.rs | 61 +- .../codegen/meta/src/shared/instructions.rs | 35 +- cranelift/codegen/src/isa/x64/inst.isle | 107 ++++ cranelift/codegen/src/isa/x64/inst/args.rs | 1 + cranelift/codegen/src/isa/x64/inst/emit.rs | 209 ++++++ .../codegen/src/isa/x64/inst/emit_tests.rs | 84 +++ cranelift/codegen/src/isa/x64/inst/mod.rs | 116 ++++ cranelift/codegen/src/isa/x64/lower.isle | 24 +- cranelift/codegen/src/isa/x64/lower/isle.rs | 19 + cranelift/codegen/src/isa/x64/pcc.rs | 40 ++ .../filetests/isa/x64/atomic-128.clif | 600 ++++++++++++++++++ .../filetests/runtests/atomic-128.clif | 274 ++++++++ cranelift/native/src/lib.rs | 3 + .../src/generators/codegen_settings.rs | 1 + crates/wasmtime/src/config.rs | 1 + crates/wasmtime/src/engine.rs | 1 + 16 files changed, 1540 insertions(+), 36 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/atomic-128.clif create mode 100644 cranelift/filetests/filetests/runtests/atomic-128.clif diff --git a/cranelift/codegen/meta/src/isa/x86.rs b/cranelift/codegen/meta/src/isa/x86.rs index 18f80b067394..053d15c8f357 100644 --- a/cranelift/codegen/meta/src/isa/x86.rs +++ b/cranelift/codegen/meta/src/isa/x86.rs @@ -17,6 +17,12 @@ pub(crate) fn define() -> TargetIsa { "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false, ); + let has_cmpxchg16b = settings.add_bool( + "has_cmpxchg16b", + "Has support for CMPXCHG16b.", + "CMPXCHG16b: CPUID.01H:ECX.CMPXCHG16B[bit 13]", + false, + ); let has_sse41 = settings.add_bool( "has_sse41", "Has support for SSE4.1.", @@ -106,6 +112,7 @@ pub(crate) fn define() -> TargetIsa { false, ); + settings.add_predicate("use_cmpxchg16b", predicate!(has_cmpxchg16b)); settings.add_predicate("use_ssse3", predicate!(has_ssse3)); settings.add_predicate("use_sse41", predicate!(has_sse41)); settings.add_predicate("use_sse42", predicate!(has_sse41 && has_sse42)); @@ -141,14 +148,30 @@ pub(crate) fn define() -> TargetIsa { // Intel CPUs // Netburst - settings.add_preset("nocona", "Nocona microarchitecture.", preset!(sse3)); + settings.add_preset( + "nocona", + "Nocona microarchitecture.", + preset!(sse3 && has_cmpxchg16b), + ); // Intel Core 2 Solo/Duo - settings.add_preset("core2", "Core 2 microarchitecture.", preset!(sse3)); - settings.add_preset("penryn", "Penryn microarchitecture.", preset!(sse41)); + settings.add_preset( + "core2", + "Core 2 microarchitecture.", + preset!(sse3 && has_cmpxchg16b), + ); + settings.add_preset( + "penryn", + "Penryn microarchitecture.", + preset!(sse41 && has_cmpxchg16b), + ); // Intel Atom CPUs - let atom = settings.add_preset("atom", "Atom microarchitecture.", preset!(ssse3)); + let atom = settings.add_preset( + "atom", + "Atom microarchitecture.", + preset!(ssse3 && has_cmpxchg16b), + ); settings.add_preset("bonnell", "Bonnell microarchitecture.", preset!(atom)); let silvermont = settings.add_preset( "silvermont", @@ -186,7 +209,7 @@ pub(crate) fn define() -> TargetIsa { let nehalem = settings.add_preset( "nehalem", "Nehalem microarchitecture.", - preset!(sse42 && has_popcnt), + preset!(sse42 && has_popcnt && has_cmpxchg16b), ); settings.add_preset("corei7", "Core i7 microarchitecture.", preset!(nehalem)); let westmere = settings.add_preset("westmere", "Westmere microarchitecture.", preset!(nehalem)); @@ -229,7 +252,15 @@ pub(crate) fn define() -> TargetIsa { let knights_landing = 
settings.add_preset( "knl", "Knights Landing microarchitecture.", - preset!(has_popcnt && has_avx512f && has_fma && has_bmi1 && has_bmi2 && has_lzcnt), + preset!( + has_popcnt + && has_avx512f + && has_fma + && has_bmi1 + && has_bmi2 + && has_lzcnt + && has_cmpxchg16b + ), ); settings.add_preset( "knm", @@ -312,22 +343,22 @@ pub(crate) fn define() -> TargetIsa { settings.add_preset( "opteron-sse3", "Opteron microarchitecture with support for SSE3 instructions.", - preset!(sse3), + preset!(sse3 && has_cmpxchg16b), ); settings.add_preset( "k8-sse3", "K8 Hammer microarchitecture with support for SSE3 instructions.", - preset!(sse3), + preset!(sse3 && has_cmpxchg16b), ); settings.add_preset( "athlon64-sse3", "Athlon 64 microarchitecture with support for SSE3 instructions.", - preset!(sse3), + preset!(sse3 && has_cmpxchg16b), ); let barcelona = settings.add_preset( "barcelona", "Barcelona microarchitecture.", - preset!(has_popcnt && has_lzcnt), + preset!(has_popcnt && has_lzcnt && has_cmpxchg16b), ); settings.add_preset( "amdfam10", @@ -338,7 +369,7 @@ pub(crate) fn define() -> TargetIsa { let btver1 = settings.add_preset( "btver1", "Bobcat microarchitecture.", - preset!(ssse3 && has_lzcnt && has_popcnt), + preset!(ssse3 && has_lzcnt && has_popcnt && has_cmpxchg16b), ); settings.add_preset( "btver2", @@ -349,7 +380,7 @@ pub(crate) fn define() -> TargetIsa { let bdver1 = settings.add_preset( "bdver1", "Bulldozer microarchitecture", - preset!(has_lzcnt && has_popcnt && ssse3), + preset!(has_lzcnt && has_popcnt && ssse3 && has_cmpxchg16b), ); let bdver2 = settings.add_preset( "bdver2", @@ -366,7 +397,9 @@ pub(crate) fn define() -> TargetIsa { let znver1 = settings.add_preset( "znver1", "Zen (first generation) microarchitecture.", - preset!(sse42 && has_popcnt && has_bmi1 && has_bmi2 && has_lzcnt && has_fma), + preset!( + sse42 && has_popcnt && has_bmi1 && has_bmi2 && has_lzcnt && has_fma && has_cmpxchg16b + ), ); let znver2 = settings.add_preset( "znver2", @@ -397,7 +430,7 @@ pub(crate) fn define() -> TargetIsa { let x86_64_v2 = settings.add_preset( "x86-64-v2", "Generic x86-64 (V2) microarchitecture.", - preset!(sse42 && has_popcnt), + preset!(sse42 && has_popcnt && has_cmpxchg16b), ); let x86_64_v3 = settings.add_preset( "x84_64_v3", diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index d094a2850c05..4a47d5e0339a 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -3637,7 +3637,7 @@ pub(crate) fn define( let AtomicMem = &TypeVar::new( "AtomicMem", "Any type that can be stored in memory, which can be used in an atomic operation", - TypeSetBuilder::new().ints(8..64).build(), + TypeSetBuilder::new().ints(8..128).build(), ); ig.push( @@ -3645,10 +3645,11 @@ pub(crate) fn define( "atomic_rmw", r#" Atomically read-modify-write memory at `p`, with second operand `x`. The old value is - returned. `p` has the type of the target word size, and `x` may be an integer type of - 8, 16, 32 or 64 bits, even on a 32-bit target. The type of the returned value is the - same as the type of `x`. This operation is sequentially consistent and creates - happens-before edges that order normal (non-atomic) loads and stores. + returned. `p` has the type of the target word size, and `x` may be any integer type; note + that some targets require specific target features to be enabled in order to support 128-bit + integer atomics. 
The type of the returned value is the same as the type of `x`. This
+    operation is sequentially consistent and creates happens-before edges that order normal
+    (non-atomic) loads and stores.
         "#,
         &formats.atomic_rmw,
     )
@@ -3673,11 +3674,11 @@ pub(crate) fn define(
         Perform an atomic compare-and-swap operation on memory at `p`, with expected value `e`,
         storing `x` if the value at `p` equals `e`. The old value at `p` is returned, regardless
         of whether the operation succeeds or fails. `p` has the type of the target
-        word size, and `x` and `e` must have the same type and the same size, which may be an
-        integer type of 8, 16, 32 or 64 bits, even on a 32-bit target. The type of the returned
-        value is the same as the type of `x` and `e`. This operation is sequentially
-        consistent and creates happens-before edges that order normal (non-atomic) loads and
-        stores.
+        word size, and `x` and `e` must have the same type and the same size, which may be any
+        integer type; note that some targets require specific target features to be enabled in order
+        to support 128-bit integer atomics. The type of the returned value is the same as the type
+        of `x` and `e`. This operation is sequentially consistent and creates happens-before edges
+        that order normal (non-atomic) loads and stores.
         "#,
         &formats.atomic_cas,
     )
@@ -3702,9 +3703,10 @@ pub(crate) fn define(
         Atomically load from memory at `p`.

         This is a polymorphic instruction that can load any value type which has a memory
-        representation. It should only be used for integer types with 8, 16, 32 or 64 bits.
-        This operation is sequentially consistent and creates happens-before edges that order
-        normal (non-atomic) loads and stores.
+        representation. It can only be used for integer types; note that some targets require
+        specific target features to be enabled in order to support 128-bit integer atomics. This
+        operation is sequentially consistent and creates happens-before edges that order normal
+        (non-atomic) loads and stores.
         "#,
         &formats.load_no_offset,
     )
@@ -3726,9 +3728,10 @@ pub(crate) fn define(
         Atomically store `x` to memory at `p`.

         This is a polymorphic instruction that can store any value type with a memory
-        representation. It should only be used for integer types with 8, 16, 32 or 64 bits.
-        This operation is sequentially consistent and creates happens-before edges that order
-        normal (non-atomic) loads and stores.
+        representation. It can only be used for integer types; note that some targets require
+        specific target features to be enabled in order to support 128-bit integer atomics. This
+        operation is sequentially consistent and creates happens-before edges that order normal
+        (non-atomic) loads and stores.
         "#,
         &formats.store_no_offset,
     )
diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 43a7ef133a55..381f44be3645 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -664,6 +664,24 @@
       (mem SyntheticAmode)
       (dst_old WritableReg))

+    ;; A standard (native) `lock cmpxchg16b (amode)`, with register
+    ;; conventions:
+    ;;
+    ;; `mem` (read) address
+    ;; %rbx (low), %rcx (high) (read) replacement value
+    ;; %rax (low), %rdx (high) (modified) in: expected value, out: value that was actually at `dst`
+    ;; %rflags is written. Do not assume anything about it after the instruction.
+    ;;
+    ;; The instruction "succeeded" iff the bits of %rax and %rdx
+    ;; afterwards are the same as they were before.
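+    ;;
+    ;; Note that `cmpxchg16b` requires its memory operand to be 16-byte
+    ;; aligned; the CPU raises a general-protection fault otherwise.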
+    (LockCmpxchg16b (replacement_low Reg)
+                    (replacement_high Reg)
+                    (expected_low Reg)
+                    (expected_high Reg)
+                    (mem BoxSyntheticAmode)
+                    (dst_old_low WritableReg)
+                    (dst_old_high WritableReg))
+
     ;; A synthetic instruction, based on a loop around a native `lock
     ;; cmpxchg` instruction.
     ;;
@@ -696,6 +714,46 @@
       (temp WritableReg)
       (dst_old WritableReg))

+    ;; A synthetic instruction, based on a loop around a native `lock
+    ;; cmpxchg16b` instruction.
+    ;;
+    ;; This is the same as `AtomicRmwSeq`, but for 128-bit integers.
+    ;;
+    ;; For `MachAtomicRmwOp::Xchg`, use `Atomic128XchgSeq` instead.
+    ;;
+    ;; This instruction sequence has fixed register uses as follows:
+    ;; - %rax (low), %rdx (high) (written) the old value at `mem`
+    ;; - %rbx (low), %rcx (high) (written) used as temp registers to hold
+    ;;   the replacement value
+    ;; - %rflags is written. Do not assume anything about it after the
+    ;;   instruction.
+    (Atomic128RmwSeq (op MachAtomicRmwOp)
+                     (mem BoxSyntheticAmode)
+                     (operand_low Reg)
+                     (operand_high Reg)
+                     (temp_low WritableReg)
+                     (temp_high WritableReg)
+                     (dst_old_low WritableReg)
+                     (dst_old_high WritableReg))
+
+    ;; A synthetic instruction, based on a loop around a native `lock
+    ;; cmpxchg16b` instruction.
+    ;;
+    ;; This is `Atomic128RmwSeq` but only for `MachAtomicRmwOp::Xchg`. As
+    ;; the replacement value is the same every time, this instruction doesn't
+    ;; require any temporary registers.
+    ;;
+    ;; This instruction sequence has fixed register uses as follows:
+    ;; - %rax (low), %rdx (high) (written) the old value at `mem`
+    ;; - %rbx (low), %rcx (high) (read) the replacement value
+    ;; - %rflags is written. Do not assume anything about it after the
+    ;;   instruction.
+    (Atomic128XchgSeq (mem SyntheticAmode)
+                      (operand_low Reg)
+                      (operand_high Reg)
+                      (dst_old_low WritableReg)
+                      (dst_old_high WritableReg))
+
     ;; A memory fence (mfence, lfence or sfence).
     (Fence (kind FenceKind))

@@ -762,6 +820,11 @@
(type BoxCallIndInfo extern (enum))
(type BoxReturnCallInfo extern (enum))
(type BoxReturnCallIndInfo extern (enum))
+(type BoxSyntheticAmode extern (enum))
+
+(decl pure box_synthetic_amode (SyntheticAmode) BoxSyntheticAmode)
+(extern constructor box_synthetic_amode box_synthetic_amode)
+(convert SyntheticAmode BoxSyntheticAmode box_synthetic_amode)

;; Get the `OperandSize` for a given `Type`, rounding smaller types up to 32 bits.
(decl operand_size_of_type_32_64 (Type) OperandSize)
@@ -1862,6 +1925,9 @@
(decl pure use_avx2 () bool)
(extern constructor use_avx2 use_avx2)

+(decl pure use_cmpxchg16b () bool)
+(extern constructor use_cmpxchg16b use_cmpxchg16b)
+
;;;; Helpers for Merging and Sinking Immediates/Loads ;;;;;;;;;;;;;;;;;;;;;;;;;

;; Extract a constant `Imm8Reg.Imm8` from a value operand.
@@ -5214,6 +5280,17 @@ (_ Unit (emit (MInst.LockCmpxchg ty replacement expected addr dst)))) dst)) +(decl x64_cmpxchg16b (ValueRegs ValueRegs SyntheticAmode) ValueRegs) +(rule (x64_cmpxchg16b expected replacement addr) + (let ((expected_low Gpr (value_regs_get_gpr expected 0)) + (expected_high Gpr (value_regs_get_gpr expected 1)) + (replacement_low Gpr (value_regs_get_gpr replacement 0)) + (replacement_high Gpr (value_regs_get_gpr replacement 1)) + (dst_low WritableGpr (temp_writable_gpr)) + (dst_high WritableGpr (temp_writable_gpr)) + (_ Unit (emit (MInst.LockCmpxchg16b replacement_low replacement_high expected_low expected_high addr dst_low dst_high)))) + (value_regs dst_low dst_high))) + (decl x64_atomic_rmw_seq (Type MachAtomicRmwOp SyntheticAmode Gpr) Gpr) (rule (x64_atomic_rmw_seq ty op mem input) (let ((dst WritableGpr (temp_writable_gpr)) @@ -5221,6 +5298,36 @@ (_ Unit (emit (MInst.AtomicRmwSeq ty op mem input tmp dst)))) dst)) +(decl x64_atomic_128_rmw_seq (MachAtomicRmwOp SyntheticAmode ValueRegs) ValueRegs) +(rule (x64_atomic_128_rmw_seq op mem input) + (let ((dst_low WritableGpr (temp_writable_gpr)) + (dst_high WritableGpr (temp_writable_gpr)) + (tmp_low WritableGpr (temp_writable_gpr)) + (tmp_high WritableGpr (temp_writable_gpr)) + (input_low Gpr (value_regs_get_gpr input 0)) + (input_high Gpr (value_regs_get_gpr input 1)) + (_ Unit (emit (MInst.Atomic128RmwSeq op mem input_low input_high tmp_low tmp_high dst_low dst_high)))) + (value_regs dst_low dst_high))) + +(rule 1 (x64_atomic_128_rmw_seq (mach_atomic_rmw_op_xchg) mem input) + (let ((dst_low WritableGpr (temp_writable_gpr)) + (dst_high WritableGpr (temp_writable_gpr)) + (input_low Gpr (value_regs_get_gpr input 0)) + (input_high Gpr (value_regs_get_gpr input 1)) + (_ Unit (emit (MInst.Atomic128XchgSeq mem input_low input_high dst_low dst_high)))) + (value_regs dst_low dst_high))) + +(decl x64_atomic_128_store_seq (SyntheticAmode ValueRegs) SideEffectNoResult) +(rule (x64_atomic_128_store_seq mem input) + (let ((dst_low WritableGpr (temp_writable_gpr)) + (dst_high WritableGpr (temp_writable_gpr)) + (input_low Gpr (value_regs_get_gpr input 0)) + (input_high Gpr (value_regs_get_gpr input 1))) + (SideEffectNoResult.Inst (MInst.Atomic128XchgSeq mem input_low input_high dst_low dst_high)))) + +(decl mach_atomic_rmw_op_xchg () MachAtomicRmwOp) +(extern extractor mach_atomic_rmw_op_xchg mach_atomic_rmw_op_is_xchg) + ;; CLIF IR has one enumeration for atomic operations (`AtomicRmwOp`) while the ;; mach backend has another (`MachAtomicRmwOp`)--this converts one to the other. (type MachAtomicRmwOp extern (enum)) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index a6923e400ff8..7cb22c624909 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -959,6 +959,7 @@ pub enum CmpOpcode { pub(crate) enum InstructionSet { SSE, SSE2, + CMPXCHG16b, SSSE3, SSE41, SSE42, diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 2482300ed609..3b1031c8adbd 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -115,6 +115,7 @@ pub(crate) fn emit( match iset_requirement { // Cranelift assumes SSE2 at least. 
InstructionSet::SSE | InstructionSet::SSE2 => true, + InstructionSet::CMPXCHG16b => info.isa_flags.use_cmpxchg16b(), InstructionSet::SSSE3 => info.isa_flags.use_ssse3(), InstructionSet::SSE41 => info.isa_flags.use_sse41(), InstructionSet::SSE42 => info.isa_flags.use_sse42(), @@ -4037,6 +4038,38 @@ pub(crate) fn emit( emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0); } + Inst::LockCmpxchg16b { + replacement_low, + replacement_high, + expected_low, + expected_high, + mem, + dst_old_low, + dst_old_high, + } => { + let mem = mem.clone(); + debug_assert_eq!(*replacement_low, regs::rbx()); + debug_assert_eq!(*replacement_high, regs::rcx()); + debug_assert_eq!(*expected_low, regs::rax()); + debug_assert_eq!(*expected_high, regs::rdx()); + debug_assert_eq!(dst_old_low.to_reg(), regs::rax()); + debug_assert_eq!(dst_old_high.to_reg(), regs::rdx()); + + let amode = mem.finalize(state, sink); + // lock cmpxchg16b (mem) + // Note that 0xF0 is the Lock prefix. + emit_std_enc_mem( + sink, + LegacyPrefixes::_F0, + 0x0FC7, + 2, + 1, + &amode, + RexFlags::set_w(), + 0, + ); + } + Inst::AtomicRmwSeq { ty, op, @@ -4157,6 +4190,182 @@ pub(crate) fn emit( one_way_jmp(sink, CC::NZ, again_label); } + Inst::Atomic128RmwSeq { + op, + mem, + operand_low, + operand_high, + temp_low, + temp_high, + dst_old_low, + dst_old_high, + } => { + let operand_low = *operand_low; + let operand_high = *operand_high; + let temp_low = *temp_low; + let temp_high = *temp_high; + let dst_old_low = *dst_old_low; + let dst_old_high = *dst_old_high; + debug_assert_eq!(temp_low.to_reg(), regs::rbx()); + debug_assert_eq!(temp_high.to_reg(), regs::rcx()); + debug_assert_eq!(dst_old_low.to_reg(), regs::rax()); + debug_assert_eq!(dst_old_high.to_reg(), regs::rdx()); + let mem = mem.finalize(state, sink).clone(); + + let again_label = sink.get_label(); + + // Load the initial value. + Inst::load(types::I64, mem.clone(), dst_old_low, ExtKind::ZeroExtend) + .emit(sink, info, state); + Inst::load(types::I64, mem.offset(8), dst_old_high, ExtKind::ZeroExtend) + .emit(sink, info, state); + + // again: + sink.bind_label(again_label, state.ctrl_plane_mut()); + + // Move old value to temp registers. + Inst::mov_r_r(OperandSize::Size64, dst_old_low.to_reg(), temp_low) + .emit(sink, info, state); + Inst::mov_r_r(OperandSize::Size64, dst_old_high.to_reg(), temp_high) + .emit(sink, info, state); + + // Perform the operation. + let operand_low_rmi = RegMemImm::reg(operand_low); + let operand_high_rmi = RegMemImm::reg(operand_high); + use inst_common::MachAtomicRmwOp as RmwOp; + match op { + RmwOp::Xchg => panic!("use `Atomic128XchgSeq` instead"), + RmwOp::Nand => { + // temp &= operand + Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::And, + operand_low_rmi, + temp_low, + ) + .emit(sink, info, state); + Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::And, + operand_high_rmi, + temp_high, + ) + .emit(sink, info, state); + + // temp = !temp + Inst::not(OperandSize::Size64, temp_low).emit(sink, info, state); + Inst::not(OperandSize::Size64, temp_high).emit(sink, info, state); + } + RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => { + // Do a comparison with LHS temp and RHS operand. + // `cmp_rmi_r` and `alu_rmi_r` have opposite argument orders. 
+ Inst::cmp_rmi_r(OperandSize::Size64, temp_low.to_reg(), operand_low_rmi) + .emit(sink, info, state); + // Thie will clobber `temp_high` + Inst::alu_rmi_r( + OperandSize::Size64, + AluRmiROpcode::Sbb, + operand_high_rmi, + temp_high, + ) + .emit(sink, info, state); + // Restore the clobbered value + Inst::mov_r_r(OperandSize::Size64, dst_old_high.to_reg(), temp_high) + .emit(sink, info, state); + let cc = match op { + RmwOp::Umin => CC::NB, + RmwOp::Umax => CC::B, + RmwOp::Smin => CC::NL, + RmwOp::Smax => CC::L, + _ => unreachable!(), + }; + Inst::cmove(OperandSize::Size64, cc, operand_low.into(), temp_low) + .emit(sink, info, state); + Inst::cmove(OperandSize::Size64, cc, operand_high.into(), temp_high) + .emit(sink, info, state); + } + _ => { + // temp op= operand + let (op_low, op_high) = match op { + RmwOp::Add => (AluRmiROpcode::Add, AluRmiROpcode::Adc), + RmwOp::Sub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb), + RmwOp::And => (AluRmiROpcode::And, AluRmiROpcode::And), + RmwOp::Or => (AluRmiROpcode::Or, AluRmiROpcode::Or), + RmwOp::Xor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor), + RmwOp::Xchg + | RmwOp::Nand + | RmwOp::Umin + | RmwOp::Umax + | RmwOp::Smin + | RmwOp::Smax => unreachable!(), + }; + Inst::alu_rmi_r(OperandSize::Size64, op_low, operand_low_rmi, temp_low) + .emit(sink, info, state); + Inst::alu_rmi_r(OperandSize::Size64, op_high, operand_high_rmi, temp_high) + .emit(sink, info, state); + } + } + + // cmpxchg16b (mem) + Inst::LockCmpxchg16b { + replacement_low: temp_low.to_reg(), + replacement_high: temp_high.to_reg(), + expected_low: dst_old_low.to_reg(), + expected_high: dst_old_high.to_reg(), + mem: Box::new(mem.into()), + dst_old_low, + dst_old_high, + } + .emit(sink, info, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + + Inst::Atomic128XchgSeq { + mem, + operand_low, + operand_high, + dst_old_low, + dst_old_high, + } => { + let operand_low = *operand_low; + let operand_high = *operand_high; + let dst_old_low = *dst_old_low; + let dst_old_high = *dst_old_high; + debug_assert_eq!(operand_low, regs::rbx()); + debug_assert_eq!(operand_high, regs::rcx()); + debug_assert_eq!(dst_old_low.to_reg(), regs::rax()); + debug_assert_eq!(dst_old_high.to_reg(), regs::rdx()); + let mem = mem.finalize(state, sink).clone(); + + let again_label = sink.get_label(); + + // Load the initial value. 
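+                // The pair of 64-bit loads is not atomic as a whole: another
+                // thread may write between them. That is harmless, as a torn
+                // read only causes the `cmpxchg16b` below to fail and reload
+                // %rdx:%rax with the actual value before the loop retries.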
+ Inst::load(types::I64, mem.clone(), dst_old_low, ExtKind::ZeroExtend) + .emit(sink, info, state); + Inst::load(types::I64, mem.offset(8), dst_old_high, ExtKind::ZeroExtend) + .emit(sink, info, state); + + // again: + sink.bind_label(again_label, state.ctrl_plane_mut()); + + // cmpxchg16b (mem) + Inst::LockCmpxchg16b { + replacement_low: operand_low, + replacement_high: operand_high, + expected_low: dst_old_low.to_reg(), + expected_high: dst_old_high.to_reg(), + mem: Box::new(mem.into()), + dst_old_low, + dst_old_high, + } + .emit(sink, info, state); + + // jnz again + one_way_jmp(sink, CC::NZ, again_label); + } + Inst::Fence { kind } => { sink.put1(0x0F); sink.put1(0xAE); diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 287cc3bf4cd4..1b52d074923a 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -4984,6 +4984,20 @@ fn test_x64_emit() { "lock cmpxchgq %r10, -12345(%rcx,%rsi,8), expected=%rax, dst_old=%rax", )); + insns.push(( + Inst::LockCmpxchg16b { + mem: Box::new(am2.clone()), + replacement_low: rbx, + replacement_high: rcx, + expected_low: rax, + expected_high: rdx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "F0480FC78CF1C7CFFFFF", + "lock cmpxchg16b -12345(%rcx,%rsi,8), replacement=%rcx:%rbx, expected=%rdx:%rax, dst_old=%rdx:%rax", + )); + // AtomicRmwSeq insns.push(( Inst::AtomicRmwSeq { @@ -5046,6 +5060,75 @@ fn test_x64_emit() { "atomically { 64_bits_at_[%r9]) Add= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }" )); + // Atomic128RmwSeq + insns.push(( + Inst::Atomic128RmwSeq { + op: inst_common::MachAtomicRmwOp::Or, + mem: Box::new(am3.clone()), + operand_low: r10, + operand_high: r11, + temp_low: w_rbx, + temp_high: w_rcx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "498B01498B51084889C34889D14C09D34C09D9F0490FC7090F85E9FFFFFF", + "atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Or %r11:%r10; 0(%r9) = %rcx:%rbx }", + )); + insns.push(( + Inst::Atomic128RmwSeq { + op: inst_common::MachAtomicRmwOp::And, + mem: Box::new(am3.clone()), + operand_low: r10, + operand_high: r11, + temp_low: w_rbx, + temp_high: w_rcx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "498B01498B51084889C34889D14C21D34C21D9F0490FC7090F85E9FFFFFF", + "atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax And %r11:%r10; 0(%r9) = %rcx:%rbx }" + )); + insns.push(( + Inst::Atomic128RmwSeq { + op: inst_common::MachAtomicRmwOp::Umin, + mem: Box::new(am3.clone()), + operand_low: r10, + operand_high: r11, + temp_low: w_rbx, + temp_high: w_rcx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "498B01498B51084889C34889D14C39D34C19D94889D1490F43DA490F43CBF0490FC7090F85DEFFFFFF", + "atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Umin %r11:%r10; 0(%r9) = %rcx:%rbx }" + )); + insns.push(( + Inst::Atomic128RmwSeq { + op: inst_common::MachAtomicRmwOp::Add, + mem: Box::new(am3.clone()), + operand_low: r10, + operand_high: r11, + temp_low: w_rbx, + temp_high: w_rcx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "498B01498B51084889C34889D14C01D34C11D9F0490FC7090F85E9FFFFFF", + "atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Add %r11:%r10; 0(%r9) = %rcx:%rbx }" + )); + insns.push(( + Inst::Atomic128XchgSeq { + mem: am3.clone(), + operand_low: rbx, + operand_high: rcx, + dst_old_low: w_rax, + dst_old_high: w_rdx, + }, + "498B01498B5108F0490FC7090F85F5FFFFFF", + "atomically { %rdx:%rax = 0(%r9); 0(%r9) = %rcx:%rbx }", + )); + // 
Fence insns.push(( Inst::Fence { @@ -5115,6 +5198,7 @@ fn test_x64_emit() { use crate::settings::Configurable; let mut isa_flag_builder = x64::settings::builder(); + isa_flag_builder.enable("has_cmpxchg16b").unwrap(); isa_flag_builder.enable("has_ssse3").unwrap(); isa_flag_builder.enable("has_sse41").unwrap(); isa_flag_builder.enable("has_fma").unwrap(); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index dc3f15a057e1..b522f44fd2e1 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -139,6 +139,10 @@ impl Inst { | Inst::DummyUse { .. } | Inst::AluConstOp { .. } => smallvec![], + Inst::LockCmpxchg16b { .. } + | Inst::Atomic128RmwSeq { .. } + | Inst::Atomic128XchgSeq { .. } => smallvec![InstructionSet::CMPXCHG16b], + Inst::AluRmRVex { op, .. } => op.available_from(), Inst::UnaryRmR { op, .. } => op.available_from(), Inst::UnaryRmRVex { op, .. } => op.available_from(), @@ -1815,6 +1819,28 @@ impl PrettyPrint for Inst { ) } + Inst::LockCmpxchg16b { + replacement_low, + replacement_high, + expected_low, + expected_high, + mem, + dst_old_low, + dst_old_high, + .. + } => { + let replacement_low = pretty_print_reg(*replacement_low, 8); + let replacement_high = pretty_print_reg(*replacement_high, 8); + let expected_low = pretty_print_reg(*expected_low, 8); + let expected_high = pretty_print_reg(*expected_high, 8); + let dst_old_low = pretty_print_reg(dst_old_low.to_reg(), 8); + let dst_old_high = pretty_print_reg(dst_old_high.to_reg(), 8); + let mem = mem.pretty_print(16); + format!( + "lock cmpxchg16b {mem}, replacement={replacement_high}:{replacement_low}, expected={expected_high}:{expected_low}, dst_old={dst_old_high}:{dst_old_low}" + ) + } + Inst::AtomicRmwSeq { ty, op, .. } => { let ty = ty.bits(); format!( @@ -1822,6 +1848,41 @@ impl PrettyPrint for Inst { ) } + Inst::Atomic128RmwSeq { + op, + mem, + operand_low, + operand_high, + temp_low, + temp_high, + dst_old_low, + dst_old_high, + } => { + let operand_low = pretty_print_reg(*operand_low, 8); + let operand_high = pretty_print_reg(*operand_high, 8); + let temp_low = pretty_print_reg(temp_low.to_reg(), 8); + let temp_high = pretty_print_reg(temp_high.to_reg(), 8); + let dst_old_low = pretty_print_reg(dst_old_low.to_reg(), 8); + let dst_old_high = pretty_print_reg(dst_old_high.to_reg(), 8); + let mem = mem.pretty_print(16); + format!("atomically {{ {dst_old_high}:{dst_old_low} = {mem}; {temp_high}:{temp_low} = {dst_old_high}:{dst_old_low} {op:?} {operand_high}:{operand_low}; {mem} = {temp_high}:{temp_low} }}") + } + + Inst::Atomic128XchgSeq { + mem, + operand_low, + operand_high, + dst_old_low, + dst_old_high, + } => { + let operand_low = pretty_print_reg(*operand_low, 8); + let operand_high = pretty_print_reg(*operand_high, 8); + let dst_old_low = pretty_print_reg(dst_old_low.to_reg(), 8); + let dst_old_high = pretty_print_reg(dst_old_high.to_reg(), 8); + let mem = mem.pretty_print(16); + format!("atomically {{ {dst_old_high}:{dst_old_low} = {mem}; {mem} = {operand_high}:{operand_low} }}") + } + Inst::Fence { kind } => match kind { FenceKind::MFence => "mfence".to_string(), FenceKind::LFence => "lfence".to_string(), @@ -2462,6 +2523,25 @@ fn x64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { mem.get_operands(collector); } + Inst::LockCmpxchg16b { + replacement_low, + replacement_high, + expected_low, + expected_high, + mem, + dst_old_low, + dst_old_high, + .. 
+ } => { + collector.reg_fixed_use(replacement_low, regs::rbx()); + collector.reg_fixed_use(replacement_high, regs::rcx()); + collector.reg_fixed_use(expected_low, regs::rax()); + collector.reg_fixed_use(expected_high, regs::rdx()); + collector.reg_fixed_def(dst_old_low, regs::rax()); + collector.reg_fixed_def(dst_old_high, regs::rdx()); + mem.get_operands(collector); + } + Inst::AtomicRmwSeq { operand, temp, @@ -2477,6 +2557,42 @@ fn x64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { mem.get_operands_late(collector) } + Inst::Atomic128RmwSeq { + operand_low, + operand_high, + temp_low, + temp_high, + dst_old_low, + dst_old_high, + mem, + .. + } => { + // All registers are collected in the `Late` position so that they don't overlap. + collector.reg_late_use(operand_low); + collector.reg_late_use(operand_high); + collector.reg_fixed_def(temp_low, regs::rbx()); + collector.reg_fixed_def(temp_high, regs::rcx()); + collector.reg_fixed_def(dst_old_low, regs::rax()); + collector.reg_fixed_def(dst_old_high, regs::rdx()); + mem.get_operands_late(collector) + } + + Inst::Atomic128XchgSeq { + operand_low, + operand_high, + dst_old_low, + dst_old_high, + mem, + .. + } => { + // All registers are collected in the `Late` position so that they don't overlap. + collector.reg_fixed_late_use(operand_low, regs::rbx()); + collector.reg_fixed_late_use(operand_high, regs::rcx()); + collector.reg_fixed_def(dst_old_low, regs::rax()); + collector.reg_fixed_def(dst_old_high, regs::rdx()); + mem.get_operands_late(collector) + } + Inst::Args { args } => { for ArgPair { vreg, preg } in args { collector.reg_fixed_def(vreg, *preg); diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index a6f0af46ab97..ddab6f94aa61 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -3348,31 +3348,40 @@ ;; sequencing to satisfy the CLIF synchronisation requirements for `AtomicLoad` ;; without the need for any fence instructions. ;; -;; As described in the `atomic_load` documentation, this lowering is only valid -;; for I8, I16, I32, and I64. The sub-64-bit types are zero extended, as with a -;; normal load. +;; This lowering is only valid for I8, I16, I32, and I64. The sub-64-bit types +;; are zero extended, as with a normal load. (rule 1 (lower (has_type $I64 (atomic_load flags address))) (x64_mov (to_amode flags address (zero_offset)))) (rule (lower (has_type (and (fits_in_32 ty) (ty_int _)) (atomic_load flags address))) (x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address (zero_offset)))) +;; Lower 128-bit `atomic_load` using `cmpxchg16b`. +(rule 1 (lower (has_type $I128 (atomic_load flags address))) + (if-let $true (use_cmpxchg16b)) + (x64_cmpxchg16b (value_regs (imm $I64 0) (imm $I64 0)) (value_regs (imm $I64 0) (imm $I64 0)) (to_amode flags address (zero_offset)))) ;; Rules for `atomic_store` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; This is a normal store followed by an `mfence` instruction. As described in -;; the `atomic_load` documentation, this lowering is only valid for I8, I16, -;; I32, and I64. +;; This is a normal store followed by an `mfence` instruction. This lowering is +;; only valid for I8, I16, I32, and I64. (rule (lower (atomic_store flags value @ (value_type (and (fits_in_64 ty) (ty_int _))) address)) (side_effect (side_effect_concat (x64_movrm ty (to_amode flags address (zero_offset)) value) (x64_mfence)))) +;; Lower 128-bit `atomic_store` using `cmpxchg16b`. 
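+;; There is no native 16-byte atomic store, so the value is written with an
+;; exchange loop (`Atomic128XchgSeq`) whose old-value result is discarded.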
+(rule 1 (lower (atomic_store flags value @ (value_type $I128) address))
+  (if-let $true (use_cmpxchg16b))
+  (side_effect (x64_atomic_128_store_seq (to_amode flags address (zero_offset)) value)))

;; Rules for `atomic_cas` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) (atomic_cas flags address expected replacement)))
       (x64_cmpxchg ty expected replacement (to_amode flags address (zero_offset))))

+(rule 1 (lower (has_type $I128 (atomic_cas flags address expected replacement)))
+  (if-let $true (use_cmpxchg16b))
+  (x64_cmpxchg16b expected replacement (to_amode flags address (zero_offset))))

;; Rules for `atomic_rmw` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3389,6 +3398,9 @@
 (rule (lower (has_type (and (fits_in_64 ty) (ty_int _)) (atomic_rmw flags op address input)))
       (x64_atomic_rmw_seq ty op (to_amode flags address (zero_offset)) input))
+(rule 1 (lower (has_type $I128 (atomic_rmw flags op address input)))
+  (if-let $true (use_cmpxchg16b))
+  (x64_atomic_128_rmw_seq op (to_amode flags address (zero_offset)) input))

;; Rules for `call` and `call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

diff --git a/cranelift/codegen/src/isa/x64/lower/isle.rs b/cranelift/codegen/src/isa/x64/lower/isle.rs
index 6b6473c0704a..ed61a4d92654 100644
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -39,6 +39,7 @@
 type BoxCallIndInfo = Box<CallInfo<RegMem>>;
 type BoxReturnCallInfo = Box<ReturnCallInfo<ExternalName>>;
 type BoxReturnCallIndInfo = Box<ReturnCallInfo<Reg>>;
 type VecArgPair = Vec<ArgPair>;
+type BoxSyntheticAmode = Box<SyntheticAmode>;

 pub struct SinkableLoad {
     inst: Inst,
@@ -240,6 +241,11 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         self.backend.x64_flags.use_sse42()
     }

+    #[inline]
+    fn use_cmpxchg16b(&mut self) -> bool {
+        self.backend.x64_flags.use_cmpxchg16b()
+    }
+
     #[inline]
     fn imm8_from_value(&mut self, val: Value) -> Option<Imm8Reg> {
         let inst = self.lower_ctx.dfg().value_def(val).inst()?;
@@ -614,6 +620,15 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         MachAtomicRmwOp::from(*op)
     }

+    #[inline]
+    fn mach_atomic_rmw_op_is_xchg(&mut self, op: &MachAtomicRmwOp) -> Option<()> {
+        if *op == MachAtomicRmwOp::Xchg {
+            Some(())
+        } else {
+            None
+        }
+    }
+
     #[inline]
     fn preg_rbp(&mut self) -> PReg {
         regs::rbp().to_real_reg().unwrap().into()
@@ -939,6 +954,10 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
         let reg = Gpr::new(self.invalid_reg()).unwrap();
         WritableGpr::from_reg(reg)
     }
+
+    fn box_synthetic_amode(&mut self, amode: &SyntheticAmode) -> BoxSyntheticAmode {
+        Box::new(amode.clone())
+    }
 }

 impl IsleContext<'_, '_, MInst, X64Backend> {
diff --git a/cranelift/codegen/src/isa/x64/pcc.rs b/cranelift/codegen/src/isa/x64/pcc.rs
index 326708bf0269..288272cac715 100644
--- a/cranelift/codegen/src/isa/x64/pcc.rs
+++ b/cranelift/codegen/src/isa/x64/pcc.rs
@@ -886,6 +886,18 @@ pub(crate) fn check(
             Ok(())
         }

+        Inst::LockCmpxchg16b {
+            ref mem,
+            dst_old_low,
+            dst_old_high,
+            ..
+        } => {
+            ensure_no_fact(vcode, dst_old_low.to_reg())?;
+            ensure_no_fact(vcode, dst_old_high.to_reg())?;
+            check_store(ctx, None, mem, vcode, I128)?;
+            Ok(())
+        }
+
         Inst::AtomicRmwSeq {
             ref mem,
             temp,
@@ -898,6 +910,34 @@ pub(crate) fn check(
             Ok(())
         }

+        Inst::Atomic128RmwSeq {
+            ref mem,
+            temp_low,
+            temp_high,
+            dst_old_low,
+            dst_old_high,
+            ..
+ } => { + ensure_no_fact(vcode, dst_old_low.to_reg())?; + ensure_no_fact(vcode, dst_old_high.to_reg())?; + ensure_no_fact(vcode, temp_low.to_reg())?; + ensure_no_fact(vcode, temp_high.to_reg())?; + check_store(ctx, None, mem, vcode, I128)?; + Ok(()) + } + + Inst::Atomic128XchgSeq { + ref mem, + dst_old_low, + dst_old_high, + .. + } => { + ensure_no_fact(vcode, dst_old_low.to_reg())?; + ensure_no_fact(vcode, dst_old_high.to_reg())?; + check_store(ctx, None, mem, vcode, I128)?; + Ok(()) + } + Inst::Fence { .. } => Ok(()), Inst::XmmUninitializedValue { dst } => { diff --git a/cranelift/filetests/filetests/isa/x64/atomic-128.clif b/cranelift/filetests/filetests/isa/x64/atomic-128.clif new file mode 100644 index 000000000000..791578b6753e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/atomic-128.clif @@ -0,0 +1,600 @@ +test compile precise-output +set enable_llvm_abi_extensions +target x86_64 has_cmpxchg16b + +function %load(i64) -> i128 { +block0(v0: i64): + v1 = atomic_load.i128 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; xorq %rax, %rax, %rax +; xorq %rdx, %rdx, %rdx +; xorq %rbx, %rbx, %rbx +; xorq %rcx, %rcx, %rcx +; lock cmpxchg16b 0(%rdi), replacement=%rcx:%rbx, expected=%rdx:%rax, dst_old=%rdx:%rax +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; xorq %rax, %rax +; xorq %rdx, %rdx +; xorq %rbx, %rbx +; xorq %rcx, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %store(i128, i64) { +block0(v0: i128, v1: i64): + atomic_store.i128 v0, v1 + return +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rsi, %rcx +; movq %rdi, %rbx +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%r11); 0(%r11) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rsi, %rcx +; movq %rdi, %rbx +; movq %rdx, %r11 +; movq (%r11), %rax ; trap: heap_oob +; movq 8(%r11), %rdx ; trap: heap_oob +; lock cmpxchg16b (%r11) ; trap: heap_oob +; jne 0x1c +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %cas(i64, i128, i128) -> i128 { +block0(v0: i64, v1: i128, v2: i128): + v3 = atomic_cas.i128 v0, v1, v2 + return v3 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rcx, %rbx +; movq %r8, %rcx +; movq %rsi, %rax +; lock cmpxchg16b 0(%rdi), replacement=%rcx:%rbx, expected=%rdx:%rax, dst_old=%rdx:%rax +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rcx, %rbx +; movq %r8, %rcx +; movq %rsi, %rax +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %add(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 add v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: 
+; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Add %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; addq %rsi, %rbx +; adcq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sub(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 sub v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Sub %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; subq %rsi, %rbx +; sbbq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %and(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 and v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax And %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; andq %rsi, %rbx +; andq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %nand(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 nand v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Nand %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; andq %rsi, %rbx +; andq %r11, %rcx +; notq %rbx +; notq %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %or(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 or v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); 
%rcx:%rbx = %rdx:%rax Or %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; orq %rsi, %rbx +; orq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %xor(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 xor v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Xor %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; xorq %rsi, %rbx +; xorq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %xchg(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 xchg v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %rcx +; movq %rsi, %rbx +; atomically { %rdx:%rax = 0(%rdi); 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %rcx +; movq %rsi, %rbx +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x19 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %umin(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 umin v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Umin %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; movq %rbx, (%rsp) +; block1: ; offset 0xc +; movq %rdx, %r11 +; movq (%rdi), %rax ; trap: heap_oob +; movq 8(%rdi), %rdx ; trap: heap_oob +; movq %rax, %rbx +; movq %rdx, %rcx +; cmpq %rsi, %rbx +; sbbq %r11, %rcx +; movq %rdx, %rcx +; cmovaeq %rsi, %rbx +; cmovaeq %r11, %rcx +; lock cmpxchg16b (%rdi) ; trap: heap_oob +; jne 0x16 +; movq (%rsp), %rbx +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %umax(i64, i128) -> i128 { +block0(v0: i64, v1: i128): + v2 = atomic_rmw.i128 umax v0, v1 + return v2 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; movq %rbx, 0(%rsp) +; block0: +; movq %rdx, %r11 +; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Umax %r11:%rsi; 0(%rdi) = %rcx:%rbx } +; movq 0(%rsp), %rbx +; addq 
%rsp, $16, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; subq $0x10, %rsp
+; movq %rbx, (%rsp)
+; block1: ; offset 0xc
+; movq %rdx, %r11
+; movq (%rdi), %rax ; trap: heap_oob
+; movq 8(%rdi), %rdx ; trap: heap_oob
+; movq %rax, %rbx
+; movq %rdx, %rcx
+; cmpq %rsi, %rbx
+; sbbq %r11, %rcx
+; movq %rdx, %rcx
+; cmovbq %rsi, %rbx
+; cmovbq %r11, %rcx
+; lock cmpxchg16b (%rdi) ; trap: heap_oob
+; jne 0x16
+; movq (%rsp), %rbx
+; addq $0x10, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %smin(i64, i128) -> i128 {
+block0(v0: i64, v1: i128):
+    v2 = atomic_rmw.i128 smin v0, v1
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; subq %rsp, $16, %rsp
+; movq %rbx, 0(%rsp)
+; block0:
+; movq %rdx, %r11
+; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Smin %r11:%rsi; 0(%rdi) = %rcx:%rbx }
+; movq 0(%rsp), %rbx
+; addq %rsp, $16, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; subq $0x10, %rsp
+; movq %rbx, (%rsp)
+; block1: ; offset 0xc
+; movq %rdx, %r11
+; movq (%rdi), %rax ; trap: heap_oob
+; movq 8(%rdi), %rdx ; trap: heap_oob
+; movq %rax, %rbx
+; movq %rdx, %rcx
+; cmpq %rsi, %rbx
+; sbbq %r11, %rcx
+; movq %rdx, %rcx
+; cmovgeq %rsi, %rbx
+; cmovgeq %r11, %rcx
+; lock cmpxchg16b (%rdi) ; trap: heap_oob
+; jne 0x16
+; movq (%rsp), %rbx
+; addq $0x10, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
+function %smax(i64, i128) -> i128 {
+block0(v0: i64, v1: i128):
+    v2 = atomic_rmw.i128 smax v0, v1
+    return v2
+}
+
+; VCode:
+; pushq %rbp
+; movq %rsp, %rbp
+; subq %rsp, $16, %rsp
+; movq %rbx, 0(%rsp)
+; block0:
+; movq %rdx, %r11
+; atomically { %rdx:%rax = 0(%rdi); %rcx:%rbx = %rdx:%rax Smax %r11:%rsi; 0(%rdi) = %rcx:%rbx }
+; movq 0(%rsp), %rbx
+; addq %rsp, $16, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; pushq %rbp
+; movq %rsp, %rbp
+; subq $0x10, %rsp
+; movq %rbx, (%rsp)
+; block1: ; offset 0xc
+; movq %rdx, %r11
+; movq (%rdi), %rax ; trap: heap_oob
+; movq 8(%rdi), %rdx ; trap: heap_oob
+; movq %rax, %rbx
+; movq %rdx, %rcx
+; cmpq %rsi, %rbx
+; sbbq %r11, %rcx
+; movq %rdx, %rcx
+; cmovlq %rsi, %rbx
+; cmovlq %r11, %rcx
+; lock cmpxchg16b (%rdi) ; trap: heap_oob
+; jne 0x16
+; movq (%rsp), %rbx
+; addq $0x10, %rsp
+; movq %rbp, %rsp
+; popq %rbp
+; retq
+
diff --git a/cranelift/filetests/filetests/runtests/atomic-128.clif b/cranelift/filetests/filetests/runtests/atomic-128.clif
new file mode 100644
index 000000000000..8ffe27ab457a
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/atomic-128.clif
@@ -0,0 +1,274 @@
+test interpret
+test run
+set enable_llvm_abi_extensions
+target x86_64 has_cmpxchg16b
+
+function %atomic_load(i128) -> i128 {
+    ss0 = explicit_slot 16
+
+block0(v0: i128):
+    stack_store.i128 v0, ss0
+    v1 = stack_addr.i64 ss0
+    v2 = atomic_load.i128 v1
+    return v2
+}
+; run: %atomic_load(0) == 0
+; run: %atomic_load(-1) == -1
+; run: %atomic_load(0x00000000_00000000_FFFFFFFF_FFFFFFFF) == 0x00000000_00000000_FFFFFFFF_FFFFFFFF
+; run: %atomic_load(0xFFFFFFFF_FFFFFFFF_00000000_00000000) == 0xFFFFFFFF_FFFFFFFF_00000000_00000000
+; run: %atomic_load(0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == 0xFEDCBA98_76543210_F7E6D5C4_B3A29180
+; run: %atomic_load(0xA00A00A0_0A00A00A_00A00A00_A00A00A0) == 0xA00A00A0_0A00A00A_00A00A00_A00A00A0
+; run: %atomic_load(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678) == 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678
+
+
+function
%atomic_store(i128) -> i128 { + ss0 = explicit_slot 16 + +block0(v0: i128): + v1 = stack_addr.i64 ss0 + atomic_store.i128 v0, v1 + v2 = stack_load.i128 ss0 + return v2 +} +; run: %atomic_store(0) == 0 +; run: %atomic_store(-1) == -1 +; run: %atomic_store(0x00000000_00000000_FFFFFFFF_FFFFFFFF) == 0x00000000_00000000_FFFFFFFF_FFFFFFFF +; run: %atomic_store(0xFFFFFFFF_FFFFFFFF_00000000_00000000) == 0xFFFFFFFF_FFFFFFFF_00000000_00000000 +; run: %atomic_store(0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == 0xFEDCBA98_76543210_F7E6D5C4_B3A29180 +; run: %atomic_store(0xA00A00A0_0A00A00A_00A00A00_A00A00A0) == 0xA00A00A0_0A00A00A_00A00A00_A00A00A0 +; run: %atomic_store(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678) == 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678 + + +function %atomic_cas(i128, i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128, v2: i128): + stack_store.i128 v0, ss0 + v3 = stack_addr.i64 ss0 + v4 = atomic_cas.i128 v3, v1, v2 + v5 = stack_load.i128 ss0 + return v5, v4 +} + +; run: %atomic_cas(0, 0, 2) == [2, 0] +; run: %atomic_cas(1, 0, 2) == [1, 1] +; run: %atomic_cas(0, 1, 2) == [0, 0] +; run: %atomic_cas(0, 0xC0FFEEEE_ABCDEF01_00000000_00000000, 0xDECAFFFF_12345678) == [0, 0] +; run: %atomic_cas(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xFEDCBA98_76543210_F7E6D5C4_B3A29180, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_add(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 add v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_add(0, 0) == [0, 0] +; run: %atomic_add(1, 0) == [1, 1] +; run: %atomic_add(0, 1) == [1, 0] +; run: %atomic_add(1, 1) == [2, 1] +; run: %atomic_add(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_add(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xBFDCA987_22222112_D6B1D5C3_C5D6E7F8, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_sub(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 sub v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_sub(0, 0) == [0, 0] +; run: %atomic_sub(1, 0) == [1, 1] +; run: %atomic_sub(0, 1) == [-1, 0] +; run: %atomic_sub(1, 1) == [0, 1] +; run: %atomic_sub(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0x3F001111_543210FF_DECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_sub(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xC2233456_3579BCF0_E6E42A3A_5E91C4F8, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_and(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 and v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_and(0, 0) == [0, 0] +; run: %atomic_and(1, 0) == [0, 1] +; run: %atomic_and(0, 1) == [0, 0] +; run: %atomic_and(1, 1) == [1, 1] +; run: %atomic_and(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0, 0xDECAFFFF_12345678] +; run: %atomic_and(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xC0DCAA88_22442200_D6C2D5C4_12201000, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function 
%atomic_nand(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 nand v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_nand(0, 0) == [-1, 0] +; run: %atomic_nand(1, 0) == [-1, 1] +; run: %atomic_nand(0, 1) == [-1, 0] +; run: %atomic_nand(1, 1) == [-2, 1] +; run: %atomic_nand(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [-1, 0xDECAFFFF_12345678] +; run: %atomic_nand(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0x3F235577_DDBBDDFF_293D2A3B_EDDFEFFF, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_or(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 or v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_or(0, 0) == [0, 0] +; run: %atomic_or(1, 0) == [1, 1] +; run: %atomic_or(0, 1) == [1, 0] +; run: %atomic_or(1, 1) == [1, 1] +; run: %atomic_or(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_or(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xFEFFFEFE_FFDDFF11_FFEEFFFF_B3B6D7F8, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_xor(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 xor v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_xor(0, 0) == [0, 0] +; run: %atomic_xor(1, 0) == [1, 1] +; run: %atomic_xor(0, 1) == [1, 0] +; run: %atomic_xor(1, 1) == [0, 1] +; run: %atomic_xor(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_xor(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0x3E235476_DD99DD11_292C2A3B_A196C7F8, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_xchg(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 xchg v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_xchg(0, 0) == [0, 0] +; run: %atomic_xchg(1, 0) == [0, 1] +; run: %atomic_xchg(0, 1) == [1, 0] +; run: %atomic_xchg(1, 1) == [1, 1] +; run: %atomic_xchg(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_00000000_00000000, 0xDECAFFFF_12345678] +; run: %atomic_xchg(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xFEDCBA98_76543210_F7E6D5C4_B3A29180, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_umin(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 umin v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_umin(0, 0) == [0, 0] +; run: %atomic_umin(1, 0) == [0, 1] +; run: %atomic_umin(0, 1) == [0, 0] +; run: %atomic_umin(1, 1) == [1, 1] +; run: %atomic_umin(-1, 1) == [1, -1] +; run: %atomic_umin(1, -1) == [1, 1] +; run: %atomic_umin(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xDECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_umin(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == 
[0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_umax(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 umax v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_umax(0, 0) == [0, 0] +; run: %atomic_umax(1, 0) == [1, 1] +; run: %atomic_umax(0, 1) == [1, 0] +; run: %atomic_umax(1, 1) == [1, 1] +; run: %atomic_umax(-1, 1) == [-1, -1] +; run: %atomic_umax(1, -1) == [-1, 1] +; run: %atomic_umax(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_00000000_00000000, 0xDECAFFFF_12345678] +; run: %atomic_umax(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xFEDCBA98_76543210_F7E6D5C4_B3A29180, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_smin(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 smin v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_smin(0, 0) == [0, 0] +; run: %atomic_smin(1, 0) == [0, 1] +; run: %atomic_smin(0, 1) == [0, 0] +; run: %atomic_smin(1, 1) == [1, 1] +; run: %atomic_smin(-1, 1) == [-1, -1] +; run: %atomic_smin(1, -1) == [-1, 1] +; run: %atomic_smin(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xC0FFEEEE_ABCDEF01_00000000_00000000, 0xDECAFFFF_12345678] +; run: %atomic_smin(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] + + +function %atomic_smax(i128, i128) -> i128, i128 { + ss0 = explicit_slot 16 + +block0(v0: i128, v1: i128): + stack_store.i128 v0, ss0 + v2 = stack_addr.i64 ss0 + v3 = atomic_rmw.i128 smax v2, v1 + v4 = stack_load.i128 ss0 + return v4, v3 +} + +; run: %atomic_smax(0, 0) == [0, 0] +; run: %atomic_smax(1, 0) == [1, 1] +; run: %atomic_smax(0, 1) == [1, 0] +; run: %atomic_smax(1, 1) == [1, 1] +; run: %atomic_smax(-1, 1) == [1, -1] +; run: %atomic_smax(1, -1) == [1, 1] +; run: %atomic_smax(0xDECAFFFF_12345678, 0xC0FFEEEE_ABCDEF01_00000000_00000000) == [0xDECAFFFF_12345678, 0xDECAFFFF_12345678] +; run: %atomic_smax(0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678, 0xFEDCBA98_76543210_F7E6D5C4_B3A29180) == [0xFEDCBA98_76543210_F7E6D5C4_B3A29180, 0xC0FFEEEE_ABCDEF01_DECAFFFF_12345678] diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs index a9d638b188ba..f159a33d5ed7 100644 --- a/cranelift/native/src/lib.rs +++ b/cranelift/native/src/lib.rs @@ -49,6 +49,9 @@ pub fn infer_native_flags(isa_builder: &mut dyn Configurable) -> Result<(), &'st return Err("x86 support requires SSE2"); } + if std::is_x86_feature_detected!("cmpxchg16b") { + isa_builder.enable("has_cmpxchg16b").unwrap(); + } if std::is_x86_feature_detected!("sse3") { isa_builder.enable("has_sse3").unwrap(); } diff --git a/crates/fuzzing/src/generators/codegen_settings.rs b/crates/fuzzing/src/generators/codegen_settings.rs index 62d58c4e85da..c394c2326c02 100644 --- a/crates/fuzzing/src/generators/codegen_settings.rs +++ b/crates/fuzzing/src/generators/codegen_settings.rs @@ -98,6 +98,7 @@ impl<'a> Arbitrary<'a> for CodegenSettings { "x86_64" => { test: is_x86_feature_detected, + std:"cmpxchg16b" => clif:"has_cmpxchg16b", std:"sse3" => clif:"has_sse3", std:"ssse3" => clif:"has_ssse3", std:"sse4.1" => clif:"has_sse41", diff --git a/crates/wasmtime/src/config.rs 
b/crates/wasmtime/src/config.rs
index 5aa1e89c9af2..26337fbb6227 100644
--- a/crates/wasmtime/src/config.rs
+++ b/crates/wasmtime/src/config.rs
@@ -3066,6 +3066,7 @@ fn detect_host_feature(feature: &str) -> Option<bool> {
     #[cfg(target_arch = "x86_64")]
     {
         return match feature {
+            "cmpxchg16b" => Some(std::is_x86_feature_detected!("cmpxchg16b")),
             "sse3" => Some(std::is_x86_feature_detected!("sse3")),
             "ssse3" => Some(std::is_x86_feature_detected!("ssse3")),
             "sse4.1" => Some(std::is_x86_feature_detected!("sse4.1")),
diff --git a/crates/wasmtime/src/engine.rs b/crates/wasmtime/src/engine.rs
index c2458bdce4a9..de6b2f4a4b18 100644
--- a/crates/wasmtime/src/engine.rs
+++ b/crates/wasmtime/src/engine.rs
@@ -426,6 +426,7 @@ impl Engine {
             "has_mie2" => "mie2",

             // x64 features to detect
+            "has_cmpxchg16b" => "cmpxchg16b",
             "has_sse3" => "sse3",
             "has_ssse3" => "ssse3",
             "has_sse41" => "sse4.1",

From 89b364ed785b47ad60a096f4a543851e8546c8e1 Mon Sep 17 00:00:00 2001
From: Nick Fitzgerald
Date: Mon, 14 Oct 2024 09:13:41 -0700
Subject: [PATCH 2/2] fix typo in cranelift/codegen/src/isa/x64/inst/emit.rs

---
 cranelift/codegen/src/isa/x64/inst/emit.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 3b1031c8adbd..0f5afdcc2381 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -4261,7 +4261,7 @@ pub(crate) fn emit(
                         // `cmp_rmi_r` and `alu_rmi_r` have opposite argument orders.
                         Inst::cmp_rmi_r(OperandSize::Size64, temp_low.to_reg(), operand_low_rmi)
                             .emit(sink, info, state);
-                        // Thie will clobber `temp_high`
+                        // This will clobber `temp_high`
                         Inst::alu_rmi_r(
                             OperandSize::Size64,
                             AluRmiROpcode::Sbb,