From 85f16f488d4a0047e40a885fdacda832d46815e8 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 4 Aug 2021 15:57:56 -0500 Subject: [PATCH] Consolidate address calculations for atomics (#3143) * Consolidate address calculations for atomics This commit consolidates all calculations of guest addresses into one `prepare_addr` function. This notably removes the atomics-specific paths as well as the `prepare_load` function (now renamed to `prepare_addr` and folded into `get_heap_addr`). The goal of this commit is to simplify how addresses are managed in the code generator for atomics to use all the shared infrastructure of other loads/stores as well. This additionally fixes #3132 via the use of `heap_addr` in clif for all operations. I also added a number of tests for loads/stores with varying alignments. Originally I was going to allow loads/stores to not be aligned since that's what the current formal specification says, but the overview of the threads proposal disagrees with the formal specification, so I figured I'd leave it as-is but adding tests probably doesn't hurt. Closes #3132 * Fix old backend * Guarantee misalignment checks happen before out-of-bounds --- cranelift/wasm/src/code_translator.rs | 289 +++++++----------- cranelift/wasm/src/environ/spec.rs | 8 + crates/environ/src/builtin.rs | 6 +- crates/runtime/src/libcalls.rs | 102 +++++-- tests/all/wast.rs | 5 + .../threads/load-store-alignment.wast | 126 ++++++++ 6 files changed, 328 insertions(+), 208 deletions(-) create mode 100644 tests/misc_testsuite/threads/load-store-alignment.wast diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index da53a5dc48bc..914b7beb0595 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -694,32 +694,32 @@ pub fn translate_operator( translate_load(memarg, ir::Opcode::Load, I8X16, builder, state, environ)?; } Operator::V128Load8x8S { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = builder.ins().sload8x8(flags, base, offset); state.push1(loaded); } Operator::V128Load8x8U { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = builder.ins().uload8x8(flags, base, offset); state.push1(loaded); } Operator::V128Load16x4S { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = builder.ins().sload16x4(flags, base, offset); state.push1(loaded); } Operator::V128Load16x4U { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = builder.ins().uload16x4(flags, base, offset); state.push1(loaded); } Operator::V128Load32x2S { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = builder.ins().sload32x2(flags, base, offset); state.push1(loaded); } Operator::V128Load32x2U { memarg } => { - let (flags, base, offset) = prepare_load(memarg, 8, builder, state, environ)?; + let (flags, base, offset) = prepare_addr(memarg, 8, builder, state, environ)?; let loaded = 
builder.ins().uload32x2(flags, base, offset); state.push1(loaded); } @@ -1064,8 +1064,8 @@ pub fn translate_operator( let heap = state.get_heap(builder.func, memarg.memory, environ)?; let timeout = state.pop1(); // 64 (fixed) let expected = state.pop1(); // 32 or 64 (per the `Ixx` in `IxxAtomicWait`) - let addr = state.pop1(); // 32 (fixed) - let addr = fold_atomic_mem_addr(addr, memarg, implied_ty, builder); + let (_flags, addr) = + prepare_atomic_addr(memarg, implied_ty.bytes(), builder, state, environ)?; assert!(builder.func.dfg.value_type(expected) == implied_ty); // `fn translate_atomic_wait` can inspect the type of `expected` to figure out what // code it needs to generate, if it wants. @@ -1083,8 +1083,10 @@ pub fn translate_operator( let heap_index = MemoryIndex::from_u32(memarg.memory); let heap = state.get_heap(builder.func, memarg.memory, environ)?; let count = state.pop1(); // 32 (fixed) - let addr = state.pop1(); // 32 (fixed) - let addr = fold_atomic_mem_addr(addr, memarg, I32, builder); + + // `memory.atomic.notify` is defined to have an access size of 4 + // bytes in the spec, even though it doesn't necessarily access memory. + let (_flags, addr) = prepare_atomic_addr(memarg, 4, builder, state, environ)?; let res = environ.translate_atomic_notify(builder.cursor(), heap_index, heap, addr, count)?; state.push1(res); @@ -2146,23 +2148,29 @@ fn translate_unreachable_operator( Ok(()) } -/// Get the address+offset to use for a heap access. -fn get_heap_addr( - heap: ir::Heap, - addr32: ir::Value, - offset: u64, - width: u32, - addr_ty: Type, +/// This function is a generalized helper for validating that a wasm-supplied +/// heap address is in-bounds. +/// +/// This function takes a litany of parameters and requires that the address to +/// be verified is at the top of the stack in `state`. This will generate +/// necessary IR to validate that the heap address is correctly in-bounds, and +/// various parameters are returned describing the valid heap address if +/// execution reaches that point. +fn prepare_addr( + memarg: &MemoryImmediate, + access_size: u32, builder: &mut FunctionBuilder, -) -> (ir::Value, i32) { + state: &mut FuncTranslationState, + environ: &mut FE, +) -> WasmResult<(MemFlags, Value, Offset32)> { + let addr = state.pop1(); // This function will need updates for 64-bit memories - debug_assert_eq!(builder.func.dfg.value_type(addr32), I32); + debug_assert_eq!(builder.func.dfg.value_type(addr), I32); + let offset = u32::try_from(memarg.offset).unwrap(); + let heap = state.get_heap(builder.func, memarg.memory, environ)?; let offset_guard_size: u64 = builder.func.heaps[heap].offset_guard_size.into(); - // Currently this function only supports 32-bit memories. - let offset = u32::try_from(offset).unwrap(); - // How exactly the bounds check is performed here and what it's performed // on is a bit tricky. Generally we want to rely on access violations (e.g. // segfaults) to generate traps since that means we don't have to bounds @@ -2214,54 +2222,86 @@ fn get_heap_addr( // offsets we're checking here are zero. 
This means that we'll hit the fast // path and emit zero conditional traps for bounds checks let adjusted_offset = if offset_guard_size == 0 { - u64::from(offset) + u64::from(width) + u64::from(offset) + u64::from(access_size) } else { - assert!(width < 1024); + assert!(access_size < 1024); cmp::max(u64::from(offset) / offset_guard_size * offset_guard_size, 1) }; debug_assert!(adjusted_offset > 0); // want to bounds check at least 1 byte let check_size = u32::try_from(adjusted_offset).unwrap_or(u32::MAX); - let base = builder.ins().heap_addr(addr_ty, heap, addr32, check_size); + let base = builder + .ins() + .heap_addr(environ.pointer_type(), heap, addr, check_size); // Native load/store instructions take a signed `Offset32` immediate, so adjust the base // pointer if necessary. - if offset > i32::MAX as u32 { + let (addr, offset) = if offset > i32::MAX as u32 { // Offset doesn't fit in the load/store instruction. let adj = builder.ins().iadd_imm(base, i64::from(i32::MAX) + 1); (adj, (offset - (i32::MAX as u32 + 1)) as i32) } else { (base, offset as i32) - } + }; + + // Note that we don't set `is_aligned` here, even if the load instruction's + // alignment immediate says it's aligned, because WebAssembly's + // immediate field is just a hint, while Cranelift's aligned flag needs a + // guarantee. WebAssembly memory accesses are always little-endian. + let mut flags = MemFlags::new(); + flags.set_endianness(ir::Endianness::Little); + + Ok((flags, addr, offset.into())) } -/// Prepare for a load; factors out common functionality between load and load_extend operations. -fn prepare_load( +fn prepare_atomic_addr( memarg: &MemoryImmediate, loaded_bytes: u32, builder: &mut FunctionBuilder, state: &mut FuncTranslationState, environ: &mut FE, -) -> WasmResult<(MemFlags, Value, Offset32)> { - let addr = state.pop1(); +) -> WasmResult<(MemFlags, Value)> { + // Atomic addresses must all be aligned correctly, and for now we check + // alignment before we check out-of-bounds-ness. The order of this check may + // need to be updated depending on the outcome of the official threads + // proposal itself. + // + // Note that with an offset>0 we generate an `iadd_imm` where the result is + // thrown away after the offset check. This may truncate the offset and the + // result may overflow as well, but those conditions won't affect the + // alignment check itself. This can probably be optimized better and we + // should do so in the future as well. + if loaded_bytes > 1 { + let addr = state.pop1(); // "peek" via pop then push + state.push1(addr); + let effective_addr = if memarg.offset == 0 { + addr + } else { + builder + .ins() + .iadd_imm(addr, i64::from(memarg.offset as i32)) + }; + debug_assert!(loaded_bytes.is_power_of_two()); + let misalignment = builder + .ins() + .band_imm(effective_addr, i64::from(loaded_bytes - 1)); + let f = builder.ins().ifcmp_imm(misalignment, 0); + builder + .ins() + .trapif(IntCC::NotEqual, f, ir::TrapCode::HeapMisaligned); + } - let heap = state.get_heap(builder.func, memarg.memory, environ)?; - let (base, offset) = get_heap_addr( - heap, - addr, - memarg.offset, - loaded_bytes, - environ.pointer_type(), - builder, - ); + let (flags, mut addr, offset) = prepare_addr(memarg, loaded_bytes, builder, state, environ)?; - // Note that we don't set `is_aligned` here, even if the load instruction's - // alignment immediate says it's aligned, because WebAssembly's immediate - // field is just a hint, while Cranelift's aligned flag needs a guarantee. 
- // WebAssembly memory accesses are always little-endian. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); + // Currently cranelift IR operations for atomics don't have offsets + // associated with them so we fold the offset into the address itself. Note + // that via the `prepare_addr` helper we know that if execution reaches + // this point that this addition won't overflow. + let offset: i64 = offset.into(); + if offset != 0 { + addr = builder.ins().iadd_imm(addr, offset); + } - Ok((flags, base, offset.into())) + Ok((flags, addr)) } /// Translate a load instruction. @@ -2273,7 +2313,7 @@ fn translate_load( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (flags, base, offset) = prepare_load( + let (flags, base, offset) = prepare_addr( memarg, mem_op_size(opcode, result_ty), builder, @@ -2293,21 +2333,11 @@ fn translate_store( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (addr32, val) = state.pop2(); + let val = state.pop1(); let val_ty = builder.func.dfg.value_type(val); - let heap = state.get_heap(builder.func, memarg.memory, environ)?; - let (base, offset) = get_heap_addr( - heap, - addr32, - memarg.offset, - mem_op_size(opcode, val_ty), - environ.pointer_type(), - builder, - ); - // See the comments in `prepare_load` about the flags. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); + let (flags, base, offset) = + prepare_addr(memarg, mem_op_size(opcode, val_ty), builder, state, environ)?; builder .ins() .Store(opcode, val_ty, flags, offset.into(), val, base); @@ -2330,92 +2360,6 @@ fn translate_icmp(cc: IntCC, builder: &mut FunctionBuilder, state: &mut FuncTran state.push1(builder.ins().bint(I32, val)); } -fn fold_atomic_mem_addr( - linear_mem_addr: Value, - memarg: &MemoryImmediate, - access_ty: Type, - builder: &mut FunctionBuilder, -) -> Value { - let access_ty_bytes = access_ty.bytes(); - let final_lma = if memarg.offset > 0 { - // Note that 32-bit memories are only supported here at this time, the - // logic here (e.g. the `iadd_imm` will need to check for overflow and - // other bits and pieces for 64-bit memories. - assert!(builder.func.dfg.value_type(linear_mem_addr) == I32); - let linear_mem_addr = builder.ins().uextend(I64, linear_mem_addr); - let a = builder - .ins() - .iadd_imm(linear_mem_addr, i64::try_from(memarg.offset).unwrap()); - let cflags = builder.ins().ifcmp_imm(a, 0x1_0000_0000i64); - builder.ins().trapif( - IntCC::UnsignedGreaterThanOrEqual, - cflags, - ir::TrapCode::HeapOutOfBounds, - ); - builder.ins().ireduce(I32, a) - } else { - linear_mem_addr - }; - assert!(access_ty_bytes == 4 || access_ty_bytes == 8); - let final_lma_misalignment = builder - .ins() - .band_imm(final_lma, i64::from(access_ty_bytes - 1)); - let f = builder - .ins() - .ifcmp_imm(final_lma_misalignment, i64::from(0)); - builder - .ins() - .trapif(IntCC::NotEqual, f, ir::TrapCode::HeapMisaligned); - final_lma -} - -// For an atomic memory operation, emit an alignment check for the linear memory address, -// and then compute the final effective address. -fn finalise_atomic_mem_addr( - linear_mem_addr: Value, - memarg: &MemoryImmediate, - access_ty: Type, - builder: &mut FunctionBuilder, - state: &mut FuncTranslationState, - environ: &mut FE, -) -> WasmResult { - // Check the alignment of `linear_mem_addr`. - // - // Note that the `iadd_imm` here and the `try_from` only works for 32-bit - // memories. 
- let access_ty_bytes = access_ty.bytes(); - assert!(builder.func.dfg.value_type(linear_mem_addr) == I32); - let final_lma = builder - .ins() - .iadd_imm(linear_mem_addr, i64::try_from(memarg.offset).unwrap()); - if access_ty_bytes != 1 { - assert!(access_ty_bytes == 2 || access_ty_bytes == 4 || access_ty_bytes == 8); - let final_lma_misalignment = builder - .ins() - .band_imm(final_lma, i64::from(access_ty_bytes - 1)); - let f = builder - .ins() - .ifcmp_imm(final_lma_misalignment, i64::from(0)); - builder - .ins() - .trapif(IntCC::NotEqual, f, ir::TrapCode::HeapMisaligned); - } - - // Compute the final effective address. - let heap = state.get_heap(builder.func, memarg.memory, environ)?; - let (base, offset) = get_heap_addr( - heap, - final_lma, - /*offset=*/ 0, - access_ty.bytes(), - environ.pointer_type(), - builder, - ); - - let final_effective_address = builder.ins().iadd_imm(base, i64::from(offset)); - Ok(final_effective_address) -} - fn translate_atomic_rmw( widened_ty: Type, access_ty: Type, @@ -2425,7 +2369,7 @@ fn translate_atomic_rmw( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (linear_mem_addr, mut arg2) = state.pop2(); + let mut arg2 = state.pop1(); let arg2_ty = builder.func.dfg.value_type(arg2); // The operation is performed at type `access_ty`, and the old value is zero-extended @@ -2450,15 +2394,9 @@ fn translate_atomic_rmw( arg2 = builder.ins().ireduce(access_ty, arg2); } - let final_effective_address = - finalise_atomic_mem_addr(linear_mem_addr, memarg, access_ty, builder, state, environ)?; + let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; - // See the comments in `prepare_load` about the flags. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); - let mut res = builder - .ins() - .atomic_rmw(access_ty, flags, op, final_effective_address, arg2); + let mut res = builder.ins().atomic_rmw(access_ty, flags, op, addr, arg2); if access_ty != widened_ty { res = builder.ins().uextend(widened_ty, res); } @@ -2474,7 +2412,7 @@ fn translate_atomic_cas( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (linear_mem_addr, mut expected, mut replacement) = state.pop3(); + let (mut expected, mut replacement) = state.pop2(); let expected_ty = builder.func.dfg.value_type(expected); let replacement_ty = builder.func.dfg.value_type(replacement); @@ -2504,15 +2442,8 @@ fn translate_atomic_cas( replacement = builder.ins().ireduce(access_ty, replacement); } - let final_effective_address = - finalise_atomic_mem_addr(linear_mem_addr, memarg, access_ty, builder, state, environ)?; - - // See the comments in `prepare_load` about the flags. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); - let mut res = builder - .ins() - .atomic_cas(flags, final_effective_address, expected, replacement); + let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + let mut res = builder.ins().atomic_cas(flags, addr, expected, replacement); if access_ty != widened_ty { res = builder.ins().uextend(widened_ty, res); } @@ -2528,8 +2459,6 @@ fn translate_atomic_load( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let linear_mem_addr = state.pop1(); - // The load is performed at type `access_ty`, and the loaded value is zero extended // to `widened_ty`. 
match access_ty { @@ -2547,15 +2476,8 @@ fn translate_atomic_load( }; assert!(w_ty_ok && widened_ty.bytes() >= access_ty.bytes()); - let final_effective_address = - finalise_atomic_mem_addr(linear_mem_addr, memarg, access_ty, builder, state, environ)?; - - // See the comments in `prepare_load` about the flags. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); - let mut res = builder - .ins() - .atomic_load(access_ty, flags, final_effective_address); + let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + let mut res = builder.ins().atomic_load(access_ty, flags, addr); if access_ty != widened_ty { res = builder.ins().uextend(widened_ty, res); } @@ -2570,7 +2492,7 @@ fn translate_atomic_store( state: &mut FuncTranslationState, environ: &mut FE, ) -> WasmResult<()> { - let (linear_mem_addr, mut data) = state.pop2(); + let mut data = state.pop1(); let data_ty = builder.func.dfg.value_type(data); // The operation is performed at type `access_ty`, and the data to be stored may first @@ -2594,15 +2516,8 @@ fn translate_atomic_store( data = builder.ins().ireduce(access_ty, data); } - let final_effective_address = - finalise_atomic_mem_addr(linear_mem_addr, memarg, access_ty, builder, state, environ)?; - - // See the comments in `prepare_load` about the flags. - let mut flags = MemFlags::new(); - flags.set_endianness(ir::Endianness::Little); - builder - .ins() - .atomic_store(flags, data, final_effective_address); + let (flags, addr) = prepare_atomic_addr(memarg, access_ty.bytes(), builder, state, environ)?; + builder.ins().atomic_store(flags, data, addr); Ok(()) } diff --git a/cranelift/wasm/src/environ/spec.rs b/cranelift/wasm/src/environ/spec.rs index 474aa059014d..7949acc6b0c1 100644 --- a/cranelift/wasm/src/environ/spec.rs +++ b/cranelift/wasm/src/environ/spec.rs @@ -613,6 +613,10 @@ pub trait FuncEnvironment: TargetEnvironment { /// for the same index. Whether the waited-on value is 32- or 64-bit can be /// determined by examining the type of `expected`, which must be only I32 or I64. /// + /// Note that the `addr` here is the host linear memory address rather + /// than a relative wasm linear memory address. The type of this value is + /// the same as the host's pointer. + /// /// Returns an i32, which is negative if the helper call failed. fn translate_atomic_wait( &mut self, @@ -629,6 +633,10 @@ pub trait FuncEnvironment: TargetEnvironment { /// to wait on, and `heap` is the heap reference returned by `make_heap` /// for the same index. /// + /// Note that the `addr` here is the host linear memory address rather + /// than a relative wasm linear memory address. The type of this value is + /// the same as the host's pointer. + /// /// Returns an i64, which is negative if the helper call failed. fn translate_atomic_notify( &mut self, diff --git a/crates/environ/src/builtin.rs b/crates/environ/src/builtin.rs index 2d56cf9b10ad..49c3f16dfe9c 100644 --- a/crates/environ/src/builtin.rs +++ b/crates/environ/src/builtin.rs @@ -38,11 +38,11 @@ macro_rules! foreach_builtin_function { /// Returns an index for Wasm's `global.get` instruction for `externref`s. externref_global_set(vmctx, i32, reference) -> (); /// Returns an index for wasm's `memory.atomic.notify` instruction. - memory_atomic_notify(vmctx, i32, i32, i32) -> (i32); + memory_atomic_notify(vmctx, i32, pointer, i32) -> (i32); /// Returns an index for wasm's `memory.atomic.wait32` instruction. 
- memory_atomic_wait32(vmctx, i32, i32, i32, i64) -> (i32); + memory_atomic_wait32(vmctx, i32, pointer, i32, i64) -> (i32); /// Returns an index for wasm's `memory.atomic.wait64` instruction. - memory_atomic_wait64(vmctx, i32, i32, i64, i64) -> (i32); + memory_atomic_wait64(vmctx, i32, pointer, i64, i64) -> (i32); /// Invoked when fuel has run out while executing a function. out_of_gas(vmctx) -> (); } diff --git a/crates/runtime/src/libcalls.rs b/crates/runtime/src/libcalls.rs index a578e7f3d605..816e888a3169 100644 --- a/crates/runtime/src/libcalls.rs +++ b/crates/runtime/src/libcalls.rs @@ -57,11 +57,14 @@ //! ``` use crate::externref::VMExternRef; +use crate::instance::Instance; use crate::table::Table; use crate::traphandlers::{raise_lib_trap, Trap}; use crate::vmcontext::{VMCallerCheckedAnyfunc, VMContext}; +use backtrace::Backtrace; use std::mem; use std::ptr::{self, NonNull}; +use wasmtime_environ::ir::TrapCode; use wasmtime_environ::wasm::{ DataIndex, ElemIndex, GlobalIndex, MemoryIndex, TableElementType, TableIndex, }; @@ -449,40 +452,103 @@ impl std::fmt::Display for Unimplemented { /// Implementation of `memory.atomic.notify` for locally defined memories. pub unsafe extern "C" fn wasmtime_memory_atomic_notify( - _vmctx: *mut VMContext, - _memory_index: u32, - _addr: u32, + vmctx: *mut VMContext, + memory_index: u32, + addr: usize, _count: u32, ) -> u32 { - raise_lib_trap(Trap::User(Box::new(Unimplemented( - "wasm atomics (fn wasmtime_memory_atomic_notify) unsupported", - )))); + let result = { + let memory = MemoryIndex::from_u32(memory_index); + let instance = (*vmctx).instance(); + // this should never overflow since addr + 4 either hits a guard page + // or it's been validated to be in-bounds already. Double-check for now + // just to be sure. + let addr_to_check = addr.checked_add(4).unwrap(); + validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { + Err(Trap::User(Box::new(Unimplemented( + "wasm atomics (fn wasmtime_memory_atomic_notify) unsupported", + )))) + }) + }; + match result { + Ok(n) => n, + Err(e) => raise_lib_trap(e), + } } /// Implementation of `memory.atomic.wait32` for locally defined memories. pub unsafe extern "C" fn wasmtime_memory_atomic_wait32( - _vmctx: *mut VMContext, - _memory_index: u32, - _addr: u32, + vmctx: *mut VMContext, + memory_index: u32, + addr: usize, _expected: u32, _timeout: u64, ) -> u32 { - raise_lib_trap(Trap::User(Box::new(Unimplemented( - "wasm atomics (fn wasmtime_memory_atomic_wait32) unsupported", - )))); + let result = { + let memory = MemoryIndex::from_u32(memory_index); + let instance = (*vmctx).instance(); + // see wasmtime_memory_atomic_notify for why this shouldn't overflow + // but we still double-check + let addr_to_check = addr.checked_add(4).unwrap(); + validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { + Err(Trap::User(Box::new(Unimplemented( + "wasm atomics (fn wasmtime_memory_atomic_wait32) unsupported", + )))) + }) + }; + match result { + Ok(n) => n, + Err(e) => raise_lib_trap(e), + } } /// Implementation of `memory.atomic.wait64` for locally defined memories. 
pub unsafe extern "C" fn wasmtime_memory_atomic_wait64( - _vmctx: *mut VMContext, - _memory_index: u32, - _addr: u32, + vmctx: *mut VMContext, + memory_index: u32, + addr: usize, _expected: u64, _timeout: u64, ) -> u32 { - raise_lib_trap(Trap::User(Box::new(Unimplemented( - "wasm atomics (fn wasmtime_memory_atomic_wait32) unsupported", - )))); + let result = { + let memory = MemoryIndex::from_u32(memory_index); + let instance = (*vmctx).instance(); + // see wasmtime_memory_atomic_notify for why this shouldn't overflow + // but we still double-check + let addr_to_check = addr.checked_add(8).unwrap(); + validate_atomic_addr(instance, memory, addr_to_check).and_then(|()| { + Err(Trap::User(Box::new(Unimplemented( + "wasm atomics (fn wasmtime_memory_atomic_wait64) unsupported", + )))) + }) + }; + match result { + Ok(n) => n, + Err(e) => raise_lib_trap(e), + } +} + +/// For atomic operations we still check the actual address despite this also +/// being checked via the `heap_addr` instruction in cranelift. The reason for +/// that is because the `heap_addr` instruction can defer to a later segfault to +/// actually recognize the out-of-bounds whereas once we're running Rust code +/// here we don't want to segfault. +/// +/// In the situations where bounds checks were elided in JIT code (because oob +/// would then be later guaranteed to segfault) this manual check is here +/// so we don't segfault from Rust. +unsafe fn validate_atomic_addr( + instance: &Instance, + memory: MemoryIndex, + addr: usize, +) -> Result<(), Trap> { + if addr > instance.get_memory(memory).current_length { + return Err(Trap::Wasm { + trap_code: TrapCode::HeapOutOfBounds, + backtrace: Backtrace::new_unresolved(), + }); + } + Ok(()) } /// Hook for when an instance runs out of fuel. diff --git a/tests/all/wast.rs b/tests/all/wast.rs index 45e874da1f31..c4e10da30d07 100644 --- a/tests/all/wast.rs +++ b/tests/all/wast.rs @@ -25,6 +25,11 @@ fn run_wast(wast: &str, strategy: Strategy, pooling: bool) -> anyhow::Result<()> // by reference types. let reftypes = simd || wast.iter().any(|s| s == "reference-types"); + // Threads aren't implemented in the old backend, so skip those tests. 
+ if threads && cfg!(feature = "old-x86-backend") { + return Ok(()); + } + let mut cfg = Config::new(); cfg.wasm_simd(simd) .wasm_bulk_memory(bulk_mem) diff --git a/tests/misc_testsuite/threads/load-store-alignment.wast b/tests/misc_testsuite/threads/load-store-alignment.wast new file mode 100644 index 000000000000..5d8fff8db7ff --- /dev/null +++ b/tests/misc_testsuite/threads/load-store-alignment.wast @@ -0,0 +1,126 @@ +(module + ;; NB this should use a shared memory when it's supported + (memory 1) + + (func (export "32.load8u") (param i32) (result i32) + local.get 0 i32.atomic.load8_u) + (func (export "32.load16u") (param i32) (result i32) + local.get 0 i32.atomic.load16_u) + (func (export "32.load32u") (param i32) (result i32) + local.get 0 i32.atomic.load) + (func (export "64.load8u") (param i32) (result i64) + local.get 0 i64.atomic.load8_u) + (func (export "64.load16u") (param i32) (result i64) + local.get 0 i64.atomic.load16_u) + (func (export "64.load32u") (param i32) (result i64) + local.get 0 i64.atomic.load32_u) + (func (export "64.load64u") (param i32) (result i64) + local.get 0 i64.atomic.load) + + (func (export "32.store8") (param i32) + local.get 0 i32.const 0 i32.atomic.store8) + (func (export "32.store16") (param i32) + local.get 0 i32.const 0 i32.atomic.store16) + (func (export "32.store32") (param i32) + local.get 0 i32.const 0 i32.atomic.store) + (func (export "64.store8") (param i32) + local.get 0 i64.const 0 i64.atomic.store8) + (func (export "64.store16") (param i32) + local.get 0 i64.const 0 i64.atomic.store16) + (func (export "64.store32") (param i32) + local.get 0 i64.const 0 i64.atomic.store32) + (func (export "64.store64") (param i32) + local.get 0 i64.const 0 i64.atomic.store) + + (func (export "32.load8u o1") (param i32) (result i32) + local.get 0 i32.atomic.load8_u offset=1) + (func (export "32.load16u o1") (param i32) (result i32) + local.get 0 i32.atomic.load16_u offset=1) + (func (export "32.load32u o1") (param i32) (result i32) + local.get 0 i32.atomic.load offset=1) + (func (export "64.load8u o1") (param i32) (result i64) + local.get 0 i64.atomic.load8_u offset=1) + (func (export "64.load16u o1") (param i32) (result i64) + local.get 0 i64.atomic.load16_u offset=1) + (func (export "64.load32u o1") (param i32) (result i64) + local.get 0 i64.atomic.load32_u offset=1) + (func (export "64.load64u o1") (param i32) (result i64) + local.get 0 i64.atomic.load offset=1) + + (func (export "32.store8 o1") (param i32) + local.get 0 i32.const 0 i32.atomic.store8 offset=1) + (func (export "32.store16 o1") (param i32) + local.get 0 i32.const 0 i32.atomic.store16 offset=1) + (func (export "32.store32 o1") (param i32) + local.get 0 i32.const 0 i32.atomic.store offset=1) + (func (export "64.store8 o1") (param i32) + local.get 0 i64.const 0 i64.atomic.store8 offset=1) + (func (export "64.store16 o1") (param i32) + local.get 0 i64.const 0 i64.atomic.store16 offset=1) + (func (export "64.store32 o1") (param i32) + local.get 0 i64.const 0 i64.atomic.store32 offset=1) + (func (export "64.store64 o1") (param i32) + local.get 0 i64.const 0 i64.atomic.store offset=1) +) + +;; aligned loads +(assert_return (invoke "32.load8u" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "32.load16u" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "32.load32u" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "64.load8u" (i32.const 0)) (i64.const 0)) +(assert_return (invoke "64.load16u" (i32.const 0)) (i64.const 0)) +(assert_return (invoke "64.load64u" (i32.const 0)) (i64.const 
0)) +(assert_return (invoke "32.load8u o1" (i32.const 0)) (i32.const 0)) +(assert_return (invoke "32.load16u o1" (i32.const 1)) (i32.const 0)) +(assert_return (invoke "32.load32u o1" (i32.const 3)) (i32.const 0)) +(assert_return (invoke "64.load8u o1" (i32.const 0)) (i64.const 0)) +(assert_return (invoke "64.load16u o1" (i32.const 1)) (i64.const 0)) +(assert_return (invoke "64.load32u o1" (i32.const 3)) (i64.const 0)) +(assert_return (invoke "64.load64u o1" (i32.const 7)) (i64.const 0)) + +;; misaligned loads +(assert_return (invoke "32.load8u" (i32.const 1)) (i32.const 0)) +(assert_trap (invoke "32.load16u" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "32.load32u" (i32.const 1)) "misaligned memory access") +(assert_return (invoke "64.load8u" (i32.const 1)) (i64.const 0)) +(assert_trap (invoke "64.load16u" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "64.load32u" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "64.load64u" (i32.const 1)) "misaligned memory access") +(assert_return (invoke "32.load8u o1" (i32.const 0)) (i32.const 0)) +(assert_trap (invoke "32.load16u o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "32.load32u o1" (i32.const 0)) "misaligned memory access") +(assert_return (invoke "64.load8u o1" (i32.const 0)) (i64.const 0)) +(assert_trap (invoke "64.load16u o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "64.load32u o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "64.load64u o1" (i32.const 0)) "misaligned memory access") + +;; aligned stores +(assert_return (invoke "32.store8" (i32.const 0))) +(assert_return (invoke "32.store16" (i32.const 0))) +(assert_return (invoke "32.store32" (i32.const 0))) +(assert_return (invoke "64.store8" (i32.const 0))) +(assert_return (invoke "64.store16" (i32.const 0))) +(assert_return (invoke "64.store64" (i32.const 0))) +(assert_return (invoke "32.store8 o1" (i32.const 0))) +(assert_return (invoke "32.store16 o1" (i32.const 1))) +(assert_return (invoke "32.store32 o1" (i32.const 3))) +(assert_return (invoke "64.store8 o1" (i32.const 0))) +(assert_return (invoke "64.store16 o1" (i32.const 1))) +(assert_return (invoke "64.store32 o1" (i32.const 3))) +(assert_return (invoke "64.store64 o1" (i32.const 7))) + +;; misaligned stores +(assert_return (invoke "32.store8" (i32.const 1)) (i32.const 0)) +(assert_trap (invoke "32.store16" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "32.store32" (i32.const 1)) "misaligned memory access") +(assert_return (invoke "64.store8" (i32.const 1)) (i64.const 0)) +(assert_trap (invoke "64.store16" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "64.store32" (i32.const 1)) "misaligned memory access") +(assert_trap (invoke "64.store64" (i32.const 1)) "misaligned memory access") +(assert_return (invoke "32.store8 o1" (i32.const 0)) (i32.const 0)) +(assert_trap (invoke "32.store16 o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "32.store32 o1" (i32.const 0)) "misaligned memory access") +(assert_return (invoke "64.store8 o1" (i32.const 0)) (i64.const 0)) +(assert_trap (invoke "64.store16 o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "64.store32 o1" (i32.const 0)) "misaligned memory access") +(assert_trap (invoke "64.store64 o1" (i32.const 0)) "misaligned memory access")
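Not part of the patch above, but a small sketch of its last bullet ("Guarantee misalignment checks happen before out-of-bounds"), reusing the exports of the module in the new test file and assuming its 1-page (65536-byte) memory and the trap messages used by the existing asserts. Because `prepare_atomic_addr` emits the alignment trap before the `heap_addr` bounds check, an atomic access that is both misaligned and out of bounds should report the misalignment first.

;; hypothetical asserts, not present in load-store-alignment.wast: address
;; 65537 is both misaligned and out of bounds for a 1-page memory, so the
;; alignment check is expected to trap before the bounds check.
(assert_trap (invoke "32.load16u" (i32.const 65537)) "misaligned memory access")
(assert_trap (invoke "64.store64" (i32.const 65537)) "misaligned memory access")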