diff --git a/Cargo.lock b/Cargo.lock index f9ad78e37951d..1963f7c0d5662 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2524,14 +2524,12 @@ dependencies = [ "aes", "colored", "ctrlc", - "env_logger 0.10.2", "getrandom", "jemalloc-sys", "lazy_static", "libc", "libffi", "libloading", - "log", "measureme", "rand", "regex", diff --git a/src/tools/miri/CONTRIBUTING.md b/src/tools/miri/CONTRIBUTING.md index 7a49ff3372f5f..f2f3a642e0a03 100644 --- a/src/tools/miri/CONTRIBUTING.md +++ b/src/tools/miri/CONTRIBUTING.md @@ -78,6 +78,8 @@ custom target file, you might have to set `MIRI_NO_STD=1`. base directory, e.g. `./miri test fail` will run all compile-fail tests). These filters are passed to `cargo test`, so for multiple filters you need to use `./miri test -- FILTER1 FILTER2`. +#### Fine-grained logging + You can get a trace of which MIR statements are being executed by setting the `MIRI_LOG` environment variable. For example: @@ -94,9 +96,16 @@ stacked borrows implementation: MIRI_LOG=rustc_mir::interpret=info,miri::stacked_borrows ./miri run tests/pass/vec.rs ``` -In addition, you can set `MIRI_BACKTRACE=1` to get a backtrace of where an +Note that you will only get `info`, `warn` or `error` messages if you use a prebuilt compiler. +To get `debug` and `trace` level messages, you need to build Miri with a locally built +compiler that has `debug=true` set in `config.toml`. + +#### Debugging error messages + +You can set `MIRI_BACKTRACE=1` to get a backtrace of where an evaluation error was originally raised. + ### UI testing We use ui-testing in Miri, meaning we generate `.stderr` and `.stdout` files for the output diff --git a/src/tools/miri/Cargo.lock b/src/tools/miri/Cargo.lock index 8cd996d85645b..87dc51bd61255 100644 --- a/src/tools/miri/Cargo.lock +++ b/src/tools/miri/Cargo.lock @@ -273,19 +273,6 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" -[[package]] -name = "env_logger" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95b3f3e67048839cb0d0781f445682a35113da7121f7c949db0e2be96a4fbece" -dependencies = [ - "humantime", - "is-terminal", - "log", - "regex", - "termcolor", -] - [[package]] name = "errno" version = "0.3.8" @@ -339,18 +326,6 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" -[[package]] -name = "hermit-abi" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "indenter" version = "0.3.3" @@ -388,17 +363,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "is-terminal" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" -dependencies = [ - "hermit-abi", - "rustix", - "windows-sys 0.52.0", -] - [[package]] name = "itoa" version = "1.0.10" @@ -529,14 +493,12 @@ dependencies = [ "aes", "colored", "ctrlc", - "env_logger", "getrandom", "jemalloc-sys", "lazy_static", "libc", "libffi", "libloading", - "log", "measureme", "rand", "regex", @@ -875,15 +837,6 @@
dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "termcolor" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff1bc3d3f05aff0403e8ac0d92ced918ec05b666a43f83297ccef5bea8a3d449" -dependencies = [ - "winapi-util", -] - [[package]] name = "thiserror" version = "1.0.56" @@ -1034,15 +987,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/src/tools/miri/Cargo.toml b/src/tools/miri/Cargo.toml index a65010b055b63..39122c847ce0d 100644 --- a/src/tools/miri/Cargo.toml +++ b/src/tools/miri/Cargo.toml @@ -19,8 +19,6 @@ doctest = false # and no doc tests [dependencies] getrandom = { version = "0.2", features = ["std"] } -env_logger = "0.10" -log = "0.4" rand = "0.8" smallvec = "1.7" aes = { version = "0.8.3", features = ["hazmat"] } diff --git a/src/tools/miri/README.md b/src/tools/miri/README.md index 6695f123c7836..60bf07b1736e4 100644 --- a/src/tools/miri/README.md +++ b/src/tools/miri/README.md @@ -108,11 +108,8 @@ assume the right toolchain is pinned via `rustup override set nightly` or Now you can run your project in Miri: -1. Run `cargo clean` to eliminate any cached dependencies. Miri needs your - dependencies to be compiled the right way, that would not happen if they have - previously already been compiled. -2. To run all tests in your project through Miri, use `cargo miri test`. -3. If you have a binary project, you can run it through Miri using `cargo miri run`. +- To run all tests in your project through Miri, use `cargo miri test`. +- If you have a binary project, you can run it through Miri using `cargo miri run`. The first time you run Miri, it will perform some extra setup and install some dependencies. It will ask you for confirmation before installing anything. diff --git a/src/tools/miri/ci/ci.sh b/src/tools/miri/ci/ci.sh index 6bcc68ebf7ccd..9d2c3f362e6d1 100755 --- a/src/tools/miri/ci/ci.sh +++ b/src/tools/miri/ci/ci.sh @@ -121,8 +121,9 @@ case $HOST_TARGET in MIRI_TEST_TARGET=aarch64-apple-darwin run_tests MIRI_TEST_TARGET=i686-pc-windows-gnu run_tests # Some targets are only partially supported. 
- MIRI_TEST_TARGET=x86_64-unknown-freebsd run_tests_minimal hello integer vec panic/panic concurrency/simple pthread-threadname libc-getentropy libc-getrandom libc-misc libc-fs atomic env align - MIRI_TEST_TARGET=i686-unknown-freebsd run_tests_minimal hello integer vec panic/panic concurrency/simple pthread-threadname libc-getentropy libc-getrandom libc-misc libc-fs atomic env align + MIRI_TEST_TARGET=x86_64-unknown-freebsd run_tests_minimal hello integer vec panic/panic concurrency/simple pthread-threadname libc-getentropy libc-getrandom libc-misc libc-fs atomic env align num_cpus + MIRI_TEST_TARGET=i686-unknown-freebsd run_tests_minimal hello integer vec panic/panic concurrency/simple pthread-threadname libc-getentropy libc-getrandom libc-misc libc-fs atomic env align num_cpus + MIRI_TEST_TARGET=aarch64-linux-android run_tests_minimal hello integer vec panic/panic MIRI_TEST_TARGET=wasm32-wasi run_tests_minimal no_std integer strings wasm MIRI_TEST_TARGET=wasm32-unknown-unknown run_tests_minimal no_std integer strings wasm diff --git a/src/tools/miri/rust-version b/src/tools/miri/rust-version index 6624672775f9d..ab6f899cd3a5e 100644 --- a/src/tools/miri/rust-version +++ b/src/tools/miri/rust-version @@ -1 +1 @@ -dd2559e08e1530806740931037d6bb83ef956161 +4316d0c6252cb1f833e582dfa68adb98efd5ddfb diff --git a/src/tools/miri/src/bin/miri.rs b/src/tools/miri/src/bin/miri.rs index 095ba17367135..db4c4a28debb4 100644 --- a/src/tools/miri/src/bin/miri.rs +++ b/src/tools/miri/src/bin/miri.rs @@ -5,7 +5,7 @@ clippy::useless_format, clippy::field_reassign_with_default, rustc::diagnostic_outside_of_impl, - rustc::untranslatable_diagnostic, + rustc::untranslatable_diagnostic )] extern crate rustc_data_structures; @@ -16,14 +16,14 @@ extern crate rustc_log; extern crate rustc_metadata; extern crate rustc_middle; extern crate rustc_session; +#[macro_use] +extern crate tracing; use std::env::{self, VarError}; use std::num::NonZero; use std::path::PathBuf; use std::str::FromStr; -use log::debug; - use rustc_data_structures::sync::Lrc; use rustc_driver::Compilation; use rustc_hir::{self as hir, Node}; @@ -200,7 +200,7 @@ fn rustc_logger_config() -> rustc_log::LoggerConfig { // CTFE-related. Otherwise, we use it verbatim for `RUSTC_LOG`. // This way, if you set `MIRI_LOG=trace`, you get only the right parts of // rustc traced, but you can also do `MIRI_LOG=miri=trace,rustc_const_eval::interpret=debug`. - if log::Level::from_str(&var).is_ok() { + if tracing::Level::from_str(&var).is_ok() { cfg.filter = Ok(format!( "rustc_middle::mir::interpret={var},rustc_const_eval::interpret={var}" )); @@ -218,10 +218,6 @@ fn rustc_logger_config() -> rustc_log::LoggerConfig { } fn init_early_loggers(early_dcx: &EarlyDiagCtxt) { - // Note that our `extern crate log` is *not* the same as rustc's; as a result, we have to - // initialize them both, and we always initialize `miri`'s first. - let env = env_logger::Env::new().filter("MIRI_LOG").write_style("MIRI_LOG_STYLE"); - env_logger::init_from_env(env); // Now for rustc. We only initialize `rustc` if the env var is set (so the user asked for it). // If it is not set, we avoid initializing now so that we can initialize later with our custom // settings, and *not* log anything for what happens before `miri` gets started. 
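(Aside: the `MIRI_LOG` handling above hinges on `tracing::Level` accepting the same level names that `log::Level` did. A minimal standalone sketch, assuming a scratch crate with the `tracing` crate as a dependency, not Miri itself:)

```rust
use std::str::FromStr;

fn main() {
    // Plain level names parse as a `tracing::Level`, so `MIRI_LOG=trace`
    // still takes the "treat it as a CTFE-module filter" branch above.
    assert!(tracing::Level::from_str("trace").is_ok());
    // Module-path filters do not parse as a bare level, so a value like
    // `MIRI_LOG=miri=trace` is passed through verbatim instead.
    assert!(tracing::Level::from_str("miri=trace").is_err());
}
```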
diff --git a/src/tools/miri/src/borrow_tracker/mod.rs b/src/tools/miri/src/borrow_tracker/mod.rs index 45240edea455d..711323b51c20d 100644 --- a/src/tools/miri/src/borrow_tracker/mod.rs +++ b/src/tools/miri/src/borrow_tracker/mod.rs @@ -2,7 +2,6 @@ use std::cell::RefCell; use std::fmt; use std::num::NonZero; -use log::trace; use smallvec::SmallVec; use rustc_data_structures::fx::{FxHashMap, FxHashSet}; diff --git a/src/tools/miri/src/borrow_tracker/stacked_borrows/mod.rs b/src/tools/miri/src/borrow_tracker/stacked_borrows/mod.rs index 7740d383ee3f2..0fe422180f76f 100644 --- a/src/tools/miri/src/borrow_tracker/stacked_borrows/mod.rs +++ b/src/tools/miri/src/borrow_tracker/stacked_borrows/mod.rs @@ -9,8 +9,6 @@ use std::cmp; use std::fmt::Write; use std::mem; -use log::trace; - use rustc_data_structures::fx::FxHashSet; use rustc_middle::mir::{Mutability, RetagKind}; use rustc_middle::ty::{self, layout::HasParamEnv, Ty}; diff --git a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs index 291807c25eeb7..712c26a9afd7b 100644 --- a/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs +++ b/src/tools/miri/src/borrow_tracker/stacked_borrows/stack.rs @@ -385,7 +385,7 @@ impl<'tcx> Stack { let upper = unique_range.end; for item in &mut self.borrows[lower..upper] { if item.perm() == Permission::Unique { - log::trace!("access: disabling item {:?}", item); + trace!("access: disabling item {:?}", item); visitor(*item)?; item.set_permission(Permission::Disabled); // Also update all copies of this item in the cache. diff --git a/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs b/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs index 0945a5292bb92..cc98286534107 100644 --- a/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs +++ b/src/tools/miri/src/borrow_tracker/tree_borrows/mod.rs @@ -1,5 +1,3 @@ -use log::trace; - use rustc_target::abi::{Abi, Size}; use crate::borrow_tracker::{AccessKind, GlobalState, GlobalStateInner, ProtectorKind}; diff --git a/src/tools/miri/src/concurrency/data_race.rs b/src/tools/miri/src/concurrency/data_race.rs index 80d0402fc8758..a280448ae05d2 100644 --- a/src/tools/miri/src/concurrency/data_race.rs +++ b/src/tools/miri/src/concurrency/data_race.rs @@ -466,7 +466,7 @@ impl MemoryCellClocks { index: VectorIdx, access_size: Size, ) -> Result<(), DataRace> { - log::trace!("Atomic read with vectors: {:#?} :: {:#?}", self, thread_clocks); + trace!("Atomic read with vectors: {:#?} :: {:#?}", self, thread_clocks); let atomic = self.atomic_access(thread_clocks, access_size)?; atomic.read_vector.set_at_index(&thread_clocks.clock, index); // Make sure the last non-atomic write and all non-atomic reads were before this access. @@ -485,7 +485,7 @@ impl MemoryCellClocks { index: VectorIdx, access_size: Size, ) -> Result<(), DataRace> { - log::trace!("Atomic write with vectors: {:#?} :: {:#?}", self, thread_clocks); + trace!("Atomic write with vectors: {:#?} :: {:#?}", self, thread_clocks); let atomic = self.atomic_access(thread_clocks, access_size)?; atomic.write_vector.set_at_index(&thread_clocks.clock, index); // Make sure the last non-atomic write and all non-atomic reads were before this access. 
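(Aside: the data-race functions above compare vector clocks. A toy model, my own simplification and not Miri's `VClock` type, of the ordering check they perform:)

```rust
// Toy model (not Miri's actual code): an access is race-free iff every
// previously recorded access is ordered before the current thread's clock,
// i.e. no component of the old clock exceeds the corresponding component.
fn happens_before(prev: &[u64], current: &[u64]) -> bool {
    prev.iter().zip(current).all(|(p, c)| p <= c)
}

fn main() {
    assert!(happens_before(&[1, 0], &[2, 0])); // ordered: no race
    assert!(!happens_before(&[0, 3], &[2, 0])); // concurrent: race reported
}
```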
@@ -504,7 +504,7 @@ impl MemoryCellClocks { index: VectorIdx, current_span: Span, ) -> Result<(), DataRace> { - log::trace!("Unsynchronized read with vectors: {:#?} :: {:#?}", self, thread_clocks); + trace!("Unsynchronized read with vectors: {:#?} :: {:#?}", self, thread_clocks); if !current_span.is_dummy() { thread_clocks.clock[index].span = current_span; } @@ -533,7 +533,7 @@ impl MemoryCellClocks { write_type: NaWriteType, current_span: Span, ) -> Result<(), DataRace> { - log::trace!("Unsynchronized write with vectors: {:#?} :: {:#?}", self, thread_clocks); + trace!("Unsynchronized write with vectors: {:#?} :: {:#?}", self, thread_clocks); if !current_span.is_dummy() { thread_clocks.clock[index].span = current_span; } @@ -743,7 +743,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: MiriInterpCxExt<'mir, 'tcx> { &this.machine.threads, current_span, |index, mut clocks| { - log::trace!("Atomic fence on {:?} with ordering {:?}", index, atomic); + trace!("Atomic fence on {:?} with ordering {:?}", index, atomic); // Apply data-race detection for the current fences // this treats AcqRel and SeqCst as the same as an acquire @@ -841,7 +841,7 @@ impl VClockAlloc { // Find an index, if one exists where the value // in `l` is greater than the value in `r`. fn find_gt_index(l: &VClock, r: &VClock) -> Option<VectorIdx> { - log::trace!("Find index where not {:?} <= {:?}", l, r); + trace!("Find index where not {:?} <= {:?}", l, r); let l_slice = l.as_slice(); let r_slice = r.as_slice(); l_slice @@ -1270,7 +1270,7 @@ trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriInterpCxExt<'mir, 'tcx> { // Load and log the atomic operation. // Note that atomic loads are possible even from read-only allocations, so `get_alloc_extra_mut` is not an option. let alloc_meta = this.get_alloc_extra(alloc_id)?.data_race.as_ref().unwrap(); - log::trace!( + trace!( "Atomic op({}) with ordering {:?} on {:?} (size={})", access.description(), &atomic, @@ -1311,11 +1311,11 @@ trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriInterpCxExt<'mir, 'tcx> { )?; // Log changes to atomic memory. - if log::log_enabled!(log::Level::Trace) { + if tracing::enabled!(tracing::Level::TRACE) { for (_offset, mem_clocks) in alloc_meta.alloc_ranges.borrow().iter(base_offset, size) { - log::trace!( + trace!( "Updated atomic memory({:?}, size={}) to {:#?}", place.ptr(), size.bytes(), @@ -1530,7 +1530,7 @@ impl GlobalState { vector_info.push(thread) }; - log::trace!("Creating thread = {:?} with vector index = {:?}", thread, created_index); + trace!("Creating thread = {:?} with vector index = {:?}", thread, created_index); // Mark the chosen vector index as in use by the thread. thread_info[thread].vector_index = Some(created_index); diff --git a/src/tools/miri/src/concurrency/sync.rs b/src/tools/miri/src/concurrency/sync.rs index b948ecb834539..956a02ded0f13 100644 --- a/src/tools/miri/src/concurrency/sync.rs +++ b/src/tools/miri/src/concurrency/sync.rs @@ -1,8 +1,6 @@ use std::collections::{hash_map::Entry, VecDeque}; use std::ops::Not; -use log::trace; - use rustc_data_structures::fx::FxHashMap; use rustc_index::{Idx, IndexVec}; use rustc_middle::ty::layout::TyAndLayout; @@ -71,7 +69,7 @@ struct Mutex { lock_count: usize, /// The queue of threads waiting for this mutex. queue: VecDeque<ThreadId>, - /// Data race handle, this tracks the happens-before + /// Data race handle. This tracks the happens-before /// relationship between each mutex access.
It is /// released to during unlock and acquired from during /// locking, and therefore stores the clock of the last @@ -93,7 +91,7 @@ struct RwLock { writer_queue: VecDeque<ThreadId>, /// The queue of reader threads waiting for this lock. reader_queue: VecDeque<ThreadId>, - /// Data race handle for writers, tracks the happens-before + /// Data race handle for writers. Tracks the happens-before /// ordering between each write access to a rwlock and is updated /// after a sequence of concurrent readers to track the happens- /// before ordering between the set of previous readers and @@ -102,7 +100,7 @@ struct RwLock { /// lock or the joined clock of the set of last threads to release /// shared reader locks. data_race: VClock, - /// Data race handle for readers, this is temporary storage + /// Data race handle for readers. This is temporary storage /// for the combined happens-before ordering between all /// concurrent readers and the next writer, and the value /// is stored to the main data_race variable once all @@ -111,6 +109,7 @@ /// must load the clock of the last write and must not /// add happens-before orderings between shared reader /// locks. + /// This is only relevant when there is an active reader. data_race_reader: VClock, } @@ -486,6 +485,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { Entry::Vacant(_) => return false, // we did not even own this lock } if let Some(data_race) = &this.machine.data_race { + // Add this to the shared-release clock of all concurrent readers. data_race.validate_lock_release_shared( &mut rwlock.data_race_reader, reader, @@ -540,20 +540,13 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } rwlock.writer = None; trace!("rwlock_writer_unlock: {:?} unlocked by {:?}", id, expected_writer); - // Release memory to both reader and writer vector clocks - // since this writer happens-before both the union of readers once they are finished - // and the next writer + // Release memory to the next lock holder. if let Some(data_race) = &this.machine.data_race { data_race.validate_lock_release( &mut rwlock.data_race, current_writer, current_span, ); - data_race.validate_lock_release( - &mut rwlock.data_race_reader, - current_writer, - current_span, - ); } // The thread was a writer.
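(Aside: the `data_race` / `data_race_reader` clocks above follow a release/acquire discipline: unlocking joins the holder's clock into the lock's clock, and the next holder joins the lock's clock into its own. A toy illustration, again not Miri's `VClock`:)

```rust
// Toy vector clock: `join` takes the component-wise maximum.
#[derive(Clone, Default, Debug)]
struct VClock(Vec<u64>);

impl VClock {
    fn join(&mut self, other: &VClock) {
        if self.0.len() < other.0.len() {
            self.0.resize(other.0.len(), 0);
        }
        for (i, &t) in other.0.iter().enumerate() {
            self.0[i] = self.0[i].max(t);
        }
    }
}

fn main() {
    let mut lock_clock = VClock::default(); // the lock's `data_race` clock
    let writer = VClock(vec![3, 0]); // clock of the releasing writer
    lock_clock.join(&writer); // release: writer -> lock
    let mut reader = VClock(vec![0, 5]); // clock of the acquiring reader
    reader.join(&lock_clock); // acquire: lock -> reader
    assert_eq!(reader.0, vec![3, 5]); // reader now happens-after the writer
}
```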
// diff --git a/src/tools/miri/src/concurrency/thread.rs b/src/tools/miri/src/concurrency/thread.rs index 754cfa4d2a827..64e1f3c5b5574 100644 --- a/src/tools/miri/src/concurrency/thread.rs +++ b/src/tools/miri/src/concurrency/thread.rs @@ -8,7 +8,6 @@ use std::task::Poll; use std::time::{Duration, SystemTime}; use either::Either; -use log::trace; use rustc_data_structures::fx::FxHashMap; use rustc_hir::def_id::DefId; diff --git a/src/tools/miri/src/diagnostics.rs b/src/tools/miri/src/diagnostics.rs index 19b29a4181984..d47f446716b7d 100644 --- a/src/tools/miri/src/diagnostics.rs +++ b/src/tools/miri/src/diagnostics.rs @@ -1,8 +1,6 @@ use std::fmt::{self, Write}; use std::num::NonZero; -use log::trace; - use rustc_errors::{DiagnosticBuilder, DiagnosticMessage, Level}; use rustc_span::{SpanData, Symbol, DUMMY_SP}; use rustc_target::abi::{Align, Size}; @@ -102,10 +100,7 @@ impl MachineStopType for TerminationInfo { } fn add_args( self: Box<Self>, - _: &mut dyn FnMut( - std::borrow::Cow<'static, str>, - rustc_errors::DiagnosticArgValue, - ), + _: &mut dyn FnMut(std::borrow::Cow<'static, str>, rustc_errors::DiagnosticArgValue), ) { } } @@ -290,7 +285,10 @@ pub fn report_error<'tcx, 'mir>( ) => { ecx.handle_ice(); // print interpreter backtrace - bug!("This validation error should be impossible in Miri: {}", format_interp_error(ecx.tcx.dcx(), e)); + bug!( + "This validation error should be impossible in Miri: {}", + format_interp_error(ecx.tcx.dcx(), e) + ); } UndefinedBehavior(_) => "Undefined Behavior", ResourceExhaustion(_) => "resource exhaustion", @@ -304,7 +302,10 @@ pub fn report_error<'tcx, 'mir>( ) => "post-monomorphization error", _ => { ecx.handle_ice(); // print interpreter backtrace - bug!("This error should be impossible in Miri: {}", format_interp_error(ecx.tcx.dcx(), e)); + bug!( + "This error should be impossible in Miri: {}", + format_interp_error(ecx.tcx.dcx(), e) + ); } }; #[rustfmt::skip] diff --git a/src/tools/miri/src/eval.rs b/src/tools/miri/src/eval.rs index 6095b8842eb60..9bab9488e378e 100644 --- a/src/tools/miri/src/eval.rs +++ b/src/tools/miri/src/eval.rs @@ -7,9 +7,6 @@ use std::path::PathBuf; use std::task::Poll; use std::thread; -use log::info; -use rustc_middle::ty::Ty; - use crate::concurrency::thread::TlsAllocAction; use crate::diagnostics::report_leaks; use rustc_data_structures::fx::FxHashSet; @@ -18,7 +15,7 @@ use rustc_hir::def_id::DefId; use rustc_middle::ty::{ self, layout::{LayoutCx, LayoutOf}, - TyCtxt, + Ty, TyCtxt, }; use rustc_target::spec::abi::Abi; diff --git a/src/tools/miri/src/helpers.rs b/src/tools/miri/src/helpers.rs index 3cee4df588542..d9b4363d604b4 100644 --- a/src/tools/miri/src/helpers.rs +++ b/src/tools/miri/src/helpers.rs @@ -3,8 +3,6 @@ use std::iter; use std::num::NonZero; use std::time::Duration; -use log::trace; - use rustc_apfloat::ieee::{Double, Single}; use rustc_apfloat::Float; use rustc_hir::def::{DefKind, Namespace}; diff --git a/src/tools/miri/src/intptrcast.rs b/src/tools/miri/src/intptrcast.rs index 68c9a7660ebda..3fe127f973269 100644 --- a/src/tools/miri/src/intptrcast.rs +++ b/src/tools/miri/src/intptrcast.rs @@ -2,7 +2,6 @@ use std::cell::RefCell; use std::cmp::max; use std::collections::hash_map::Entry; -use log::trace; use rand::Rng; use rustc_data_structures::fx::{FxHashMap, FxHashSet}; diff --git a/src/tools/miri/src/lib.rs b/src/tools/miri/src/lib.rs index 305c71fb0f945..c567949102f63 100644 --- a/src/tools/miri/src/lib.rs +++ b/src/tools/miri/src/lib.rs @@ -63,6 +63,8 @@ extern crate rustc_middle; extern crate
rustc_session; extern crate rustc_span; extern crate rustc_target; +#[macro_use] +extern crate tracing; // Necessary to pull in object code as the rest of the rustc crates are shipped only as rmeta // files. diff --git a/src/tools/miri/src/operator.rs b/src/tools/miri/src/operator.rs index 6f19dead2e9bf..d99be39177bcf 100644 --- a/src/tools/miri/src/operator.rs +++ b/src/tools/miri/src/operator.rs @@ -1,7 +1,5 @@ use std::iter; -use log::trace; - use rand::{seq::IteratorRandom, Rng}; use rustc_apfloat::{Float, FloatConvert}; use rustc_middle::mir; diff --git a/src/tools/miri/src/shims/foreign_items.rs b/src/tools/miri/src/shims/foreign_items.rs index bf90d1468bb0e..0645c1f176ef7 100644 --- a/src/tools/miri/src/shims/foreign_items.rs +++ b/src/tools/miri/src/shims/foreign_items.rs @@ -1,7 +1,5 @@ use std::{collections::hash_map::Entry, io::Write, iter, path::Path}; -use log::trace; - use rustc_apfloat::Float; use rustc_ast::expand::allocator::AllocatorKind; use rustc_hir::{ diff --git a/src/tools/miri/src/shims/intrinsics/mod.rs b/src/tools/miri/src/shims/intrinsics/mod.rs index df2761bfaf425..602e8b31b0103 100644 --- a/src/tools/miri/src/shims/intrinsics/mod.rs +++ b/src/tools/miri/src/shims/intrinsics/mod.rs @@ -3,8 +3,6 @@ mod simd; use std::iter; -use log::trace; - use rand::Rng; use rustc_apfloat::{Float, Round}; use rustc_middle::ty::layout::LayoutOf; diff --git a/src/tools/miri/src/shims/panic.rs b/src/tools/miri/src/shims/panic.rs index 28652c25c2450..4c054d8dc8aea 100644 --- a/src/tools/miri/src/shims/panic.rs +++ b/src/tools/miri/src/shims/panic.rs @@ -11,8 +11,6 @@ //! gets popped *during unwinding*, we take the panic payload and store it according to the extra //! metadata we remembered when pushing said frame. -use log::trace; - use rustc_ast::Mutability; use rustc_middle::{mir, ty}; use rustc_span::Symbol; diff --git a/src/tools/miri/src/shims/tls.rs b/src/tools/miri/src/shims/tls.rs index b319516c25b9e..84c1feb88e959 100644 --- a/src/tools/miri/src/shims/tls.rs +++ b/src/tools/miri/src/shims/tls.rs @@ -4,8 +4,6 @@ use std::collections::btree_map::Entry as BTreeEntry; use std::collections::BTreeMap; use std::task::Poll; -use log::trace; - use rustc_middle::ty; use rustc_target::abi::{HasDataLayout, Size}; use rustc_target::spec::abi::Abi; diff --git a/src/tools/miri/src/shims/unix/foreign_items.rs b/src/tools/miri/src/shims/unix/foreign_items.rs index 35036ce078d37..b5cd18396a286 100644 --- a/src/tools/miri/src/shims/unix/foreign_items.rs +++ b/src/tools/miri/src/shims/unix/foreign_items.rs @@ -1,8 +1,6 @@ use std::ffi::OsStr; use std::str; -use log::trace; - use rustc_middle::ty::layout::LayoutOf; use rustc_span::Symbol; use rustc_target::abi::{Align, Size}; @@ -262,6 +260,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { "mmap" => { let [addr, length, prot, flags, fd, offset] = this.check_shim(abi, Abi::C {unwind: false}, link_name, args)?; + let offset = this.read_scalar(offset)?.to_int(this.libc_ty_layout("off_t").size)?; let ptr = this.mmap(addr, length, prot, flags, fd, offset)?; this.write_scalar(ptr, dest)?; } @@ -711,6 +710,25 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { } } + "sched_getaffinity" => { + // FreeBSD supports it as well since 13.1 (as a wrapper of cpuset_getaffinity) + if !matches!(&*this.tcx.sess.target.os, "linux" | "freebsd") { + throw_unsup_format!( + "`sched_getaffinity` is not supported on {}", + this.tcx.sess.target.os + ); + } + let [pid, cpusetsize, mask] = + 
this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + this.read_scalar(pid)?.to_i32()?; + this.read_target_usize(cpusetsize)?; + this.deref_pointer_as(mask, this.libc_ty_layout("cpu_set_t"))?; + // FIXME: we just return an error; `num_cpus` then falls back to `sysconf`. + let einval = this.eval_libc("EINVAL"); + this.set_last_error(einval)?; + this.write_scalar(Scalar::from_i32(-1), dest)?; + } + // Platform-specific shims _ => { let target_os = &*this.tcx.sess.target.os; diff --git a/src/tools/miri/src/shims/unix/fs.rs b/src/tools/miri/src/shims/unix/fs.rs index 53f975baa89a5..b141ca4a019c3 100644 --- a/src/tools/miri/src/shims/unix/fs.rs +++ b/src/tools/miri/src/shims/unix/fs.rs @@ -8,8 +8,6 @@ use std::io::{self, ErrorKind, IsTerminal, Read, Seek, SeekFrom, Write}; use std::path::{Path, PathBuf}; use std::time::SystemTime; -use log::trace; - use rustc_data_structures::fx::FxHashMap; use rustc_middle::ty::TyCtxt; use rustc_target::abi::Size; diff --git a/src/tools/miri/src/shims/unix/linux/foreign_items.rs b/src/tools/miri/src/shims/unix/linux/foreign_items.rs index 6937e0f089ec8..b9215129674b0 100644 --- a/src/tools/miri/src/shims/unix/linux/foreign_items.rs +++ b/src/tools/miri/src/shims/unix/linux/foreign_items.rs @@ -9,6 +9,7 @@ use shims::unix::fs::EvalContextExt as _; use shims::unix::linux::fd::EvalContextExt as _; use shims::unix::linux::mem::EvalContextExt as _; use shims::unix::linux::sync::futex; +use shims::unix::mem::EvalContextExt as _; use shims::unix::sync::EvalContextExt as _; use shims::unix::thread::EvalContextExt as _; @@ -43,6 +44,14 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let result = this.linux_readdir64(dirp)?; this.write_scalar(result, dest)?; } + "mmap64" => { + let [addr, length, prot, flags, fd, offset] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let offset = this.read_scalar(offset)?.to_i64()?; + let ptr = this.mmap(addr, length, prot, flags, fd, offset.into())?; + this.write_scalar(ptr, dest)?; + } + // Linux-only "sync_file_range" => { let [fd, offset, nbytes, flags] = @@ -197,17 +206,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; getrandom(this, ptr, len, flags, dest)?; } - "sched_getaffinity" => { - let [pid, cpusetsize, mask] = - this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; - this.read_scalar(pid)?.to_i32()?; - this.read_target_usize(cpusetsize)?; - this.deref_pointer_as(mask, this.libc_ty_layout("cpu_set_t"))?; - // FIXME: we just return an error; `num_cpus` then falls back to `sysconf`. - let einval = this.eval_libc("EINVAL"); - this.set_last_error(einval)?; - this.write_scalar(Scalar::from_i32(-1), dest)?; - } // Incomplete shims that we "stub out" just to get pre-main initialization code to work. // These shims are enabled only when the caller is in the standard library. 
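(Aside: the `sched_getaffinity` shim above deliberately fails with `EINVAL` so that callers take their fallback path. A sketch of that caller pattern, roughly what `num_cpus` does; shown for Linux only, since I am not certain the `libc` crate exposes this call on FreeBSD:)

```rust
#[cfg(target_os = "linux")]
fn cpu_count() -> usize {
    unsafe {
        let mut set: libc::cpu_set_t = std::mem::zeroed();
        if libc::sched_getaffinity(0, std::mem::size_of::<libc::cpu_set_t>(), &mut set) == 0 {
            return libc::CPU_COUNT(&set) as usize;
        }
        // Under Miri the call returns -1 with errno EINVAL, so we land here.
        libc::sysconf(libc::_SC_NPROCESSORS_ONLN) as usize
    }
}

#[cfg(target_os = "linux")]
fn main() {
    println!("cpus: {}", cpu_count());
}

#[cfg(not(target_os = "linux"))]
fn main() {}
```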
diff --git a/src/tools/miri/src/shims/unix/mem.rs b/src/tools/miri/src/shims/unix/mem.rs index d7dc17fa89f13..d3470893dbbca 100644 --- a/src/tools/miri/src/shims/unix/mem.rs +++ b/src/tools/miri/src/shims/unix/mem.rs @@ -26,7 +26,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { prot: &OpTy<'tcx, Provenance>, flags: &OpTy<'tcx, Provenance>, fd: &OpTy<'tcx, Provenance>, - offset: &OpTy<'tcx, Provenance>, + offset: i128, ) -> InterpResult<'tcx, Scalar> { let this = self.eval_context_mut(); @@ -36,7 +36,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> { let prot = this.read_scalar(prot)?.to_i32()?; let flags = this.read_scalar(flags)?.to_i32()?; let fd = this.read_scalar(fd)?.to_i32()?; - let offset = this.read_target_usize(offset)?; let map_private = this.eval_libc_i32("MAP_PRIVATE"); let map_anonymous = this.eval_libc_i32("MAP_ANONYMOUS"); diff --git a/src/tools/miri/src/shims/x86/avx.rs b/src/tools/miri/src/shims/x86/avx.rs new file mode 100644 index 0000000000000..65de1607595be --- /dev/null +++ b/src/tools/miri/src/shims/x86/avx.rs @@ -0,0 +1,417 @@ +use rustc_apfloat::{ieee::Double, ieee::Single}; +use rustc_middle::mir; +use rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::ty::Ty; +use rustc_span::Symbol; +use rustc_target::spec::abi::Abi; + +use super::{ + bin_op_simd_float_all, conditional_dot_product, convert_float_to_int, horizontal_bin_op, + round_all, test_bits_masked, test_high_bits_masked, unary_op_ps, FloatBinOp, FloatUnaryOp, +}; +use crate::*; +use shims::foreign_items::EmulateForeignItemResult; + +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for crate::MiriInterpCx<'mir, 'tcx> {} +pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: + crate::MiriInterpCxExt<'mir, 'tcx> +{ + fn emulate_x86_avx_intrinsic( + &mut self, + link_name: Symbol, + abi: Abi, + args: &[OpTy<'tcx, Provenance>], + dest: &PlaceTy<'tcx, Provenance>, + ) -> InterpResult<'tcx, EmulateForeignItemResult> { + let this = self.eval_context_mut(); + this.expect_target_feature_for_intrinsic(link_name, "avx")?; + // Prefix should have already been checked. + let unprefixed_name = link_name.as_str().strip_prefix("llvm.x86.avx.").unwrap(); + + match unprefixed_name { + // Used to implement _mm256_min_ps and _mm256_max_ps functions. + // Note that the semantics are a bit different from Rust simd_min + // and simd_max intrinsics regarding handling of NaN and -0.0: Rust + // matches the IEEE min/max operations, while x86 has different + // semantics. + "min.ps.256" | "max.ps.256" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "min.ps.256" => FloatBinOp::Min, + "max.ps.256" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_simd_float_all::<Single>(this, which, left, right, dest)?; + } + // Used to implement _mm256_min_pd and _mm256_max_pd functions. + "min.pd.256" | "max.pd.256" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "min.pd.256" => FloatBinOp::Min, + "max.pd.256" => FloatBinOp::Max, + _ => unreachable!(), + }; + + bin_op_simd_float_all::<Double>(this, which, left, right, dest)?; + } + // Used to implement the _mm256_round_ps function. + // Rounds the elements of `op` according to `rounding`.
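(Aside: the `rounding` immediate consumed by `round_all` below follows the SSE4.1/AVX `roundps` encoding. A scalar sketch of that decoding; the enum and function names are mine, not Miri's:)

```rust
// Encoding of the rounding immediate:
//   bit 3: suppress-exceptions flag (irrelevant for the result)
//   bit 2: if set, use the current MXCSR rounding mode
//   bits 0..=1: 00 nearest-even, 01 toward -inf, 10 toward +inf, 11 truncate
#[derive(Debug, PartialEq)]
enum Round {
    NearestTiesToEven,
    TowardNegative,
    TowardPositive,
    TowardZero,
}

fn decode_rounding(imm: i32) -> Round {
    if imm & 0b100 != 0 {
        // "Current SSE rounding mode"; the interpreter assumes the default.
        return Round::NearestTiesToEven;
    }
    match imm & 0b11 {
        0b00 => Round::NearestTiesToEven,
        0b01 => Round::TowardNegative,
        0b10 => Round::TowardPositive,
        _ => Round::TowardZero,
    }
}

fn main() {
    // Truncation mode combined with the suppress-exceptions bit.
    assert_eq!(decode_rounding(0b1011), Round::TowardZero);
}
```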
+ "round.ps.256" => { + let [op, rounding] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + round_all::(this, op, rounding, dest)?; + } + // Used to implement the _mm256_round_pd function. + // Rounds the elements of `op` according to `rounding`. + "round.pd.256" => { + let [op, rounding] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + round_all::(this, op, rounding, dest)?; + } + // Used to implement _mm256_{sqrt,rcp,rsqrt}_ps functions. + // Performs the operations on all components of `op`. + "sqrt.ps.256" | "rcp.ps.256" | "rsqrt.ps.256" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "sqrt.ps.256" => FloatUnaryOp::Sqrt, + "rcp.ps.256" => FloatUnaryOp::Rcp, + "rsqrt.ps.256" => FloatUnaryOp::Rsqrt, + _ => unreachable!(), + }; + + unary_op_ps(this, which, op, dest)?; + } + // Used to implement the _mm256_dp_ps function. + "dp.ps.256" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + conditional_dot_product(this, left, right, imm, dest)?; + } + // Used to implement the _mm256_h{add,sub}_p{s,d} functions. + // Horizontally add/subtract adjacent floating point values + // in `left` and `right`. + "hadd.ps.256" | "hadd.pd.256" | "hsub.ps.256" | "hsub.pd.256" => { + let [left, right] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = match unprefixed_name { + "hadd.ps.256" | "hadd.pd.256" => mir::BinOp::Add, + "hsub.ps.256" | "hsub.pd.256" => mir::BinOp::Sub, + _ => unreachable!(), + }; + + horizontal_bin_op(this, which, /*saturating*/ false, left, right, dest)?; + } + // Used to implement the _mm256_cmp_ps function. + // Performs a comparison operation on each component of `left` + // and `right`. For each component, returns 0 if false or u32::MAX + // if true. + "cmp.ps.256" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = + FloatBinOp::cmp_from_imm(this, this.read_scalar(imm)?.to_i8()?, link_name)?; + + bin_op_simd_float_all::(this, which, left, right, dest)?; + } + // Used to implement the _mm256_cmp_pd function. + // Performs a comparison operation on each component of `left` + // and `right`. For each component, returns 0 if false or u64::MAX + // if true. + "cmp.pd.256" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let which = + FloatBinOp::cmp_from_imm(this, this.read_scalar(imm)?.to_i8()?, link_name)?; + + bin_op_simd_float_all::(this, which, left, right, dest)?; + } + // Used to implement the _mm256_cvtps_epi32, _mm256_cvttps_epi32, _mm256_cvtpd_epi32 + // and _mm256_cvttpd_epi32 functions. + // Converts packed f32/f64 to packed i32. + "cvt.ps2dq.256" | "cvtt.ps2dq.256" | "cvt.pd2dq.256" | "cvtt.pd2dq.256" => { + let [op] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let rnd = match unprefixed_name { + // "current SSE rounding mode", assume nearest + "cvt.ps2dq.256" | "cvt.pd2dq.256" => rustc_apfloat::Round::NearestTiesToEven, + // always truncate + "cvtt.ps2dq.256" | "cvtt.pd2dq.256" => rustc_apfloat::Round::TowardZero, + _ => unreachable!(), + }; + + convert_float_to_int(this, op, rnd, dest)?; + } + // Used to implement the _mm_permutevar_ps and _mm256_permutevar_ps functions. + // Shuffles 32-bit floats from `data` using `control` as control. 
Each 128-bit + // chunk is shuffled independently: this means that we view the vector as a + // sequence of 4-element arrays, and we shuffle each of these arrays, where + // `control` determines which element of the current `data` array is written. + "vpermilvar.ps" | "vpermilvar.ps.256" => { + let [data, control] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (data, data_len) = this.operand_to_simd(data)?; + let (control, control_len) = this.operand_to_simd(control)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, data_len); + assert_eq!(dest_len, control_len); + + for i in 0..dest_len { + let control = this.project_index(&control, i)?; + + // Each 128-bit chunk is shuffled independently. Since each chunk contains + // four 32-bit elements, only two bits from `control` are used. To read the + // value from the current chunk, add the destination index truncated to a multiple + // of 4. + let chunk_base = i & !0b11; + let src_i = u64::from(this.read_scalar(&control)?.to_u32()? & 0b11) + .checked_add(chunk_base) + .unwrap(); + + this.copy_op( + &this.project_index(&data, src_i)?, + &this.project_index(&dest, i)?, + )?; + } + } + // Used to implement the _mm_permutevar_pd and _mm256_permutevar_pd functions. + // Shuffles 64-bit floats from `data` using `control` as control. Each 128-bit + // chunk is shuffled independently: this means that we view the vector as + // a sequence of 2-element arrays, and we shuffle each of these arrays, + // where `control` determines which element of the current `data` array is + // written. + "vpermilvar.pd" | "vpermilvar.pd.256" => { + let [data, control] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (data, data_len) = this.operand_to_simd(data)?; + let (control, control_len) = this.operand_to_simd(control)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, data_len); + assert_eq!(dest_len, control_len); + + for i in 0..dest_len { + let control = this.project_index(&control, i)?; + + // Each 128-bit chunk is shuffled independently. Since each chunk contains + // two 64-bit elements, only the second bit from `control` is used (yes, the + // second instead of the first, ask Intel). To read the value from the current + // chunk, add the destination index truncated to a multiple of 2. + let chunk_base = i & !1; + let src_i = ((this.read_scalar(&control)?.to_u64()? >> 1) & 1) + .checked_add(chunk_base) + .unwrap(); + + this.copy_op( + &this.project_index(&data, src_i)?, + &this.project_index(&dest, i)?, + )?; + } + } + // Used to implement the _mm256_permute2f128_ps, _mm256_permute2f128_pd and + // _mm256_permute2f128_si256 functions. Regardless of the suffix in the name + // they can all be considered to operate on vectors of 128-bit elements. + // For each 128-bit element of `dest`, copies one from `left`, `right` or + // zero, according to `imm`. + "vperm2f128.ps.256" | "vperm2f128.pd.256" | "vperm2f128.si.256" => { + let [left, right, imm] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + assert_eq!(dest.layout, left.layout); + assert_eq!(dest.layout, right.layout); + assert_eq!(dest.layout.size.bits(), 256); + + // Transmute to `[u128; 2]` to process each 128-bit chunk independently.
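(Aside: an executable model of the per-chunk shuffle implemented by the `vpermilvar.ps` arm above, using plain arrays instead of interpreter operands; the names are mine:)

```rust
// Each 128-bit lane of 4 floats is shuffled independently, using the low
// 2 bits of the corresponding control element as the in-lane source index.
fn permutevar_ps(data: [f32; 8], control: [u32; 8]) -> [f32; 8] {
    let mut out = [0.0; 8];
    for i in 0..8 {
        let lane_base = i & !0b11; // first element of the current 128-bit lane
        out[i] = data[lane_base + (control[i] & 0b11) as usize];
    }
    out
}

fn main() {
    let data = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
    // Reverse the elements within each lane.
    let rev = permutevar_ps(data, [3, 2, 1, 0, 3, 2, 1, 0]);
    assert_eq!(rev, [3.0, 2.0, 1.0, 0.0, 7.0, 6.0, 5.0, 4.0]);
}
```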
+ let u128x2_layout = + this.layout_of(Ty::new_array(this.tcx.tcx, this.tcx.types.u128, 2))?; + let left = left.transmute(u128x2_layout, this)?; + let right = right.transmute(u128x2_layout, this)?; + let dest = dest.transmute(u128x2_layout, this)?; + + let imm = this.read_scalar(imm)?.to_u8()?; + + for i in 0..2 { + let dest = this.project_index(&dest, i)?; + + let imm = match i { + 0 => imm & 0xF, + 1 => imm >> 4, + _ => unreachable!(), + }; + if imm & 0b100 != 0 { + this.write_scalar(Scalar::from_u128(0), &dest)?; + } else { + let src = match imm { + 0b00 => this.project_index(&left, 0)?, + 0b01 => this.project_index(&left, 1)?, + 0b10 => this.project_index(&right, 0)?, + 0b11 => this.project_index(&right, 1)?, + _ => unreachable!(), + }; + this.copy_op(&src, &dest)?; + } + } + } + // Used to implement the _mm_maskload_ps, _mm_maskload_pd, _mm256_maskload_ps + // and _mm256_maskload_pd functions. + // For the element `i`, if the high bit of the `i`-th element of `mask` + // is one, it is loaded from `ptr.wrapping_add(i)`, otherwise zero is + // loaded. + "maskload.ps" | "maskload.pd" | "maskload.ps.256" | "maskload.pd.256" => { + let [ptr, mask] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_load(this, ptr, mask, dest)?; + } + // Used to implement the _mm_maskstore_ps, _mm_maskstore_pd, _mm256_maskstore_ps + // and _mm256_maskstore_pd functions. + // For the element `i`, if the high bit of the `i`-th element of `mask` + // is one, it is stored into `ptr.wrapping_add(i)`. + // Unlike SSE2's _mm_maskmoveu_si128, these are not non-temporal stores. + "maskstore.ps" | "maskstore.pd" | "maskstore.ps.256" | "maskstore.pd.256" => { + let [ptr, mask, value] = + this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + mask_store(this, ptr, mask, value)?; + } + // Used to implement the _mm256_lddqu_si256 function. + // Reads a 256-bit vector from an unaligned pointer. This intrinsic + // is expected to perform better than a regular unaligned read when + // the data crosses a cache line, but for Miri this is just a regular + // unaligned read. + "ldu.dq.256" => { + let [src_ptr] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + let src_ptr = this.read_pointer(src_ptr)?; + let dest = dest.force_mplace(this)?; + + // Unaligned copy, which is what we want. + this.mem_copy(src_ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; + } + // Used to implement the _mm256_testz_si256, _mm256_testc_si256 and + // _mm256_testnzc_si256 functions. + // Tests `op & mask == 0`, `op & mask == mask` or + // `op & mask != 0 && op & mask != mask` + "ptestz.256" | "ptestc.256" | "ptestnzc.256" => { + let [op, mask] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (all_zero, masked_set) = test_bits_masked(this, op, mask)?; + let res = match unprefixed_name { + "ptestz.256" => all_zero, + "ptestc.256" => masked_set, + "ptestnzc.256" => !all_zero && !masked_set, + _ => unreachable!(), + }; + + this.write_scalar(Scalar::from_i32(res.into()), dest)?; + } + // Used to implement the _mm256_testz_pd, _mm256_testc_pd, _mm256_testnzc_pd + // _mm_testz_pd, _mm_testc_pd, _mm_testnzc_pd, _mm256_testz_ps, + // _mm256_testc_ps, _mm256_testnzc_ps, _mm_testz_ps, _mm_testc_ps and + // _mm_testnzc_ps functions. + // Calculates two booleans: + // `direct`, which is true when the highest bit of each element of `op & mask` is zero. + // `negated`, which is true when the highest bit of each element of `!op & mask` is zero.
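(Aside: a scalar model of the `vperm2f128` selection completed above; each half of the result is picked by one nibble of `imm`, where bit 2 forces zero and the low two bits select a source half. The names are mine:)

```rust
fn perm2f128(left: [u128; 2], right: [u128; 2], imm: u8) -> [u128; 2] {
    let mut out = [0u128; 2];
    for i in 0..2 {
        // Low nibble controls out[0], high nibble controls out[1].
        let sel = if i == 0 { imm & 0xF } else { imm >> 4 };
        out[i] = if sel & 0b100 != 0 {
            0 // the zeroing flag wins
        } else {
            match sel & 0b11 {
                0b00 => left[0],
                0b01 => left[1],
                0b10 => right[0],
                _ => right[1],
            }
        };
    }
    out
}

fn main() {
    // 0x21 is the classic swap pattern: high half of `left`, low half of `right`.
    assert_eq!(perm2f128([1, 2], [3, 4], 0x21), [2, 3]);
}
```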
+ // Return `direct` (testz), `negated` (testc) or `!direct & !negated` (testnzc) + "vtestz.pd.256" | "vtestc.pd.256" | "vtestnzc.pd.256" | "vtestz.pd" | "vtestc.pd" + | "vtestnzc.pd" | "vtestz.ps.256" | "vtestc.ps.256" | "vtestnzc.ps.256" + | "vtestz.ps" | "vtestc.ps" | "vtestnzc.ps" => { + let [op, mask] = this.check_shim(abi, Abi::C { unwind: false }, link_name, args)?; + + let (direct, negated) = test_high_bits_masked(this, op, mask)?; + let res = match unprefixed_name { + "vtestz.pd.256" | "vtestz.pd" | "vtestz.ps.256" | "vtestz.ps" => direct, + "vtestc.pd.256" | "vtestc.pd" | "vtestc.ps.256" | "vtestc.ps" => negated, + "vtestnzc.pd.256" | "vtestnzc.pd" | "vtestnzc.ps.256" | "vtestnzc.ps" => + !direct && !negated, + _ => unreachable!(), + }; + + this.write_scalar(Scalar::from_i32(res.into()), dest)?; + } + _ => return Ok(EmulateForeignItemResult::NotSupported), + } + Ok(EmulateForeignItemResult::NeedsJumping) + } +} + +/// Conditionally loads from `ptr` according to the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. +fn mask_load<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..dest_len { + let mask = this.project_index(&mask, i)?; + let dest = this.project_index(&dest, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(dest.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want. + this.mem_copy(ptr, dest.ptr(), dest.layout.size, /*nonoverlapping*/ true)?; + } else { + this.write_scalar(Scalar::from_int(0, dest.layout.size), &dest)?; + } + } + + Ok(()) +} + +/// Conditionally stores into `ptr` according to the high bit of each +/// element of `mask`. `ptr` does not need to be aligned. +fn mask_store<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + ptr: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, + value: &OpTy<'tcx, Provenance>, +) -> InterpResult<'tcx, ()> { + let (mask, mask_len) = this.operand_to_simd(mask)?; + let (value, value_len) = this.operand_to_simd(value)?; + + assert_eq!(value_len, mask_len); + + let mask_item_size = mask.layout.field(this, 0).size; + let high_bit_offset = mask_item_size.bits().checked_sub(1).unwrap(); + + let ptr = this.read_pointer(ptr)?; + for i in 0..value_len { + let mask = this.project_index(&mask, i)?; + let value = this.project_index(&value, i)?; + + if this.read_scalar(&mask)?.to_uint(mask_item_size)? >> high_bit_offset != 0 { + // Size * u64 is implemented as always checked + #[allow(clippy::arithmetic_side_effects)] + let ptr = ptr.wrapping_offset(value.layout.size * i, &this.tcx); + // Unaligned copy, which is what we want.
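(Aside: a plain-Rust model of the masked-store semantics that `mask_store` above implements, using `f32` lanes with an `i32` mask whose sign bit is the "high bit" being tested; the names are mine:)

```rust
fn maskstore_ps(dst: &mut [f32; 4], value: [f32; 4], mask: [i32; 4]) {
    for i in 0..4 {
        // An element is stored iff the high (sign) bit of its mask lane is
        // set; unselected destination lanes are left untouched.
        if mask[i] < 0 {
            dst[i] = value[i];
        }
    }
}

fn main() {
    let mut buf = [9.0; 4];
    maskstore_ps(&mut buf, [1.0, 2.0, 3.0, 4.0], [-1, 0, i32::MIN, 7]);
    assert_eq!(buf, [1.0, 9.0, 3.0, 9.0]);
}
```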
+ this.mem_copy(value.ptr(), ptr, value.layout.size, /*nonoverlapping*/ true)?; + } + } + + Ok(()) +} diff --git a/src/tools/miri/src/shims/x86/mod.rs b/src/tools/miri/src/shims/x86/mod.rs index b24ea8aec84bd..9cfee20014fc1 100644 --- a/src/tools/miri/src/shims/x86/mod.rs +++ b/src/tools/miri/src/shims/x86/mod.rs @@ -1,6 +1,8 @@ use rand::Rng as _; -use rustc_apfloat::{ieee::Single, Float as _}; +use rustc_apfloat::{ieee::Single, Float}; +use rustc_middle::ty::layout::LayoutOf as _; +use rustc_middle::ty::Ty; use rustc_middle::{mir, ty}; use rustc_span::Symbol; use rustc_target::abi::Size; @@ -11,6 +13,7 @@ use helpers::bool_to_simd_element; use shims::foreign_items::EmulateForeignItemResult; mod aesni; +mod avx; mod sse; mod sse2; mod sse3; @@ -115,6 +118,11 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this, link_name, abi, args, dest, ); } + name if name.starts_with("avx.") => { + return avx::EvalContextExt::emulate_x86_avx_intrinsic( + this, link_name, abi, args, dest, + ); + } _ => return Ok(EmulateForeignItemResult::NotSupported), } @@ -296,10 +304,7 @@ fn bin_op_simd_float_first<'tcx, F: rustc_apfloat::Float>( this.write_scalar(res0, &this.project_index(&dest, 0)?)?; for i in 1..dest_len { - this.copy_op( - &this.project_index(&left, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?; } Ok(()) @@ -420,10 +425,7 @@ fn unary_op_ss<'tcx>( this.write_scalar(res0, &this.project_index(&dest, 0)?)?; for i in 1..dest_len { - this.copy_op( - &this.project_index(&op, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&op, i)?, &this.project_index(&dest, i)?)?; } Ok(()) @@ -479,10 +481,7 @@ fn round_first<'tcx, F: rustc_apfloat::Float>( )?; for i in 1..dest_len { - this.copy_op( - &this.project_index(&left, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?; } Ok(()) @@ -572,8 +571,65 @@ fn convert_float_to_int<'tcx>( Ok(()) } +/// Splits `left`, `right` and `dest` (which must be SIMD vectors) +/// into 128-bit chunks. +/// +/// `left`, `right` and `dest` cannot have different types. +/// +/// Returns a tuple where: +/// * The first element is the number of 128-bit chunks (let's call it `N`). +/// * The second element is the number of elements per chunk (let's call it `M`). +/// * The third element is the `left` vector split into chunks, i.e., its +/// type is `[[T; M]; N]`. +/// * The fourth element is the `right` vector split into chunks. +/// * The fifth element is the `dest` vector split into chunks.
+fn split_simd_to_128bit_chunks<'tcx>( + this: &mut crate::MiriInterpCx<'_, 'tcx>, + left: &OpTy<'tcx, Provenance>, + right: &OpTy<'tcx, Provenance>, + dest: &PlaceTy<'tcx, Provenance>, +) -> InterpResult< + 'tcx, + (u64, u64, MPlaceTy<'tcx, Provenance>, MPlaceTy<'tcx, Provenance>, MPlaceTy<'tcx, Provenance>), +> { + assert_eq!(dest.layout, left.layout); + assert_eq!(dest.layout, right.layout); + + let (left, left_len) = this.operand_to_simd(left)?; + let (right, right_len) = this.operand_to_simd(right)?; + let (dest, dest_len) = this.place_to_simd(dest)?; + + assert_eq!(dest_len, left_len); + assert_eq!(dest_len, right_len); + + assert_eq!(dest.layout.size.bits() % 128, 0); + let num_chunks = dest.layout.size.bits() / 128; + assert_eq!(dest_len.checked_rem(num_chunks), Some(0)); + let items_per_chunk = dest_len.checked_div(num_chunks).unwrap(); + + // Transmute to `[[T; items_per_chunk]; num_chunks]` + let element_layout = left.layout.field(this, 0); + let chunked_layout = this.layout_of(Ty::new_array( + this.tcx.tcx, + Ty::new_array(this.tcx.tcx, element_layout.ty, items_per_chunk), + num_chunks, + ))?; + let left = left.transmute(chunked_layout, this)?; + let right = right.transmute(chunked_layout, this)?; + let dest = dest.transmute(chunked_layout, this)?; + + Ok((num_chunks, items_per_chunk, left, right, dest)) +} + /// Horizontally performs `which` operation on adjacent values of /// `left` and `right` SIMD vectors and stores the result in `dest`. +/// "Horizontal" means that the i-th output element is calculated +/// from the elements 2*i and 2*i+1 of the concatenation of `left` and +/// `right`. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the i-th 128-bit chunk of `dest` is calculated from the i-th +/// 128-bit chunks of `left` and `right`). fn horizontal_bin_op<'tcx>( this: &mut crate::MiriInterpCx<'_, 'tcx>, which: mir::BinOp, @@ -582,32 +638,34 @@ fn horizontal_bin_op<'tcx>( right: &OpTy<'tcx, Provenance>, dest: &PlaceTy<'tcx, Provenance>, ) -> InterpResult<'tcx, ()> { - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.place_to_simd(dest)?; + let (num_chunks, items_per_chunk, left, right, dest) = + split_simd_to_128bit_chunks(this, left, right, dest)?; - assert_eq!(dest_len, left_len); - assert_eq!(dest_len, right_len); - assert_eq!(dest_len % 2, 0); + let middle = items_per_chunk / 2; + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; - let middle = dest_len / 2; - for i in 0..dest_len { - // `i` is the index in `dest` - // `j` is the index of the 2-item chunk in `src` - let (j, src) = - if i < middle { (i, &left) } else { (i.checked_sub(middle).unwrap(), &right) }; - // `base_i` is the index of the first item of the 2-item chunk in `src` - let base_i = j.checked_mul(2).unwrap(); - let lhs = this.read_immediate(&this.project_index(src, base_i)?)?; - let rhs = this.read_immediate(&this.project_index(src, base_i.checked_add(1).unwrap())?)?; - - let res = if saturating { - Immediate::from(this.saturating_arith(which, &lhs, &rhs)?) - } else { - *this.wrapping_binary_op(which, &lhs, &rhs)?
- }; + for j in 0..items_per_chunk { + // `j` is the index in `dest` + // `k` is the index of the 2-item chunk in `src` + let (k, src) = + if j < middle { (j, &left) } else { (j.checked_sub(middle).unwrap(), &right) }; + // `base_i` is the index of the first item of the 2-item chunk in `src` + let base_i = k.checked_mul(2).unwrap(); + let lhs = this.read_immediate(&this.project_index(src, base_i)?)?; + let rhs = + this.read_immediate(&this.project_index(src, base_i.checked_add(1).unwrap())?)?; + + let res = if saturating { + Immediate::from(this.saturating_arith(which, &lhs, &rhs)?) + } else { + *this.wrapping_binary_op(which, &lhs, &rhs)? + }; - this.write_immediate(res, &this.project_index(&dest, i)?)?; + this.write_immediate(res, &this.project_index(&dest, j)?)?; + } } Ok(()) @@ -617,6 +675,10 @@ fn horizontal_bin_op<'tcx>( /// `left` and `right` using the high 4 bits in `imm`, sums the calculated /// products (up to 4), and conditionally stores the sum in `dest` using /// the low 4 bits of `imm`. +/// +/// Each 128-bit chunk is treated independently (i.e., the value for +/// the i-th 128-bit chunk of `dest` is calculated from the i-th +/// 128-bit chunks of `left` and `right`). fn conditional_dot_product<'tcx>( this: &mut crate::MiriInterpCx<'_, 'tcx>, left: &OpTy<'tcx, Provenance>, @@ -624,39 +686,43 @@ fn conditional_dot_product<'tcx>( imm: &OpTy<'tcx, Provenance>, dest: &PlaceTy<'tcx, Provenance>, ) -> InterpResult<'tcx, ()> { - let (left, left_len) = this.operand_to_simd(left)?; - let (right, right_len) = this.operand_to_simd(right)?; - let (dest, dest_len) = this.place_to_simd(dest)?; + let (num_chunks, items_per_chunk, left, right, dest) = + split_simd_to_128bit_chunks(this, left, right, dest)?; - assert_eq!(left_len, right_len); - assert!(dest_len <= 4); + let element_layout = left.layout.field(this, 0).field(this, 0); + assert!(items_per_chunk <= 4); - let imm = this.read_scalar(imm)?.to_u8()?; + // `imm` is a `u8` for SSE4.1 or an `i32` for AVX :/ + let imm = this.read_scalar(imm)?.to_uint(imm.layout.size)?; - let element_layout = left.layout.field(this, 0); + for i in 0..num_chunks { + let left = this.project_index(&left, i)?; + let right = this.project_index(&right, i)?; + let dest = this.project_index(&dest, i)?; - // Calculate dot product - // Elements are floating point numbers, but we can use `from_int` - // because the representation of 0.0 is all zero bits. - let mut sum = ImmTy::from_int(0u8, element_layout); - for i in 0..left_len { - if imm & (1 << i.checked_add(4).unwrap()) != 0 { - let left = this.read_immediate(&this.project_index(&left, i)?)?; - let right = this.read_immediate(&this.project_index(&right, i)?)?; - - let mul = this.wrapping_binary_op(mir::BinOp::Mul, &left, &right)?; - sum = this.wrapping_binary_op(mir::BinOp::Add, &sum, &mul)?; + // Calculate dot product + // Elements are floating point numbers, but we can use `from_int` + // for the initial value because the representation of 0.0 is all zero bits.
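(Aside: a scalar model of the conditional dot product computed here for one 128-bit chunk; the AVX variant simply repeats it per chunk. The names are mine:)

```rust
// The high 4 bits of `imm` select which products enter the sum, and the
// low 4 bits select which output elements receive the sum (others get 0).
fn dp_ps(left: [f32; 4], right: [f32; 4], imm: u8) -> [f32; 4] {
    let mut sum = 0.0;
    for j in 0..4 {
        if imm & (1u8 << (j + 4)) != 0 {
            sum += left[j] * right[j];
        }
    }
    let mut out = [0.0; 4];
    for j in 0..4 {
        if imm & (1u8 << j) != 0 {
            out[j] = sum;
        }
    }
    out
}

fn main() {
    // Multiply-and-sum all four pairs, store the result only in element 0.
    assert_eq!(dp_ps([1.0; 4], [2.0; 4], 0xF1), [8.0, 0.0, 0.0, 0.0]);
}
```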
+ let mut sum = ImmTy::from_int(0u8, element_layout); + for j in 0..items_per_chunk { + if imm & (1 << j.checked_add(4).unwrap()) != 0 { + let left = this.read_immediate(&this.project_index(&left, j)?)?; + let right = this.read_immediate(&this.project_index(&right, j)?)?; + + let mul = this.wrapping_binary_op(mir::BinOp::Mul, &left, &right)?; + sum = this.wrapping_binary_op(mir::BinOp::Add, &sum, &mul)?; + } } - } - // Write to destination (conditioned to imm) - for i in 0..dest_len { - let dest = this.project_index(&dest, i)?; + // Write to destination (conditioned to imm) + for j in 0..items_per_chunk { + let dest = this.project_index(&dest, j)?; - if imm & (1 << i) != 0 { - this.write_immediate(*sum, &dest)?; - } else { - this.write_scalar(Scalar::from_int(0u8, element_layout.size), &dest)?; + if imm & (1 << j) != 0 { + this.write_immediate(*sum, &dest)?; + } else { + this.write_scalar(Scalar::from_int(0u8, element_layout.size), &dest)?; + } } } @@ -693,3 +759,36 @@ fn test_bits_masked<'tcx>( Ok((all_zero, masked_set)) } + +/// Calculates two booleans. +/// +/// The first is true when the highest bit of each element of `op & mask` is zero. +/// The second is true when the highest bit of each element of `!op & mask` is zero. +fn test_high_bits_masked<'tcx>( + this: &crate::MiriInterpCx<'_, 'tcx>, + op: &OpTy<'tcx, Provenance>, + mask: &OpTy<'tcx, Provenance>, +) -> InterpResult<'tcx, (bool, bool)> { + assert_eq!(op.layout, mask.layout); + + let (op, op_len) = this.operand_to_simd(op)?; + let (mask, mask_len) = this.operand_to_simd(mask)?; + + assert_eq!(op_len, mask_len); + + let high_bit_offset = op.layout.field(this, 0).size.bits().checked_sub(1).unwrap(); + + let mut direct = true; + let mut negated = true; + for i in 0..op_len { + let op = this.project_index(&op, i)?; + let mask = this.project_index(&mask, i)?; + + let op = this.read_scalar(&op)?.to_uint(op.layout.size)?; + let mask = this.read_scalar(&mask)?.to_uint(mask.layout.size)?; + direct &= (op & mask) >> high_bit_offset == 0; + negated &= (!op & mask) >> high_bit_offset == 0; + } + + Ok((direct, negated)) +} diff --git a/src/tools/miri/src/shims/x86/sse.rs b/src/tools/miri/src/shims/x86/sse.rs index 9fb947cb2a3dc..da0db92738faf 100644 --- a/src/tools/miri/src/shims/x86/sse.rs +++ b/src/tools/miri/src/shims/x86/sse.rs @@ -208,10 +208,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this.write_immediate(*res0, &dest0)?; for i in 1..dest_len { - this.copy_op( - &this.project_index(&left, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?; } } _ => return Ok(EmulateForeignItemResult::NotSupported), diff --git a/src/tools/miri/src/shims/x86/sse2.rs b/src/tools/miri/src/shims/x86/sse2.rs index e5c8267320a45..b34b93e373900 100644 --- a/src/tools/miri/src/shims/x86/sse2.rs +++ b/src/tools/miri/src/shims/x86/sse2.rs @@ -440,10 +440,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this.write_scalar(res0, &this.project_index(&dest, 0)?)?; for i in 1..dest_len { - this.copy_op( - &this.project_index(&op, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&op, i)?, &this.project_index(&dest, i)?)?; } } // Used to implement _mm_sqrt_pd functions. 
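(Aside: an executable model of `test_high_bits_masked` over `u64` lanes, matching the `vtestz`/`vtestc`/`vtestnzc` mapping shown earlier; the names are mine:)

```rust
// Only the sign (highest) bit of each lane of `op & mask` resp. `!op & mask`
// matters for the two booleans.
fn vtest(op: &[u64], mask: &[u64]) -> (bool, bool) {
    let mut direct = true; // "testz": all sign bits of op & mask are zero
    let mut negated = true; // "testc": all sign bits of !op & mask are zero
    for (&o, &m) in op.iter().zip(mask) {
        direct &= (o & m) >> 63 == 0;
        negated &= (!o & m) >> 63 == 0;
    }
    (direct, negated) // "testnzc" is !direct && !negated
}

fn main() {
    let sign = 1u64 << 63;
    // One selected sign bit set, one clear: neither testz nor testc holds.
    assert_eq!(vtest(&[sign, 0], &[sign, sign]), (false, false));
}
```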
@@ -580,10 +577,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: // Copy remaining from `left` for i in 1..dest_len { - this.copy_op( - &this.project_index(&left, i)?, - &this.project_index(&dest, i)?, - )?; + this.copy_op(&this.project_index(&left, i)?, &this.project_index(&dest, i)?)?; } } // Used to implement the `_mm_pause` function. diff --git a/src/tools/miri/src/shims/x86/sse41.rs b/src/tools/miri/src/shims/x86/sse41.rs index 2abd10fa7a77d..32b1fe43c5837 100644 --- a/src/tools/miri/src/shims/x86/sse41.rs +++ b/src/tools/miri/src/shims/x86/sse41.rs @@ -57,10 +57,7 @@ pub(super) trait EvalContextExt<'mir, 'tcx: 'mir>: this.write_immediate(*src_value, &dest)?; } else { // copy from `left` - this.copy_op( - &this.project_index(&left, i)?, - &dest, - )?; + this.copy_op(&this.project_index(&left, i)?, &dest)?; } } } diff --git a/src/tools/miri/tests/fail/enum-set-discriminant-niche-variant-wrong.rs b/src/tools/miri/tests/fail/enum-set-discriminant-niche-variant-wrong.rs index 7097aa0c43ae3..428f371ca51c7 100644 --- a/src/tools/miri/tests/fail/enum-set-discriminant-niche-variant-wrong.rs +++ b/src/tools/miri/tests/fail/enum-set-discriminant-niche-variant-wrong.rs @@ -4,11 +4,11 @@ use std::intrinsics::mir::*; use std::num::NonZeroI32; -// We define our own option type so that we can control the varian indices. +// We define our own option type so that we can control the variant indices. #[allow(unused)] enum Option<T> { - None, - Some(T), + None, // variant 0 + Some(T), // variant 1 } use Option::*; diff --git a/src/tools/miri/tests/fail/issue-miri-3288-ice-symbolic-alignment-extern-static.rs b/src/tools/miri/tests/fail/issue-miri-3288-ice-symbolic-alignment-extern-static.rs index 446040749822e..fef5a6cddb91d 100644 --- a/src/tools/miri/tests/fail/issue-miri-3288-ice-symbolic-alignment-extern-static.rs +++ b/src/tools/miri/tests/fail/issue-miri-3288-ice-symbolic-alignment-extern-static.rs @@ -4,8 +4,7 @@ extern "C" { static _dispatch_queue_attr_concurrent: [u8; 0]; } -static DISPATCH_QUEUE_CONCURRENT: &'static [u8; 0] = - unsafe { &_dispatch_queue_attr_concurrent }; +static DISPATCH_QUEUE_CONCURRENT: &'static [u8; 0] = unsafe { &_dispatch_queue_attr_concurrent }; fn main() { let _val = *DISPATCH_QUEUE_CONCURRENT; //~ERROR: is not supported diff --git a/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond.rs b/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond.rs index b0325f7d78e50..f362caa11dc59 100644 --- a/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond.rs +++ b/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond.rs @@ -22,7 +22,7 @@ fn test_timed_wait_timeout(clock_id: i32) { let mut now_mu: MaybeUninit<libc::timespec> = MaybeUninit::uninit(); assert_eq!(libc::clock_gettime(clock_id, now_mu.as_mut_ptr()), 0); let now = now_mu.assume_init(); - // Waiting for a second... mostly because waiting less requires mich more tricky arithmetic. + // Waiting for a second... mostly because waiting less requires much more tricky arithmetic. // FIXME: wait less.
let timeout = libc::timespec { tv_sec: now.tv_sec + 1, tv_nsec: now.tv_nsec }; diff --git a/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond_isolated.rs b/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond_isolated.rs index 103ce44006d3a..66c0895a5dab0 100644 --- a/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond_isolated.rs +++ b/src/tools/miri/tests/pass-dep/concurrency/libc_pthread_cond_isolated.rs @@ -21,7 +21,7 @@ fn test_timed_wait_timeout(clock_id: i32) { let mut now_mu: MaybeUninit<libc::timespec> = MaybeUninit::uninit(); assert_eq!(libc::clock_gettime(clock_id, now_mu.as_mut_ptr()), 0); let now = now_mu.assume_init(); - // Waiting for a second... mostly because waiting less requires mich more tricky arithmetic. + // Waiting for a second... mostly because waiting less requires much more tricky arithmetic. // FIXME: wait less. let timeout = libc::timespec { tv_sec: now.tv_sec + 1, tv_nsec: now.tv_nsec }; diff --git a/src/tools/miri/tests/pass-dep/shims/mmap.rs b/src/tools/miri/tests/pass-dep/shims/mmap.rs index e19f54d0687df..7bbb9dd53cb87 100644 --- a/src/tools/miri/tests/pass-dep/shims/mmap.rs +++ b/src/tools/miri/tests/pass-dep/shims/mmap.rs @@ -5,16 +5,25 @@ use std::io::Error; use std::{ptr, slice}; -fn test_mmap() { +fn test_mmap<Offset: Default>( + mmap: unsafe extern "C" fn( + *mut libc::c_void, + libc::size_t, + libc::c_int, + libc::c_int, + libc::c_int, + Offset, + ) -> *mut libc::c_void, +) { let page_size = page_size::get(); let ptr = unsafe { - libc::mmap( + mmap( ptr::null_mut(), page_size, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, -1, - 0, + Default::default(), ) }; assert!(!ptr.is_null()); @@ -35,40 +44,40 @@ fn test_mmap() { // Test all of our error conditions let ptr = unsafe { - libc::mmap( + mmap( ptr::null_mut(), page_size, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_PRIVATE | libc::MAP_SHARED, // Can't be both private and shared -1, - 0, + Default::default(), ) }; assert_eq!(ptr, libc::MAP_FAILED); assert_eq!(Error::last_os_error().raw_os_error().unwrap(), libc::EINVAL); let ptr = unsafe { - libc::mmap( + mmap( ptr::null_mut(), 0, // Can't map no memory libc::PROT_READ | libc::PROT_WRITE, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, -1, - 0, + Default::default(), ) }; assert_eq!(ptr, libc::MAP_FAILED); assert_eq!(Error::last_os_error().raw_os_error().unwrap(), libc::EINVAL); let ptr = unsafe { - libc::mmap( + mmap( ptr::invalid_mut(page_size * 64), page_size, libc::PROT_READ | libc::PROT_WRITE, // We don't support MAP_FIXED libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | libc::MAP_FIXED, -1, - 0, + Default::default(), ) }; assert_eq!(ptr, libc::MAP_FAILED); @@ -77,13 +86,13 @@ fn test_mmap() { // We don't support protections other than read+write for prot in [libc::PROT_NONE, libc::PROT_EXEC, libc::PROT_READ, libc::PROT_WRITE] { let ptr = unsafe { - libc::mmap( + mmap( ptr::null_mut(), page_size, prot, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, -1, - 0, + Default::default(), ) }; assert_eq!(ptr, libc::MAP_FAILED); @@ -93,13 +102,13 @@ fn test_mmap() { // We report an error for mappings whose length cannot be rounded up to a multiple of // the page size.
let ptr = unsafe { - libc::mmap( + mmap( ptr::null_mut(), usize::MAX - 1, libc::PROT_READ | libc::PROT_WRITE, libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, -1, - 0, + Default::default(), ) }; assert_eq!(ptr, libc::MAP_FAILED); @@ -163,7 +172,9 @@ fn test_mremap() { } fn main() { - test_mmap(); + test_mmap(libc::mmap); + #[cfg(target_os = "linux")] + test_mmap(libc::mmap64); #[cfg(target_os = "linux")] test_mremap(); } diff --git a/src/tools/miri/tests/pass-dep/shims/pthread-sync.rs b/src/tools/miri/tests/pass-dep/shims/pthread-sync.rs index 4cc5b7d68a3cc..077bbfff1645f 100644 --- a/src/tools/miri/tests/pass-dep/shims/pthread-sync.rs +++ b/src/tools/miri/tests/pass-dep/shims/pthread-sync.rs @@ -1,24 +1,34 @@ //@ignore-target-windows: No libc on Windows +// We use `yield` to test specific interleavings, so disable automatic preemption. +//@compile-flags: -Zmiri-preemption-rate=0 +#![feature(sync_unsafe_cell)] + +use std::cell::SyncUnsafeCell; +use std::thread; +use std::{mem, ptr}; fn main() { test_mutex_libc_init_recursive(); test_mutex_libc_init_normal(); test_mutex_libc_init_errorcheck(); test_rwlock_libc_static_initializer(); - #[cfg(target_os = "linux")] test_mutex_libc_static_initializer_recursive(); + + test_mutex(); + check_rwlock_write(); + check_rwlock_read_no_deadlock(); } fn test_mutex_libc_init_recursive() { unsafe { - let mut attr: libc::pthread_mutexattr_t = std::mem::zeroed(); + let mut attr: libc::pthread_mutexattr_t = mem::zeroed(); assert_eq!(libc::pthread_mutexattr_init(&mut attr as *mut _), 0); assert_eq!( libc::pthread_mutexattr_settype(&mut attr as *mut _, libc::PTHREAD_MUTEX_RECURSIVE), 0, ); - let mut mutex: libc::pthread_mutex_t = std::mem::zeroed(); + let mut mutex: libc::pthread_mutex_t = mem::zeroed(); assert_eq!(libc::pthread_mutex_init(&mut mutex as *mut _, &mut attr as *mut _), 0); assert_eq!(libc::pthread_mutex_lock(&mut mutex as *mut _), 0); assert_eq!(libc::pthread_mutex_trylock(&mut mutex as *mut _), 0); @@ -36,7 +46,7 @@ fn test_mutex_libc_init_recursive() { fn test_mutex_libc_init_normal() { unsafe { - let mut mutexattr: libc::pthread_mutexattr_t = std::mem::zeroed(); + let mut mutexattr: libc::pthread_mutexattr_t = mem::zeroed(); assert_eq!( libc::pthread_mutexattr_settype(&mut mutexattr as *mut _, 0x12345678), libc::EINVAL, @@ -45,7 +55,7 @@ fn test_mutex_libc_init_normal() { libc::pthread_mutexattr_settype(&mut mutexattr as *mut _, libc::PTHREAD_MUTEX_NORMAL), 0, ); - let mut mutex: libc::pthread_mutex_t = std::mem::zeroed(); + let mut mutex: libc::pthread_mutex_t = mem::zeroed(); assert_eq!(libc::pthread_mutex_init(&mut mutex as *mut _, &mutexattr as *const _), 0); assert_eq!(libc::pthread_mutex_lock(&mut mutex as *mut _), 0); assert_eq!(libc::pthread_mutex_trylock(&mut mutex as *mut _), libc::EBUSY); @@ -58,7 +68,7 @@ fn test_mutex_libc_init_normal() { fn test_mutex_libc_init_errorcheck() { unsafe { - let mut mutexattr: libc::pthread_mutexattr_t = std::mem::zeroed(); + let mut mutexattr: libc::pthread_mutexattr_t = mem::zeroed(); assert_eq!( libc::pthread_mutexattr_settype( &mut mutexattr as *mut _, @@ -66,7 +76,7 @@ fn test_mutex_libc_init_errorcheck() { ), 0, ); - let mut mutex: libc::pthread_mutex_t = std::mem::zeroed(); + let mut mutex: libc::pthread_mutex_t = mem::zeroed(); assert_eq!(libc::pthread_mutex_init(&mut mutex as *mut _, &mutexattr as *const _), 0); assert_eq!(libc::pthread_mutex_lock(&mut mutex as *mut _), 0); assert_eq!(libc::pthread_mutex_trylock(&mut mutex as *mut _), libc::EBUSY); @@ -98,9 +108,113 @@ fn 
test_mutex_libc_static_initializer_recursive() { } } -// Testing the behavior of std::sync::RwLock does not fully exercise the pthread rwlock shims, we -// need to go a layer deeper and test the behavior of the libc functions, because -// std::sys::unix::rwlock::RWLock itself keeps track of write_locked and num_readers. +struct SendPtr<T> { + ptr: *mut T, +} +unsafe impl<T> Send for SendPtr<T> {} +impl<T> Copy for SendPtr<T> {} +impl<T> Clone for SendPtr<T> { + fn clone(&self) -> Self { + *self + } +} + +fn test_mutex() { + // Specifically *not* using `Arc` to make sure there is no synchronization apart from the mutex. + unsafe { + let data = SyncUnsafeCell::new((libc::PTHREAD_MUTEX_INITIALIZER, 0)); + let ptr = SendPtr { ptr: data.get() }; + let mut threads = Vec::new(); + + for _ in 0..3 { + let thread = thread::spawn(move || { + let ptr = ptr; // circumvent per-field closure capture + let mutexptr = ptr::addr_of_mut!((*ptr.ptr).0); + assert_eq!(libc::pthread_mutex_lock(mutexptr), 0); + thread::yield_now(); + (*ptr.ptr).1 += 1; + assert_eq!(libc::pthread_mutex_unlock(mutexptr), 0); + }); + threads.push(thread); + } + + for thread in threads { + thread.join().unwrap(); + } + + let mutexptr = ptr::addr_of_mut!((*ptr.ptr).0); + assert_eq!(libc::pthread_mutex_trylock(mutexptr), 0); + assert_eq!((*ptr.ptr).1, 3); + } +} + +fn check_rwlock_write() { + unsafe { + let data = SyncUnsafeCell::new((libc::PTHREAD_RWLOCK_INITIALIZER, 0)); + let ptr = SendPtr { ptr: data.get() }; + let mut threads = Vec::new(); + + for _ in 0..3 { + let thread = thread::spawn(move || { + let ptr = ptr; // circumvent per-field closure capture + let rwlockptr = ptr::addr_of_mut!((*ptr.ptr).0); + assert_eq!(libc::pthread_rwlock_wrlock(rwlockptr), 0); + thread::yield_now(); + (*ptr.ptr).1 += 1; + assert_eq!(libc::pthread_rwlock_unlock(rwlockptr), 0); + }); + threads.push(thread); + + let readthread = thread::spawn(move || { + let ptr = ptr; // circumvent per-field closure capture + let rwlockptr = ptr::addr_of_mut!((*ptr.ptr).0); + assert_eq!(libc::pthread_rwlock_rdlock(rwlockptr), 0); + thread::yield_now(); + let val = (*ptr.ptr).1; + assert!(val >= 0 && val <= 3); + assert_eq!(libc::pthread_rwlock_unlock(rwlockptr), 0); + }); + threads.push(readthread); + } + + for thread in threads { + thread.join().unwrap(); + } + + let rwlockptr = ptr::addr_of_mut!((*ptr.ptr).0); + assert_eq!(libc::pthread_rwlock_tryrdlock(rwlockptr), 0); + assert_eq!((*ptr.ptr).1, 3); + } +} + +fn check_rwlock_read_no_deadlock() { + unsafe { + let l1 = SyncUnsafeCell::new(libc::PTHREAD_RWLOCK_INITIALIZER); + let l1 = SendPtr { ptr: l1.get() }; + let l2 = SyncUnsafeCell::new(libc::PTHREAD_RWLOCK_INITIALIZER); + let l2 = SendPtr { ptr: l2.get() }; + + // acquire l1 and hold it until after the other thread is done + assert_eq!(libc::pthread_rwlock_rdlock(l1.ptr), 0); + let handle = thread::spawn(move || { + let l1 = l1; // circumvent per-field closure capture + let l2 = l2; // circumvent per-field closure capture + // acquire l2 before the other thread + assert_eq!(libc::pthread_rwlock_rdlock(l2.ptr), 0); + thread::yield_now(); + assert_eq!(libc::pthread_rwlock_rdlock(l1.ptr), 0); + thread::yield_now(); + assert_eq!(libc::pthread_rwlock_unlock(l1.ptr), 0); + assert_eq!(libc::pthread_rwlock_unlock(l2.ptr), 0); + }); + thread::yield_now(); + assert_eq!(libc::pthread_rwlock_rdlock(l2.ptr), 0); + handle.join().unwrap(); + } +} + +// std::sync::RwLock does not even use pthread_rwlock anymore. +// Do some smoke testing of the API surface.
fn test_rwlock_libc_static_initializer() { let rw = std::cell::UnsafeCell::new(libc::PTHREAD_RWLOCK_INITIALIZER); unsafe { diff --git a/src/tools/miri/tests/pass/align_offset_symbolic.rs b/src/tools/miri/tests/pass/align_offset_symbolic.rs index e96f11b1efa72..ac28c63e08110 100644 --- a/src/tools/miri/tests/pass/align_offset_symbolic.rs +++ b/src/tools/miri/tests/pass/align_offset_symbolic.rs @@ -113,7 +113,7 @@ fn vtable() { let ptr: &dyn Send = &0; let parts: (*const (), *const u8) = unsafe { mem::transmute(ptr) }; - let vtable = parts.1 ; + let vtable = parts.1; let offset = vtable.align_offset(mem::align_of::<TWOPTR>()); let _vtable_aligned = vtable.wrapping_add(offset) as *const [TWOPTR; 0]; // FIXME: we can't actually do the access since vtable pointers act like zero-sized allocations. diff --git a/src/tools/miri/tests/pass/concurrency/sync.rs b/src/tools/miri/tests/pass/concurrency/sync.rs index e93e617fd2620..1d48e5312d496 100644 --- a/src/tools/miri/tests/pass/concurrency/sync.rs +++ b/src/tools/miri/tests/pass/concurrency/sync.rs @@ -1,6 +1,7 @@ //@revisions: stack tree //@[tree]compile-flags: -Zmiri-tree-borrows -//@compile-flags: -Zmiri-disable-isolation -Zmiri-strict-provenance +// We use `yield` to test specific interleavings, so disable automatic preemption. +//@compile-flags: -Zmiri-disable-isolation -Zmiri-strict-provenance -Zmiri-preemption-rate=0 use std::sync::{Arc, Barrier, Condvar, Mutex, Once, RwLock}; use std::thread; @@ -119,13 +120,25 @@ fn check_rwlock_write() { let mut threads = Vec::new(); for _ in 0..3 { - let data = Arc::clone(&data); - let thread = thread::spawn(move || { - let mut data = data.write().unwrap(); - thread::yield_now(); - *data += 1; + let thread = thread::spawn({ + let data = Arc::clone(&data); + move || { + let mut data = data.write().unwrap(); + thread::yield_now(); + *data += 1; + } }); threads.push(thread); + + let readthread = thread::spawn({ + let data = Arc::clone(&data); + move || { + let data = data.read().unwrap(); + thread::yield_now(); + assert!(*data >= 0 && *data <= 3); + } + }); + threads.push(readthread); } for thread in threads { @@ -144,8 +157,10 @@ fn check_rwlock_read_no_deadlock() { let l1_copy = Arc::clone(&l1); let l2_copy = Arc::clone(&l2); + // acquire l1 and hold it until after the other thread is done let _guard1 = l1.read().unwrap(); let handle = thread::spawn(move || { + // acquire l2 before the other thread let _guard2 = l2_copy.read().unwrap(); thread::yield_now(); let _guard1 = l1_copy.read().unwrap(); diff --git a/src/tools/miri/tests/pass/imported_main.rs b/src/tools/miri/tests/pass/imported_main.rs new file mode 100644 index 0000000000000..32b39152f7839 --- /dev/null +++ b/src/tools/miri/tests/pass/imported_main.rs @@ -0,0 +1,8 @@ +#![feature(imported_main)] + +pub mod foo { + pub fn mymain() { + println!("Hello, world!"); + } +} +use foo::mymain as main; diff --git a/src/tools/miri/tests/pass/imported_main.stdout b/src/tools/miri/tests/pass/imported_main.stdout new file mode 100644 index 0000000000000..af5626b4a114a --- /dev/null +++ b/src/tools/miri/tests/pass/imported_main.stdout @@ -0,0 +1 @@ +Hello, world!
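The repeated `let ptr = ptr;` rebindings in the pthread tests above work around Rust 2021 per-field closure capture: without the rebinding, the `move` closure would capture only the raw-pointer field `ptr.ptr` (a `*mut _`, which is not `Send`) rather than the whole `SendPtr`, and `thread::spawn` would reject the closure. A minimal standalone sketch of the issue (`spawn_reader` is a hypothetical helper, not part of the patch):

```rust
use std::thread;

struct SendPtr<T> { ptr: *mut T }
unsafe impl<T> Send for SendPtr<T> {}
impl<T> Copy for SendPtr<T> {}
impl<T> Clone for SendPtr<T> { fn clone(&self) -> Self { *self } }

fn spawn_reader(p: SendPtr<i32>) -> thread::JoinHandle<i32> {
    thread::spawn(move || {
        // Rebinding captures the whole `SendPtr`, which is `Send`; without this
        // line the closure would capture just the `*mut i32` field, which is not.
        let p = p;
        unsafe { *p.ptr }
    })
}
```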
diff --git a/src/tools/miri/tests/pass/intrinsics-x86-avx.rs b/src/tools/miri/tests/pass/intrinsics-x86-avx.rs index 933e3d4153ada..7d43cc596aedb 100644 --- a/src/tools/miri/tests/pass/intrinsics-x86-avx.rs +++ b/src/tools/miri/tests/pass/intrinsics-x86-avx.rs @@ -25,6 +25,528 @@ fn main() { #[target_feature(enable = "avx")] unsafe fn test_avx() { + // Mostly copied from library/stdarch/crates/core_arch/src/x86/avx.rs + + macro_rules! assert_approx_eq { + ($a:expr, $b:expr, $eps:expr) => {{ + let (a, b) = (&$a, &$b); + assert!( + (*a - *b).abs() < $eps, + "assertion failed: `(left !== right)` \ + (left: `{:?}`, right: `{:?}`, expect diff: `{:?}`, real diff: `{:?}`)", + *a, + *b, + $eps, + (*a - *b).abs() + ); + }}; + } + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_max_pd() { + let a = _mm256_setr_pd(1., 4., 5., 8.); + let b = _mm256_setr_pd(2., 3., 6., 7.); + let r = _mm256_max_pd(a, b); + let e = _mm256_setr_pd(2., 4., 6., 8.); + assert_eq_m256d(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); + let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); + let wu: [u64; 4] = transmute(w); + let xu: [u64; 4] = transmute(x); + assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); + assert_eq!(xu, [0u64; 4]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); + let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); + let yf: [f64; 4] = transmute(y); + let zf: [f64; 4] = transmute(z); + assert_eq!(yf, [0.0; 4]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + test_mm256_max_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_max_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_max_ps(a, b); + let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); + assert_eq_m256(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); + let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); + let wu: [u32; 8] = transmute(w); + let xu: [u32; 8] = transmute(x); + assert_eq!(wu, [0x8000_0000u32; 8]); + assert_eq!(xu, [0u32; 8]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); + let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); + let yf: [f32; 8] = transmute(y); + let zf: [f32; 8] = transmute(z); + assert_eq!(yf, [0.0; 8]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + test_mm256_max_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_min_pd() { + let a = _mm256_setr_pd(1., 4., 5., 8.); + let b = _mm256_setr_pd(2., 3., 6., 7.); + let r = _mm256_min_pd(a, b); + let e = _mm256_setr_pd(1., 3., 5., 7.); + assert_eq_m256d(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. 
+ let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); + let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); + let wu: [u64; 4] = transmute(w); + let xu: [u64; 4] = transmute(x); + assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); + assert_eq!(xu, [0u64; 4]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); + let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); + let yf: [f64; 4] = transmute(y); + let zf: [f64; 4] = transmute(z); + assert_eq!(yf, [0.0; 4]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + test_mm256_min_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_min_ps() { + let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); + let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); + let r = _mm256_min_ps(a, b); + let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); + assert_eq_m256(r, e); + // > If the values being compared are both 0.0s (of either sign), the + // > value in the second operand (source operand) is returned. + let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); + let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); + let wu: [u32; 8] = transmute(w); + let xu: [u32; 8] = transmute(x); + assert_eq!(wu, [0x8000_0000u32; 8]); + assert_eq!(xu, [0u32; 8]); + // > If only one value is a NaN (SNaN or QNaN) for this instruction, the + // > second operand (source operand), either a NaN or a valid + // > floating-point value, is written to the result. + let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); + let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); + let yf: [f32; 8] = transmute(y); + let zf: [f32; 8] = transmute(z); + assert_eq!(yf, [0.0; 8]); + assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); + } + test_mm256_min_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_nearest_f32() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f32, res: f32) { + let a = _mm256_set1_ps(x); + let e = _mm256_set1_ps(res); + let r = _mm256_round_ps::<_MM_FROUND_TO_NEAREST_INT>(a); + assert_eq_m256(r, e); + // Assume round-to-nearest by default + let r = _mm256_round_ps::<_MM_FROUND_CUR_DIRECTION>(a); + assert_eq_m256(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -2.0); + test(-1.5, -2.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 2.0); + test(1.75, 2.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_ps(1.5, 3.5, 5.5, 7.5, 9.5, 11.5, 13.5, 15.5); + let e = _mm256_setr_ps(2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0); + let r = _mm256_round_ps::<_MM_FROUND_TO_NEAREST_INT>(a); + assert_eq_m256(r, e); + // Assume round-to-nearest by default + let r = _mm256_round_ps::<_MM_FROUND_CUR_DIRECTION>(a); + assert_eq_m256(r, e); + } + test_round_nearest_f32(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_floor_f32() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f32, res: f32) { + let a = _mm256_set1_ps(x); + let e = _mm256_set1_ps(res); + let r = _mm256_floor_ps(a); + assert_eq_m256(r, e); + let r = _mm256_round_ps::<_MM_FROUND_TO_NEG_INF>(a); + assert_eq_m256(r, e); + } + + // Test rounding direction + test(-2.5, -3.0); + test(-1.75, -2.0); + test(-1.5, 
-2.0); + test(-1.25, -2.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 1.0); + test(1.75, 1.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_ps(1.5, 3.5, 5.5, 7.5, 9.5, 11.5, 13.5, 15.5); + let e = _mm256_setr_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); + let r = _mm256_floor_ps(a); + assert_eq_m256(r, e); + let r = _mm256_round_ps::<_MM_FROUND_TO_NEG_INF>(a); + assert_eq_m256(r, e); + } + test_round_floor_f32(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_ceil_f32() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f32, res: f32) { + let a = _mm256_set1_ps(x); + let e = _mm256_set1_ps(res); + let r = _mm256_ceil_ps(a); + assert_eq_m256(r, e); + let r = _mm256_round_ps::<_MM_FROUND_TO_POS_INF>(a); + assert_eq_m256(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -1.0); + test(-1.5, -1.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 2.0); + test(1.5, 2.0); + test(1.75, 2.0); + test(2.5, 3.0); + + // Test that each element is rounded + let a = _mm256_setr_ps(1.5, 3.5, 5.5, 7.5, 9.5, 11.5, 13.5, 15.5); + let e = _mm256_setr_ps(2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0); + let r = _mm256_ceil_ps(a); + assert_eq_m256(r, e); + let r = _mm256_round_ps::<_MM_FROUND_TO_POS_INF>(a); + assert_eq_m256(r, e); + } + test_round_ceil_f32(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_trunc_f32() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f32, res: f32) { + let a = _mm256_set1_ps(x); + let e = _mm256_set1_ps(res); + let r = _mm256_round_ps::<_MM_FROUND_TO_ZERO>(a); + assert_eq_m256(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -1.0); + test(-1.5, -1.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 1.0); + test(1.75, 1.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_ps(1.5, 3.5, 5.5, 7.5, 9.5, 11.5, 13.5, 15.5); + let e = _mm256_setr_ps(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0); + let r = _mm256_round_ps::<_MM_FROUND_TO_ZERO>(a); + assert_eq_m256(r, e); + } + test_round_trunc_f32(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_nearest_f64() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f64, res: f64) { + let a = _mm256_set1_pd(x); + let e = _mm256_set1_pd(res); + let r = _mm256_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); + assert_eq_m256d(r, e); + // Assume round-to-nearest by default + let r = _mm256_round_pd::<_MM_FROUND_CUR_DIRECTION>(a); + assert_eq_m256d(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -2.0); + test(-1.5, -2.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 2.0); + test(1.75, 2.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_pd(1.5, 3.5, 5.5, 7.5); + let e = _mm256_setr_pd(2.0, 4.0, 6.0, 8.0); + let r = _mm256_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a); + assert_eq_m256d(r, e); + // Assume round-to-nearest by default + let r = _mm256_round_pd::<_MM_FROUND_CUR_DIRECTION>(a); + assert_eq_m256d(r, e); + } + test_round_nearest_f64(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_floor_f64() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f64, res: f64) { + let a = _mm256_set1_pd(x); + let e = _mm256_set1_pd(res); + let r = _mm256_floor_pd(a); + assert_eq_m256d(r, e); + let r = 
_mm256_round_pd::<_MM_FROUND_TO_NEG_INF>(a); + assert_eq_m256d(r, e); + } + + // Test rounding direction + test(-2.5, -3.0); + test(-1.75, -2.0); + test(-1.5, -2.0); + test(-1.25, -2.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 1.0); + test(1.75, 1.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_pd(1.5, 3.5, 5.5, 7.5); + let e = _mm256_setr_pd(1.0, 3.0, 5.0, 7.0); + let r = _mm256_floor_pd(a); + assert_eq_m256d(r, e); + let r = _mm256_round_pd::<_MM_FROUND_TO_NEG_INF>(a); + assert_eq_m256d(r, e); + } + test_round_floor_f64(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_ceil_f64() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f64, res: f64) { + let a = _mm256_set1_pd(x); + let e = _mm256_set1_pd(res); + let r = _mm256_ceil_pd(a); + assert_eq_m256d(r, e); + let r = _mm256_round_pd::<_MM_FROUND_TO_POS_INF>(a); + assert_eq_m256d(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -1.0); + test(-1.5, -1.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 2.0); + test(1.5, 2.0); + test(1.75, 2.0); + test(2.5, 3.0); + + // Test that each element is rounded + let a = _mm256_setr_pd(1.5, 3.5, 5.5, 7.5); + let e = _mm256_setr_pd(2.0, 4.0, 6.0, 8.0); + let r = _mm256_ceil_pd(a); + assert_eq_m256d(r, e); + let r = _mm256_round_pd::<_MM_FROUND_TO_POS_INF>(a); + assert_eq_m256d(r, e); + } + test_round_ceil_f64(); + + #[target_feature(enable = "avx")] + unsafe fn test_round_trunc_f64() { + #[target_feature(enable = "avx")] + unsafe fn test(x: f64, res: f64) { + let a = _mm256_set1_pd(x); + let e = _mm256_set1_pd(res); + let r = _mm256_round_pd::<_MM_FROUND_TO_ZERO>(a); + assert_eq_m256d(r, e); + } + + // Test rounding direction + test(-2.5, -2.0); + test(-1.75, -1.0); + test(-1.5, -1.0); + test(-1.25, -1.0); + test(-1.0, -1.0); + test(0.0, 0.0); + test(1.0, 1.0); + test(1.25, 1.0); + test(1.5, 1.0); + test(1.75, 1.0); + test(2.5, 2.0); + + // Test that each element is rounded + let a = _mm256_setr_pd(1.5, 3.5, 5.5, 7.5); + let e = _mm256_setr_pd(1.0, 3.0, 5.0, 7.0); + let r = _mm256_round_pd::<_MM_FROUND_TO_ZERO>(a); + assert_eq_m256d(r, e); + } + test_round_trunc_f64(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_sqrt_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_sqrt_ps(a); + let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.); + assert_eq_m256(r, e); + } + test_mm256_sqrt_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_rcp_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_rcp_ps(a); + #[rustfmt::skip] + let e = _mm256_setr_ps( + 0.99975586, 0.49987793, 0.33325195, 0.24993896, + 0.19995117, 0.16662598, 0.14282227, 0.12496948, + ); + let rel_err = 0.00048828125; + + let r: [f32; 8] = transmute(r); + let e: [f32; 8] = transmute(e); + for i in 0..8 { + assert_approx_eq!(r[i], e[i], 2. * rel_err); + } + } + test_mm256_rcp_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_rsqrt_ps() { + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + let r = _mm256_rsqrt_ps(a); + #[rustfmt::skip] + let e = _mm256_setr_ps( + 0.99975586, 0.7069092, 0.5772705, 0.49987793, + 0.44714355, 0.40820313, 0.3779297, 0.3534546, + ); + let rel_err = 0.00048828125; + + let r: [f32; 8] = transmute(r); + let e: [f32; 8] = transmute(e); + for i in 0..8 { + assert_approx_eq!(r[i], e[i], 2. 
* rel_err); + } + } + test_mm256_rsqrt_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_dp_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_dp_ps::<0xFF>(a, b); + let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); + assert_eq_m256(r, e); + } + test_mm256_dp_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_hadd_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_hadd_pd(a, b); + let e = _mm256_setr_pd(13., 7., 41., 7.); + assert_eq_m256d(r, e); + + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_hadd_pd(a, b); + let e = _mm256_setr_pd(3., 11., 7., 15.); + assert_eq_m256d(r, e); + } + test_mm256_hadd_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_hadd_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_hadd_ps(a, b); + let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.); + assert_eq_m256(r, e); + + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_hadd_ps(a, b); + let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.); + assert_eq_m256(r, e); + } + test_mm256_hadd_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_hsub_pd() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let b = _mm256_setr_pd(4., 3., 2., 5.); + let r = _mm256_hsub_pd(a, b); + let e = _mm256_setr_pd(-5., 1., -9., -3.); + assert_eq_m256d(r, e); + + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_hsub_pd(a, b); + let e = _mm256_setr_pd(-1., -1., -1., -1.); + assert_eq_m256d(r, e); + } + test_mm256_hsub_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_hsub_ps() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let r = _mm256_hsub_ps(a, b); + let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.); + assert_eq_m256(r, e); + + let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_hsub_ps(a, b); + let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.); + assert_eq_m256(r, e); + } + test_mm256_hsub_ps(); + fn expected_cmp<F: PartialOrd>(imm: i32, lhs: F, rhs: F, if_t: F, if_f: F) -> F { let res = match imm { _CMP_EQ_OQ => lhs == rhs, @@ -135,12 +657,54 @@ unsafe fn test_avx() { } } + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cmp_ps<const IMM: i32>() { + let values = [ + (1.0, 1.0), + (0.0, 1.0), + (1.0, 0.0), + (f32::NAN, 0.0), + (0.0, f32::NAN), + (f32::NAN, f32::NAN), + ]; + + for (lhs, rhs) in values { + let a = _mm256_set1_ps(lhs); + let b = _mm256_set1_ps(rhs); + let r: [u32; 8] = transmute(_mm256_cmp_ps::<IMM>(a, b)); + let e: [u32; 8] = transmute(_mm256_set1_ps(expected_cmp_f32(IMM, lhs, rhs))); + assert_eq!(r, e); + } + } + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cmp_pd<const IMM: i32>() { + let values = [ + (1.0, 1.0), + (0.0, 1.0), + (1.0, 0.0), + (f64::NAN, 0.0), + (0.0, f64::NAN), + (f64::NAN, f64::NAN), + ]; + + for (lhs, rhs) in values { + let a = _mm256_set1_pd(lhs); + let b = _mm256_set1_pd(rhs); + let r: [u64; 4] = transmute(_mm256_cmp_pd::<IMM>(a, b)); + let e: [u64; 4] = transmute(_mm256_set1_pd(expected_cmp_f64(IMM, lhs,
rhs))); + assert_eq!(r, e); + } + } + #[target_feature(enable = "avx")] unsafe fn test_cmp<const IMM: i32>() { test_mm_cmp_ss::<IMM>(); test_mm_cmp_ps::<IMM>(); test_mm_cmp_sd::<IMM>(); test_mm_cmp_pd::<IMM>(); + test_mm256_cmp_ps::<IMM>(); + test_mm256_cmp_pd::<IMM>(); } test_cmp::<_CMP_EQ_OQ>(); @@ -159,4 +723,709 @@ unsafe fn test_avx() { test_cmp::<_CMP_GE_OS>(); test_cmp::<_CMP_GT_OS>(); test_cmp::<_CMP_TRUE_US>(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cvtps_epi32() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_cvtps_epi32(a); + let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq_m256i(r, e); + + let a = _mm256_setr_ps( + f32::NEG_INFINITY, + f32::INFINITY, + f32::MIN, + f32::MAX, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + ); + let r = _mm256_cvtps_epi32(a); + assert_eq_m256i(r, _mm256_set1_epi32(i32::MIN)); + } + test_mm256_cvtps_epi32(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cvttps_epi32() { + let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); + let r = _mm256_cvttps_epi32(a); + let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); + assert_eq_m256i(r, e); + + let a = _mm256_setr_ps( + f32::NEG_INFINITY, + f32::INFINITY, + f32::MIN, + f32::MAX, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + ); + let r = _mm256_cvttps_epi32(a); + assert_eq_m256i(r, _mm256_set1_epi32(i32::MIN)); + } + test_mm256_cvttps_epi32(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cvtpd_epi32() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_cvtpd_epi32(a); + let e = _mm_setr_epi32(4, 9, 16, 25); + assert_eq_m128i(r, e); + + let a = _mm256_setr_pd(f64::NEG_INFINITY, f64::INFINITY, f64::MIN, f64::MAX); + let r = _mm256_cvtpd_epi32(a); + assert_eq_m128i(r, _mm_set1_epi32(i32::MIN)); + } + test_mm256_cvtpd_epi32(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_cvttpd_epi32() { + let a = _mm256_setr_pd(4., 9., 16., 25.); + let r = _mm256_cvttpd_epi32(a); + let e = _mm_setr_epi32(4, 9, 16, 25); + assert_eq_m128i(r, e); + + let a = _mm256_setr_pd(f64::NEG_INFINITY, f64::INFINITY, f64::MIN, f64::MAX); + let r = _mm256_cvttpd_epi32(a); + assert_eq_m128i(r, _mm_set1_epi32(i32::MIN)); + } + test_mm256_cvttpd_epi32(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_permutevar_ps() { + let a = _mm_setr_ps(4., 3., 2., 5.); + let b = _mm_setr_epi32(1, 2, 3, 4); + let r = _mm_permutevar_ps(a, b); + let e = _mm_setr_ps(3., 2., 5., 4.); + assert_eq_m128(r, e); + } + test_mm_permutevar_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_permutevar_ps() { + let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); + let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + let r = _mm256_permutevar_ps(a, b); + let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.); + assert_eq_m256(r, e); + } + test_mm256_permutevar_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_permutevar_pd() { + let a = _mm_setr_pd(4., 3.); + let b = _mm_setr_epi64x(3, 0); + let r = _mm_permutevar_pd(a, b); + let e = _mm_setr_pd(3., 4.); + assert_eq_m128d(r, e); + } + test_mm_permutevar_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_permutevar_pd() { + let a = _mm256_setr_pd(4., 3., 2., 5.); + let b = _mm256_setr_epi64x(1, 2, 3, 4); + let r = _mm256_permutevar_pd(a, b); + let e = _mm256_setr_pd(4., 3., 5., 2.); + assert_eq_m256d(r, e); + } + test_mm256_permutevar_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_permute2f128_ps() { + let a = _mm256_setr_ps(1., 2., 3.,
4., 1., 2., 3., 4.); + let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); + let r = _mm256_permute2f128_ps::<0x13>(a, b); + let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.); + assert_eq_m256(r, e); + + let r = _mm256_permute2f128_ps::<0x44>(a, b); + let e = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + assert_eq_m256(r, e); + } + test_mm256_permute2f128_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_permute2f128_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_permute2f128_pd::<0x31>(a, b); + let e = _mm256_setr_pd(3., 4., 7., 8.); + assert_eq_m256d(r, e); + + let r = _mm256_permute2f128_pd::<0x44>(a, b); + let e = _mm256_setr_pd(0.0, 0.0, 0.0, 0.0); + assert_eq_m256d(r, e); + } + test_mm256_permute2f128_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_permute2f128_si256() { + let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4); + let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8); + let r = _mm256_permute2f128_si256::<0x20>(a, b); + let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + assert_eq_m256i(r, e); + + let r = _mm256_permute2f128_si256::<0x44>(a, b); + let e = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + test_mm256_permute2f128_si256(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_maskload_ps() { + let a = &[1.0f32, 2., 3., 4.]; + let mask = _mm_setr_epi32(0, !0, 0, !0); + let r = _mm_maskload_ps(a.as_ptr(), mask); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + + // Unaligned pointer + let a = Unaligned::new([1.0f32, 2., 3., 4.]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let r = _mm_maskload_ps(a.as_ptr().cast(), mask); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + + // Only loading first element, so slice can be short. + let a = &[2.0f32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let r = _mm_maskload_ps(a.as_ptr(), mask); + let e = _mm_setr_ps(2.0, 0.0, 0.0, 0.0); + assert_eq_m128(r, e); + + // Only loading last element, so slice can be short. + let a = &[2.0f32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let r = _mm_maskload_ps(a.as_ptr().wrapping_sub(3), mask); + let e = _mm_setr_ps(0.0, 0.0, 0.0, 2.0); + assert_eq_m128(r, e); + } + test_mm_maskload_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_maskload_pd() { + let a = &[1.0f64, 2.]; + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_pd(a.as_ptr(), mask); + let e = _mm_setr_pd(0., 2.); + assert_eq_m128d(r, e); + + // Unaligned pointer + let a = Unaligned::new([1.0f64, 2.]); + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_pd(a.as_ptr().cast(), mask); + let e = _mm_setr_pd(0., 2.); + assert_eq_m128d(r, e); + + // Only loading first element, so slice can be short. + let a = &[2.0f64]; + let mask = _mm_setr_epi64x(!0, 0); + let r = _mm_maskload_pd(a.as_ptr(), mask); + let e = _mm_setr_pd(2.0, 0.0); + assert_eq_m128d(r, e); + + // Only loading last element, so slice can be short. 
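+ // Only the last lane is enabled in the mask, so the only address this load actually reads + // is `base + 1`, i.e. `a[0]`; the masked-off lane before the allocation is never touched.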
+ let a = &[2.0f64]; + let mask = _mm_setr_epi64x(0, !0); + let r = _mm_maskload_pd(a.as_ptr().wrapping_sub(1), mask); + let e = _mm_setr_pd(0.0, 2.0); + assert_eq_m128d(r, e); + } + test_mm_maskload_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_maskload_ps() { + let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let r = _mm256_maskload_ps(a.as_ptr(), mask); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + + // Unaligned pointer + let a = Unaligned::new([1.0f32, 2., 3., 4., 5., 6., 7., 8.]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let r = _mm256_maskload_ps(a.as_ptr().cast(), mask); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + + // Only loading first element, so slice can be short. + let a = &[2.0f32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm256_maskload_ps(a.as_ptr(), mask); + let e = _mm256_setr_ps(2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0); + assert_eq_m256(r, e); + + // Only loading last element, so slice can be short. + let a = &[2.0f32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let r = _mm256_maskload_ps(a.as_ptr().wrapping_sub(7), mask); + let e = _mm256_setr_ps(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0); + assert_eq_m256(r, e); + } + test_mm256_maskload_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_maskload_pd() { + let a = &[1.0f64, 2., 3., 4.]; + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let r = _mm256_maskload_pd(a.as_ptr(), mask); + let e = _mm256_setr_pd(0., 2., 0., 4.); + assert_eq_m256d(r, e); + + // Unaligned pointer + let a = Unaligned::new([1.0f64, 2., 3., 4.]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let r = _mm256_maskload_pd(a.as_ptr().cast(), mask); + let e = _mm256_setr_pd(0., 2., 0., 4.); + assert_eq_m256d(r, e); + + // Only loading first element, so slice can be short. + let a = &[2.0f64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let r = _mm256_maskload_pd(a.as_ptr(), mask); + let e = _mm256_setr_pd(2.0, 0.0, 0.0, 0.0); + assert_eq_m256d(r, e); + + // Only loading last element, so slice can be short. + let a = &[2.0f64]; + let mask = _mm256_setr_epi64x(0, 0, 0, !0); + let r = _mm256_maskload_pd(a.as_ptr().wrapping_sub(3), mask); + let e = _mm256_setr_pd(0.0, 0.0, 0.0, 2.0); + assert_eq_m256d(r, e); + } + test_mm256_maskload_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_maskstore_ps() { + let mut r = _mm_set1_ps(0.); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let a = _mm_setr_ps(1., 2., 3., 4.); + _mm_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a); + let e = _mm_setr_ps(0., 2., 0., 4.); + assert_eq_m128(r, e); + + // Unaligned pointer + let mut r = Unaligned::new([0.0f32; 4]); + let mask = _mm_setr_epi32(0, !0, 0, !0); + let a = _mm_setr_ps(1., 2., 3., 4.); + _mm_maskstore_ps(r.as_mut_ptr().cast(), mask, a); + let e = [0., 2., 0., 4.]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0.0f32]; + let mask = _mm_setr_epi32(!0, 0, 0, 0); + let a = _mm_setr_ps(1., 2., 3., 4.); + _mm_maskstore_ps(r.as_mut_ptr(), mask, a); + let e = [1.0f32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
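+ // As with the masked loads, only the enabled lane is written: the base pointer is + // `r.as_mut_ptr().wrapping_sub(3)`, but lane 3 lands exactly on `r[0]`, and the + // masked-off lanes below the allocation are never touched.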
+ let mut r = [0.0f32]; + let mask = _mm_setr_epi32(0, 0, 0, !0); + let a = _mm_setr_ps(1., 2., 3., 4.); + _mm_maskstore_ps(r.as_mut_ptr().wrapping_sub(3), mask, a); + let e = [4.0f32]; + assert_eq!(r, e); + } + test_mm_maskstore_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_maskstore_pd() { + let mut r = _mm_set1_pd(0.); + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_pd(1., 2.); + _mm_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a); + let e = _mm_setr_pd(0., 2.); + assert_eq_m128d(r, e); + + // Unaligned pointer + let mut r = Unaligned::new([0.0f64; 2]); + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_pd(1., 2.); + _mm_maskstore_pd(r.as_mut_ptr().cast(), mask, a); + let e = [0., 2.]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0.0f64]; + let mask = _mm_setr_epi64x(!0, 0); + let a = _mm_setr_pd(1., 2.); + _mm_maskstore_pd(r.as_mut_ptr(), mask, a); + let e = [1.0f64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0.0f64]; + let mask = _mm_setr_epi64x(0, !0); + let a = _mm_setr_pd(1., 2.); + _mm_maskstore_pd(r.as_mut_ptr().wrapping_sub(1), mask, a); + let e = [2.0f64]; + assert_eq!(r, e); + } + test_mm_maskstore_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_maskstore_ps() { + let mut r = _mm256_set1_ps(0.); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + _mm256_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a); + let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); + assert_eq_m256(r, e); + + // Unaligned pointer + let mut r = Unaligned::new([0.0f32; 8]); + let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + _mm256_maskstore_ps(r.as_mut_ptr().cast(), mask, a); + let e = [0., 2., 0., 4., 0., 6., 0., 8.]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0.0f32]; + let mask = _mm256_setr_epi32(!0, 0, 0, 0, 0, 0, 0, 0); + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + _mm256_maskstore_ps(r.as_mut_ptr(), mask, a); + let e = [1.0f32]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. + let mut r = [0.0f32]; + let mask = _mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, !0); + let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); + _mm256_maskstore_ps(r.as_mut_ptr().wrapping_sub(7), mask, a); + let e = [8.0f32]; + assert_eq!(r, e); + } + test_mm256_maskstore_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_maskstore_pd() { + let mut r = _mm256_set1_pd(0.); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let a = _mm256_setr_pd(1., 2., 3., 4.); + _mm256_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a); + let e = _mm256_setr_pd(0., 2., 0., 4.); + assert_eq_m256d(r, e); + + // Unaligned pointer + let mut r = Unaligned::new([0.0f64; 4]); + let mask = _mm256_setr_epi64x(0, !0, 0, !0); + let a = _mm256_setr_pd(1., 2., 3., 4.); + _mm256_maskstore_pd(r.as_mut_ptr().cast(), mask, a); + let e = [0., 2., 0., 4.]; + assert_eq!(r.read(), e); + + // Only storing first element, so slice can be short. + let mut r = [0.0f64]; + let mask = _mm256_setr_epi64x(!0, 0, 0, 0); + let a = _mm256_setr_pd(1., 2., 3., 4.); + _mm256_maskstore_pd(r.as_mut_ptr(), mask, a); + let e = [1.0f64]; + assert_eq!(r, e); + + // Only storing last element, so slice can be short. 
+ let mut r = [0.0f64]; + let mask = _mm256_setr_epi64x(0, 0, 0, !0); + let a = _mm256_setr_pd(1., 2., 3., 4.); + _mm256_maskstore_pd(r.as_mut_ptr().wrapping_sub(3), mask, a); + let e = [4.0f64]; + assert_eq!(r, e); + } + test_mm256_maskstore_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_lddqu_si256() { + #[rustfmt::skip] + let a = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + let p = &a as *const _; + let r = _mm256_lddqu_si256(p); + #[rustfmt::skip] + let e = _mm256_setr_epi8( + 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, + ); + assert_eq_m256i(r, e); + } + test_mm256_lddqu_si256(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testz_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testz_si256(a, b); + assert_eq!(r, 0); + let b = _mm256_set1_epi64x(0); + let r = _mm256_testz_si256(a, b); + assert_eq!(r, 1); + } + test_mm256_testz_si256(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testc_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testc_si256(a, b); + assert_eq!(r, 0); + let b = _mm256_set1_epi64x(0); + let r = _mm256_testc_si256(a, b); + assert_eq!(r, 1); + } + test_mm256_testc_si256(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testnzc_si256() { + let a = _mm256_setr_epi64x(1, 2, 3, 4); + let b = _mm256_setr_epi64x(5, 6, 7, 8); + let r = _mm256_testnzc_si256(a, b); + assert_eq!(r, 1); + let a = _mm256_setr_epi64x(0, 0, 0, 0); + let b = _mm256_setr_epi64x(0, 0, 0, 0); + let r = _mm256_testnzc_si256(a, b); + assert_eq!(r, 0); + } + test_mm256_testnzc_si256(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testz_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testz_pd(a, b); + assert_eq!(r, 1); + let a = _mm256_set1_pd(-1.); + let r = _mm256_testz_pd(a, a); + assert_eq!(r, 0); + } + test_mm256_testz_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testc_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testc_pd(a, b); + assert_eq!(r, 1); + let a = _mm256_set1_pd(1.); + let b = _mm256_set1_pd(-1.); + let r = _mm256_testc_pd(a, b); + assert_eq!(r, 0); + } + test_mm256_testc_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testnzc_pd() { + let a = _mm256_setr_pd(1., 2., 3., 4.); + let b = _mm256_setr_pd(5., 6., 7., 8.); + let r = _mm256_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = _mm256_setr_pd(1., -1., -1., -1.); + let b = _mm256_setr_pd(-1., -1., 1., 1.); + let r = _mm256_testnzc_pd(a, b); + assert_eq!(r, 1); + } + test_mm256_testnzc_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_testz_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testz_pd(a, b); + assert_eq!(r, 1); + let a = _mm_set1_pd(-1.); + let r = _mm_testz_pd(a, a); + assert_eq!(r, 0); + } + test_mm_testz_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_testc_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testc_pd(a, b); + assert_eq!(r, 1); + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(-1.); + let r = _mm_testc_pd(a, b); + assert_eq!(r, 0); + } + test_mm_testc_pd(); + + 
#[target_feature(enable = "avx")] + unsafe fn test_mm_testnzc_pd() { + let a = _mm_setr_pd(1., 2.); + let b = _mm_setr_pd(5., 6.); + let r = _mm_testnzc_pd(a, b); + assert_eq!(r, 0); + let a = _mm_setr_pd(1., -1.); + let b = _mm_setr_pd(-1., -1.); + let r = _mm_testnzc_pd(a, b); + assert_eq!(r, 1); + } + test_mm_testnzc_pd(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testz_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testz_ps(a, a); + assert_eq!(r, 1); + let a = _mm256_set1_ps(-1.); + let r = _mm256_testz_ps(a, a); + assert_eq!(r, 0); + } + test_mm256_testz_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testc_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testc_ps(a, a); + assert_eq!(r, 1); + let b = _mm256_set1_ps(-1.); + let r = _mm256_testc_ps(a, b); + assert_eq!(r, 0); + } + test_mm256_testc_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm256_testnzc_ps() { + let a = _mm256_set1_ps(1.); + let r = _mm256_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.); + let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.); + let r = _mm256_testnzc_ps(a, b); + assert_eq!(r, 1); + } + test_mm256_testnzc_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_testz_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testz_ps(a, a); + assert_eq!(r, 1); + let a = _mm_set1_ps(-1.); + let r = _mm_testz_ps(a, a); + assert_eq!(r, 0); + } + test_mm_testz_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_testc_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testc_ps(a, a); + assert_eq!(r, 1); + let b = _mm_set1_ps(-1.); + let r = _mm_testc_ps(a, b); + assert_eq!(r, 0); + } + test_mm_testc_ps(); + + #[target_feature(enable = "avx")] + unsafe fn test_mm_testnzc_ps() { + let a = _mm_set1_ps(1.); + let r = _mm_testnzc_ps(a, a); + assert_eq!(r, 0); + let a = _mm_setr_ps(1., -1., -1., -1.); + let b = _mm_setr_ps(-1., -1., 1., 1.); + let r = _mm_testnzc_ps(a, b); + assert_eq!(r, 1); + } + test_mm_testnzc_ps(); +} + +#[target_feature(enable = "sse2")] +unsafe fn _mm_setr_epi64x(a: i64, b: i64) -> __m128i { + _mm_set_epi64x(b, a) +} + +#[track_caller] +#[target_feature(enable = "sse")] +unsafe fn assert_eq_m128(a: __m128, b: __m128) { + let r = _mm_cmpeq_ps(a, b); + if _mm_movemask_ps(r) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "sse2")] +unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) { + if _mm_movemask_pd(_mm_cmpeq_pd(a, b)) != 0b11 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "sse2")] +unsafe fn assert_eq_m128i(a: __m128i, b: __m128i) { + assert_eq!(transmute::<_, [u64; 2]>(a), transmute::<_, [u64; 2]>(b)) +} + +#[track_caller] +#[target_feature(enable = "avx")] +unsafe fn assert_eq_m256(a: __m256, b: __m256) { + let cmp = _mm256_cmp_ps::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_ps(cmp) != 0b11111111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "avx")] +unsafe fn assert_eq_m256d(a: __m256d, b: __m256d) { + let cmp = _mm256_cmp_pd::<_CMP_EQ_OQ>(a, b); + if _mm256_movemask_pd(cmp) != 0b1111 { + panic!("{:?} != {:?}", a, b); + } +} + +#[track_caller] +#[target_feature(enable = "avx")] +unsafe fn assert_eq_m256i(a: __m256i, b: __m256i) { + assert_eq!(transmute::<_, [u64; 4]>(a), transmute::<_, [u64; 4]>(b)) +} + +/// Stores `T` at an unaligned address +struct Unaligned<T> { + buf: Vec<u8>, + offset: bool, + _marker: std::marker::PhantomData<T>, +} +
+impl<T> Unaligned<T> { + fn new(value: T) -> Self { + // Allocate extra byte for unalignment headroom + let len = std::mem::size_of::<T>(); + let mut buf = Vec::<u8>::with_capacity(len + 1); + // Force the address to be a non-multiple of 2, so it is as unaligned as it can get. + let offset = (buf.as_ptr() as usize % 2) == 0; + let value_ptr: *const T = &value; + unsafe { + buf.as_mut_ptr().add(offset.into()).copy_from_nonoverlapping(value_ptr.cast(), len); + } + Self { buf, offset, _marker: std::marker::PhantomData } + } + + fn as_ptr(&self) -> *const T { + unsafe { self.buf.as_ptr().add(self.offset.into()).cast() } + } + + fn as_mut_ptr(&mut self) -> *mut T { + unsafe { self.buf.as_mut_ptr().add(self.offset.into()).cast() } + } + + fn read(&self) -> T { + unsafe { self.as_ptr().read_unaligned() } + } +}
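A short sketch of how the `Unaligned` helper above is meant to be used (standalone example with made-up values, not part of the test file):

```rust
fn unaligned_demo() {
    // `Unaligned::new` copies the value to an odd address, so the resulting
    // pointer is guaranteed to be misaligned for `f32` (alignment 4).
    let mut u = Unaligned::new([1.0f32, 2.0, 3.0, 4.0]);
    let first = u.as_mut_ptr() as *mut f32;
    unsafe { first.write_unaligned(9.0) };
    assert_eq!(u.read(), [9.0, 2.0, 3.0, 4.0]);
}
```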