From b5f9d43ff1139fb5dbd1a919dbf63e48c2c56012 Mon Sep 17 00:00:00 2001
From: Thom Chiovoloni <thom@shift.click>
Date: Thu, 21 Jul 2022 14:53:07 -0700
Subject: [PATCH 01/70] rust-lang/portable-simd#289: Strengthen warnings about
 relying on Mask layout

This makes it more clear that you can't rely on the layout of these,
which seems worth doing given that the names vaguely suggest that you can
(and the docs only clarify that you can't on Mask but not the maskNxM aliases).
---
 crates/core_simd/src/masks.rs | 76 ++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index c36c336d8a216..9953502173573 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -83,7 +83,9 @@ impl_element! { isize }
 ///
 /// Masks represent boolean inclusion/exclusion on a per-lane basis.
 ///
-/// The layout of this type is unspecified.
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[T; LANES]`.
 #[repr(transparent)]
 pub struct Mask<T, const LANES: usize>(mask_impl::Mask<T, LANES>)
 where
@@ -521,57 +523,129 @@ where
 }
 
 /// A mask for SIMD vectors with eight elements of 8 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i8; 8]`.
 pub type mask8x8 = Mask<i8, 8>;
 
 /// A mask for SIMD vectors with 16 elements of 8 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i8; 16]`.
 pub type mask8x16 = Mask<i8, 16>;
 
 /// A mask for SIMD vectors with 32 elements of 8 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i8; 32]`.
 pub type mask8x32 = Mask<i8, 32>;
 
 /// A mask for SIMD vectors with 64 elements of 8 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i8; 64]`.
 pub type mask8x64 = Mask<i8, 64>;
 
 /// A mask for SIMD vectors with four elements of 16 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i16; 4]`.
 pub type mask16x4 = Mask<i16, 4>;
 
 /// A mask for SIMD vectors with eight elements of 16 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i16; 8]`.
 pub type mask16x8 = Mask<i16, 8>;
 
 /// A mask for SIMD vectors with 16 elements of 16 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i16; 16]`.
 pub type mask16x16 = Mask<i16, 16>;
 
 /// A mask for SIMD vectors with 32 elements of 16 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i16; 32]`.
 pub type mask16x32 = Mask<i16, 32>;
 
 /// A mask for SIMD vectors with two elements of 32 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i32; 2]`.
 pub type mask32x2 = Mask<i32, 2>;
 
 /// A mask for SIMD vectors with four elements of 32 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i32; 4]`.
 pub type mask32x4 = Mask<i32, 4>;
 
 /// A mask for SIMD vectors with eight elements of 32 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i32; 8]`.
 pub type mask32x8 = Mask<i32, 8>;
 
 /// A mask for SIMD vectors with 16 elements of 32 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i32; 16]`.
 pub type mask32x16 = Mask<i32, 16>;
 
 /// A mask for SIMD vectors with two elements of 64 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i64; 2]`.
 pub type mask64x2 = Mask<i64, 2>;
 
 /// A mask for SIMD vectors with four elements of 64 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i64; 4]`.
 pub type mask64x4 = Mask<i64, 4>;
 
 /// A mask for SIMD vectors with eight elements of 64 bits.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[i64; 8]`.
 pub type mask64x8 = Mask<i64, 8>;
 
 /// A mask for SIMD vectors with two elements of pointer width.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[isize; 2]`.
 pub type masksizex2 = Mask<isize, 2>;
 
 /// A mask for SIMD vectors with four elements of pointer width.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[isize; 4]`.
 pub type masksizex4 = Mask<isize, 4>;
 
 /// A mask for SIMD vectors with eight elements of pointer width.
+///
+/// The layout of this type is unspecified, and may change between platforms
+/// and/or Rust versions, and code should not assume that it is equivalent to
+/// `[isize; 8]`.
 pub type masksizex8 = Mask<isize, 8>;
 
 macro_rules! impl_from {

From ddede9fb9b5bd3a7cce71775ac8ce7bd30fdf87a Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Fri, 22 Jul 2022 09:39:23 -0400
Subject: [PATCH 02/70] make some Miri backtraces more pretty

---
 crates/core_simd/src/vector.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 8661be938d5bb..e8e8f6899d349 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -239,6 +239,7 @@ where
     ///
     /// [cast]: Simd::cast
     #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
     where
         T: core::convert::FloatToInt<I>,
@@ -349,6 +350,7 @@ where
     /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
     #[must_use]
     #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn gather_select_unchecked(
         slice: &[T],
         enable: Mask<isize, LANES>,
@@ -444,6 +446,7 @@ where
     /// ```
     /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
     #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn scatter_select_unchecked(
         self,
         slice: &mut [T],

From 3183afb6b5fcbf688bb90cf1db3f635406f868dc Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 29 Jul 2022 11:57:05 -0400
Subject: [PATCH 03/70] Fix interleave/deinterleave for vectors with only one
 lane

---
 crates/core_simd/src/swizzle.rs   | 12 ++++++++++--
 crates/core_simd/tests/swizzle.rs | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 22999d24950f8..02567252a6375 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -325,7 +325,11 @@ where
             const INDEX: [Which; LANES] = hi::<LANES>();
         }
 
-        (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
+        if LANES == 1 {
+            (self, other)
+        } else {
+            (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
+        }
     }
 
     /// Deinterleave two vectors.
@@ -380,6 +384,10 @@ where
             const INDEX: [Which; LANES] = odd::<LANES>();
         }
 
-        (Even::swizzle2(self, other), Odd::swizzle2(self, other))
+        if LANES == 1 {
+            (self, other)
+        } else {
+            (Even::swizzle2(self, other), Odd::swizzle2(self, other))
+        }
     }
 }
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 51c63611aba6b..33a7becb42128 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -60,3 +60,17 @@ fn interleave() {
     assert_eq!(even, a);
     assert_eq!(odd, b);
 }
+
+// portable-simd#298
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn interleave_one() {
+    let a = Simd::from_array([0]);
+    let b = Simd::from_array([1]);
+    let (lo, hi) = a.interleave(b);
+    assert_eq!(lo.to_array(), [0]);
+    assert_eq!(hi.to_array(), [1]);
+    let (even, odd) = lo.deinterleave(hi);
+    assert_eq!(even, a);
+    assert_eq!(odd, b);
+}

From 8742a86b1da28c1bb7f0e7f663becde9b0c5a73e Mon Sep 17 00:00:00 2001
From: Jacob Lifshay <programmerjake@gmail.com>
Date: Fri, 29 Jul 2022 16:12:24 -0700
Subject: [PATCH 04/70] add all_lane_counts feature to enable non-power-of-2
 lane counts <= 64

---
 .github/workflows/ci.yml           |   4 +
 crates/core_simd/Cargo.toml        |   1 +
 crates/core_simd/src/lane_count.rs |  36 ++--
 crates/test_helpers/Cargo.toml     |   3 +
 crates/test_helpers/src/lib.rs     | 279 ++++++++++++++++++++---------
 5 files changed, 221 insertions(+), 102 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d50dfa1be4cba..acd47a3da72b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -241,6 +241,10 @@ jobs:
           - "--features std"
           - "--features generic_const_exprs"
           - "--features std --features generic_const_exprs"
+          - "--features all_lane_counts"
+          - "--features all_lane_counts --features std"
+          - "--features all_lane_counts --features generic_const_exprs"
+          - "--features all_lane_counts --features std --features generic_const_exprs"
 
     steps:
       - uses: actions/checkout@v2
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index 8a29cf15696ed..7435e24edd30f 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -13,6 +13,7 @@ default = ["as_crate"]
 as_crate = []
 std = []
 generic_const_exprs = []
+all_lane_counts = []
 
 [target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen]
 version = "0.2"
diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index 63723e2ec13c4..2b91eb9e80047 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -23,24 +23,20 @@ pub trait SupportedLaneCount: Sealed {
 
 impl<const LANES: usize> Sealed for LaneCount<LANES> {}
 
-impl SupportedLaneCount for LaneCount<1> {
-    type BitMask = [u8; 1];
-}
-impl SupportedLaneCount for LaneCount<2> {
-    type BitMask = [u8; 1];
-}
-impl SupportedLaneCount for LaneCount<4> {
-    type BitMask = [u8; 1];
-}
-impl SupportedLaneCount for LaneCount<8> {
-    type BitMask = [u8; 1];
-}
-impl SupportedLaneCount for LaneCount<16> {
-    type BitMask = [u8; 2];
-}
-impl SupportedLaneCount for LaneCount<32> {
-    type BitMask = [u8; 4];
-}
-impl SupportedLaneCount for LaneCount<64> {
-    type BitMask = [u8; 8];
+macro_rules! supported_lane_count {
+    ($($lanes:literal),+) => {
+        $(
+            impl SupportedLaneCount for LaneCount<$lanes> {
+                type BitMask = [u8; ($lanes + 7) / 8];
+            }
+        )+
+    };
 }
+
+supported_lane_count!(1, 2, 4, 8, 16, 32, 64);
+#[cfg(feature = "all_lane_counts")]
+supported_lane_count!(
+    3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+    31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+    56, 57, 58, 59, 60, 61, 62, 63
+);
diff --git a/crates/test_helpers/Cargo.toml b/crates/test_helpers/Cargo.toml
index a04b0961d7f70..1d2bc8b519aa6 100644
--- a/crates/test_helpers/Cargo.toml
+++ b/crates/test_helpers/Cargo.toml
@@ -8,3 +8,6 @@ publish = false
 version = "0.10"
 default-features = false
 features = ["alloc"]
+
+[features]
+all_lane_counts = []
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 141bee18a9a40..650eadd12bfdf 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -333,6 +333,39 @@ pub fn test_ternary_elementwise<
     );
 }
 
+#[doc(hidden)]
+#[macro_export]
+macro_rules! test_lanes_helper {
+    ($($(#[$meta:meta])* $fn_name:ident $lanes:literal;)+) => {
+        $(
+            #[test]
+            $(#[$meta])*
+            fn $fn_name() {
+                implementation::<$lanes>();
+            }
+        )+
+    };
+    (
+        $(#[$meta:meta])+;
+        $($(#[$meta_before:meta])+ $fn_name_before:ident $lanes_before:literal;)*
+        $fn_name:ident $lanes:literal;
+        $($fn_name_rest:ident $lanes_rest:literal;)*
+    ) => {
+        $crate::test_lanes_helper!(
+            $(#[$meta])+;
+            $($(#[$meta_before])+ $fn_name_before $lanes_before;)*
+            $(#[$meta])+ $fn_name $lanes;
+            $($fn_name_rest $lanes_rest;)*
+        );
+    };
+    (
+        $(#[$meta_ignored:meta])+;
+        $($(#[$meta:meta])+ $fn_name:ident $lanes:literal;)+
+    ) => {
+        $crate::test_lanes_helper!($($(#[$meta])+ $fn_name $lanes;)+);
+    };
+}
+
 /// Expand a const-generic test into separate tests for each possible lane count.
 #[macro_export]
 macro_rules! test_lanes {
@@ -351,51 +384,90 @@ macro_rules! test_lanes {
                 #[cfg(target_arch = "wasm32")]
                 wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
 
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
-                fn lanes_1() {
-                    implementation::<1>();
-                }
-
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
-                fn lanes_2() {
-                    implementation::<2>();
-                }
+                $crate::test_lanes_helper!(
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_1 1;
+                    lanes_2 2;
+                    lanes_4 4;
+                );
 
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
-                fn lanes_4() {
-                    implementation::<4>();
-                }
-
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                fn lanes_8() {
-                    implementation::<8>();
-                }
-
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                fn lanes_16() {
-                    implementation::<16>();
-                }
-
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                $crate::test_lanes_helper!(
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_8 8;
+                    lanes_16 16;
+                    lanes_32 32;
+                    lanes_64 64;
+                );
+
+                #[cfg(feature = "all_lane_counts")]
+                $crate::test_lanes_helper!(
+                    // test some odd and even non-power-of-2 lengths on miri
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_3 3;
+                    lanes_5 5;
+                    lanes_6 6;
+                );
+
+                #[cfg(feature = "all_lane_counts")]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                fn lanes_32() {
-                    implementation::<32>();
-                }
-
-                #[test]
-                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                fn lanes_64() {
-                    implementation::<64>();
-                }
+                $crate::test_lanes_helper!(
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_7 7;
+                    lanes_9 9;
+                    lanes_10 10;
+                    lanes_11 11;
+                    lanes_12 12;
+                    lanes_13 13;
+                    lanes_14 14;
+                    lanes_15 15;
+                    lanes_17 17;
+                    lanes_18 18;
+                    lanes_19 19;
+                    lanes_20 20;
+                    lanes_21 21;
+                    lanes_22 22;
+                    lanes_23 23;
+                    lanes_24 24;
+                    lanes_25 25;
+                    lanes_26 26;
+                    lanes_27 27;
+                    lanes_28 28;
+                    lanes_29 29;
+                    lanes_30 30;
+                    lanes_31 31;
+                    lanes_33 33;
+                    lanes_34 34;
+                    lanes_35 35;
+                    lanes_36 36;
+                    lanes_37 37;
+                    lanes_38 38;
+                    lanes_39 39;
+                    lanes_40 40;
+                    lanes_41 41;
+                    lanes_42 42;
+                    lanes_43 43;
+                    lanes_44 44;
+                    lanes_45 45;
+                    lanes_46 46;
+                    lanes_47 47;
+                    lanes_48 48;
+                    lanes_49 49;
+                    lanes_50 50;
+                    lanes_51 51;
+                    lanes_52 52;
+                    lanes_53 53;
+                    lanes_54 54;
+                    lanes_55 55;
+                    lanes_56 56;
+                    lanes_57 57;
+                    lanes_58 58;
+                    lanes_59 59;
+                    lanes_60 60;
+                    lanes_61 61;
+                    lanes_62 62;
+                    lanes_63 63;
+                );
             }
         )*
     }
@@ -416,47 +488,90 @@ macro_rules! test_lanes_panic {
                     core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount,
                 $body
 
-                #[test]
-                #[should_panic]
-                fn lanes_1() {
-                    implementation::<1>();
-                }
-
-                #[test]
-                #[should_panic]
-                fn lanes_2() {
-                    implementation::<2>();
-                }
-
-                #[test]
-                #[should_panic]
-                fn lanes_4() {
-                    implementation::<4>();
-                }
-
-                #[test]
-                #[should_panic]
-                fn lanes_8() {
-                    implementation::<8>();
-                }
-
-                #[test]
-                #[should_panic]
-                fn lanes_16() {
-                    implementation::<16>();
-                }
-
-                #[test]
-                #[should_panic]
-                fn lanes_32() {
-                    implementation::<32>();
-                }
+                $crate::test_lanes_helper!(
+                    #[should_panic];
+                    lanes_1 1;
+                    lanes_2 2;
+                    lanes_4 4;
+                );
 
-                #[test]
-                #[should_panic]
-                fn lanes_64() {
-                    implementation::<64>();
-                }
+                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
+                $crate::test_lanes_helper!(
+                    #[should_panic];
+                    lanes_8 8;
+                    lanes_16 16;
+                    lanes_32 32;
+                    lanes_64 64;
+                );
+
+                #[cfg(feature = "all_lane_counts")]
+                $crate::test_lanes_helper!(
+                    // test some odd and even non-power-of-2 lengths on miri
+                    #[should_panic];
+                    lanes_3 3;
+                    lanes_5 5;
+                    lanes_6 6;
+                );
+
+                #[cfg(feature = "all_lane_counts")]
+                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
+                $crate::test_lanes_helper!(
+                    #[should_panic];
+                    lanes_7 7;
+                    lanes_9 9;
+                    lanes_10 10;
+                    lanes_11 11;
+                    lanes_12 12;
+                    lanes_13 13;
+                    lanes_14 14;
+                    lanes_15 15;
+                    lanes_17 17;
+                    lanes_18 18;
+                    lanes_19 19;
+                    lanes_20 20;
+                    lanes_21 21;
+                    lanes_22 22;
+                    lanes_23 23;
+                    lanes_24 24;
+                    lanes_25 25;
+                    lanes_26 26;
+                    lanes_27 27;
+                    lanes_28 28;
+                    lanes_29 29;
+                    lanes_30 30;
+                    lanes_31 31;
+                    lanes_33 33;
+                    lanes_34 34;
+                    lanes_35 35;
+                    lanes_36 36;
+                    lanes_37 37;
+                    lanes_38 38;
+                    lanes_39 39;
+                    lanes_40 40;
+                    lanes_41 41;
+                    lanes_42 42;
+                    lanes_43 43;
+                    lanes_44 44;
+                    lanes_45 45;
+                    lanes_46 46;
+                    lanes_47 47;
+                    lanes_48 48;
+                    lanes_49 49;
+                    lanes_50 50;
+                    lanes_51 51;
+                    lanes_52 52;
+                    lanes_53 53;
+                    lanes_54 54;
+                    lanes_55 55;
+                    lanes_56 56;
+                    lanes_57 57;
+                    lanes_58 58;
+                    lanes_59 59;
+                    lanes_60 60;
+                    lanes_61 61;
+                    lanes_62 62;
+                    lanes_63 63;
+                );
             }
         )*
     }

From 6bf512823548b4fdbb7127489e883bff8a98b33f Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 1 Aug 2022 00:34:58 -0400
Subject: [PATCH 05/70] Simplify interleave/deinterleave and fix for odd-length
 vectors.

---
 crates/core_simd/src/swizzle.rs | 74 ++++++++++++---------------------
 1 file changed, 26 insertions(+), 48 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 02567252a6375..0b66b8a0ae03b 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -265,13 +265,10 @@ where
 
     /// Interleave two vectors.
     ///
-    /// Produces two vectors with lanes taken alternately from `self` and `other`.
+    /// The resulting vectors contain lanes taken alternately from `self` and `other`, first
+    /// filling the first result, and then the second.
     ///
-    /// The first result contains the first `LANES / 2` lanes from `self` and `other`,
-    /// alternating, starting with the first lane of `self`.
-    ///
-    /// The second result contains the last `LANES / 2` lanes from `self` and `other`,
-    /// alternating, starting with the lane `LANES / 2` from the start of `self`.
+    /// The reverse of this operation is [`Simd::deinterleave`].
     ///
     /// ```
     /// #![feature(portable_simd)]
@@ -285,29 +282,17 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn interleave(self, other: Self) -> (Self, Self) {
-        const fn lo<const LANES: usize>() -> [Which; LANES] {
-            let mut idx = [Which::First(0); LANES];
-            let mut i = 0;
-            while i < LANES {
-                let offset = i / 2;
-                idx[i] = if i % 2 == 0 {
-                    Which::First(offset)
-                } else {
-                    Which::Second(offset)
-                };
-                i += 1;
-            }
-            idx
-        }
-        const fn hi<const LANES: usize>() -> [Which; LANES] {
+        const fn interleave<const LANES: usize>(high: bool) -> [Which; LANES] {
             let mut idx = [Which::First(0); LANES];
             let mut i = 0;
             while i < LANES {
-                let offset = (LANES + i) / 2;
-                idx[i] = if i % 2 == 0 {
-                    Which::First(offset)
+                // Treat the source as a concatenated vector
+                let dst_index = if high { i + LANES } else { i };
+                let src_index = dst_index / 2 + (dst_index % 2) * LANES;
+                idx[i] = if src_index < LANES {
+                    Which::First(src_index)
                 } else {
-                    Which::Second(offset)
+                    Which::Second(src_index % LANES)
                 };
                 i += 1;
             }
@@ -318,18 +303,14 @@ where
         struct Hi;
 
         impl<const LANES: usize> Swizzle2<LANES, LANES> for Lo {
-            const INDEX: [Which; LANES] = lo::<LANES>();
+            const INDEX: [Which; LANES] = interleave::<LANES>(false);
         }
 
         impl<const LANES: usize> Swizzle2<LANES, LANES> for Hi {
-            const INDEX: [Which; LANES] = hi::<LANES>();
+            const INDEX: [Which; LANES] = interleave::<LANES>(true);
         }
 
-        if LANES == 1 {
-            (self, other)
-        } else {
-            (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
-        }
+        (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
     }
 
     /// Deinterleave two vectors.
@@ -340,6 +321,8 @@ where
     /// The second result takes every other lane of `self` and then `other`, starting with
     /// the second lane.
     ///
+    /// The reverse of this operation is [`Simd::interleave`].
+    ///
     /// ```
     /// #![feature(portable_simd)]
     /// # use core::simd::Simd;
@@ -352,22 +335,17 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn deinterleave(self, other: Self) -> (Self, Self) {
-        const fn even<const LANES: usize>() -> [Which; LANES] {
-            let mut idx = [Which::First(0); LANES];
-            let mut i = 0;
-            while i < LANES / 2 {
-                idx[i] = Which::First(2 * i);
-                idx[i + LANES / 2] = Which::Second(2 * i);
-                i += 1;
-            }
-            idx
-        }
-        const fn odd<const LANES: usize>() -> [Which; LANES] {
+        const fn deinterleave<const LANES: usize>(second: bool) -> [Which; LANES] {
             let mut idx = [Which::First(0); LANES];
             let mut i = 0;
-            while i < LANES / 2 {
-                idx[i] = Which::First(2 * i + 1);
-                idx[i + LANES / 2] = Which::Second(2 * i + 1);
+            while i < LANES {
+                // Treat the source as a concatenated vector
+                let src_index = i * 2 + if second { 1 } else { 0 };
+                idx[i] = if src_index < LANES {
+                    Which::First(src_index)
+                } else {
+                    Which::Second(src_index % LANES)
+                };
                 i += 1;
             }
             idx
@@ -377,11 +355,11 @@ where
         struct Odd;
 
         impl<const LANES: usize> Swizzle2<LANES, LANES> for Even {
-            const INDEX: [Which; LANES] = even::<LANES>();
+            const INDEX: [Which; LANES] = deinterleave::<LANES>(false);
         }
 
         impl<const LANES: usize> Swizzle2<LANES, LANES> for Odd {
-            const INDEX: [Which; LANES] = odd::<LANES>();
+            const INDEX: [Which; LANES] = deinterleave::<LANES>(true);
         }
 
         if LANES == 1 {

From c739af3908613ba3f611dce115525e2f2f91bfca Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 1 Aug 2022 00:38:29 -0400
Subject: [PATCH 06/70] Hide rustc unstable feature from docs

---
 crates/core_simd/src/swizzle.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 0b66b8a0ae03b..72cce7aeb048f 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -271,7 +271,7 @@ where
     /// The reverse of this operation is [`Simd::deinterleave`].
     ///
     /// ```
-    /// #![feature(portable_simd)]
+    /// # #![feature(portable_simd)]
     /// # use core::simd::Simd;
     /// let a = Simd::from_array([0, 1, 2, 3]);
     /// let b = Simd::from_array([4, 5, 6, 7]);
@@ -324,7 +324,7 @@ where
     /// The reverse of this operation is [`Simd::interleave`].
     ///
     /// ```
-    /// #![feature(portable_simd)]
+    /// # #![feature(portable_simd)]
     /// # use core::simd::Simd;
     /// let a = Simd::from_array([0, 4, 1, 5]);
     /// let b = Simd::from_array([2, 6, 3, 7]);

From d030301161a372b545e5d8c1784cba113e5a8ebd Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 1 Aug 2022 19:52:35 -0400
Subject: [PATCH 07/70] Remove special case for length-1 vectors

---
 crates/core_simd/src/swizzle.rs | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 72cce7aeb048f..61cc604e4cd48 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -362,10 +362,6 @@ where
             const INDEX: [Which; LANES] = deinterleave::<LANES>(true);
         }
 
-        if LANES == 1 {
-            (self, other)
-        } else {
-            (Even::swizzle2(self, other), Odd::swizzle2(self, other))
-        }
+        (Even::swizzle2(self, other), Odd::swizzle2(self, other))
     }
 }

From 5f7066430b9239cfe8243ddba4c29416f002ae6b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 1 Aug 2022 19:57:41 -0400
Subject: [PATCH 08/70] Simplify expression

---
 crates/core_simd/src/swizzle.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 61cc604e4cd48..68f20516cf5bc 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -340,7 +340,7 @@ where
             let mut i = 0;
             while i < LANES {
                 // Treat the source as a concatenated vector
-                let src_index = i * 2 + if second { 1 } else { 0 };
+                let src_index = i * 2 + second as usize;
                 idx[i] = if src_index < LANES {
                     Which::First(src_index)
                 } else {

From 2c5ebfb6a26d384bc21db6796095890c1f13f19c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?=
 <miguelraz@ciencias.unam.mx>
Date: Fri, 30 Sep 2022 20:25:34 -0500
Subject: [PATCH 09/70] add feature flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

couldn't run the `hellosimd` without it 🤷🏾
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index db0af2da60641..791051f69aeba 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ core_simd = { git = "https://github.com/rust-lang/portable-simd" }
 
 and finally write this in `src/main.rs`:
 ```rust
+#![feature(portable_simd)]
 use core_simd::*;
 fn main() {
     let a = f32x4::splat(10.0);

From 4491309cb01cc917ef455c41b0dcf9cf5900aa35 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 16 Oct 2022 13:31:42 -0400
Subject: [PATCH 10/70] Mark more mask functions inline

---
 crates/core_simd/src/masks.rs            | 9 +++++++++
 crates/core_simd/src/masks/bitmask.rs    | 4 ++++
 crates/core_simd/src/masks/full_masks.rs | 4 ++++
 crates/core_simd/src/masks/to_bitmask.rs | 4 ++++
 4 files changed, 21 insertions(+)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index 9953502173573..7fd50fed4478c 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -55,6 +55,7 @@ pub unsafe trait MaskElement: SimdElement + Sealed {}
 macro_rules! impl_element {
     { $ty:ty } => {
         impl Sealed for $ty {
+            #[inline]
             fn valid<const LANES: usize>(value: Simd<Self, LANES>) -> bool
             where
                 LaneCount<LANES>: SupportedLaneCount,
@@ -62,6 +63,7 @@ macro_rules! impl_element {
                 (value.simd_eq(Simd::splat(0 as _)) | value.simd_eq(Simd::splat(-1 as _))).all()
             }
 
+            #[inline]
             fn eq(self, other: Self) -> bool { self == other }
 
             const TRUE: Self = -1;
@@ -104,6 +106,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn clone(&self) -> Self {
         *self
     }
@@ -115,11 +118,13 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     /// Construct a mask by setting all lanes to the given value.
+    #[inline]
     pub fn splat(value: bool) -> Self {
         Self(mask_impl::Mask::splat(value))
     }
 
     /// Converts an array of bools to a SIMD mask.
+    #[inline]
     pub fn from_array(array: [bool; LANES]) -> Self {
         // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
         //     true:    0b_0000_0001
@@ -136,6 +141,7 @@ where
     }
 
     /// Converts a SIMD mask to an array of bools.
+    #[inline]
     pub fn to_array(self) -> [bool; LANES] {
         // This follows mostly the same logic as from_array.
         // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
@@ -263,6 +269,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn from(array: [bool; LANES]) -> Self {
         Self::from_array(array)
     }
@@ -273,6 +280,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn from(vector: Mask<T, LANES>) -> Self {
         vector.to_array()
     }
@@ -655,6 +663,7 @@ macro_rules! impl_from {
         where
             LaneCount<LANES>: SupportedLaneCount,
         {
+            #[inline]
             fn from(value: Mask<$from, LANES>) -> Self {
                 value.cast()
             }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 365ecc0a3251e..20465ba9b07ec 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -26,6 +26,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn clone(&self) -> Self {
         *self
     }
@@ -36,6 +37,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn eq(&self, other: &Self) -> bool {
         self.0.as_ref() == other.0.as_ref()
     }
@@ -46,6 +48,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
         self.0.as_ref().partial_cmp(other.0.as_ref())
     }
@@ -63,6 +66,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn cmp(&self, other: &Self) -> core::cmp::Ordering {
         self.0.as_ref().cmp(other.0.as_ref())
     }
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index adf0fcbeae2bd..bcedd2df2253a 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -37,6 +37,7 @@ where
     T: MaskElement + PartialEq,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn eq(&self, other: &Self) -> bool {
         self.0.eq(&other.0)
     }
@@ -47,6 +48,7 @@ where
     T: MaskElement + PartialOrd,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
         self.0.partial_cmp(&other.0)
     }
@@ -64,6 +66,7 @@ where
     T: MaskElement + Ord,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn cmp(&self, other: &Self) -> core::cmp::Ordering {
         self.0.cmp(&other.0)
     }
@@ -262,6 +265,7 @@ where
     T: MaskElement,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn from(value: Mask<T, LANES>) -> Self {
         value.0
     }
diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
index 65d3ce9be65ec..46914dfe0d9b4 100644
--- a/crates/core_simd/src/masks/to_bitmask.rs
+++ b/crates/core_simd/src/masks/to_bitmask.rs
@@ -48,10 +48,12 @@ macro_rules! impl_integer_intrinsic {
         impl<T: MaskElement> ToBitMask for Mask<T, $lanes> {
             type BitMask = $int;
 
+            #[inline]
             fn to_bitmask(self) -> $int {
                 self.0.to_bitmask_integer()
             }
 
+            #[inline]
             fn from_bitmask(bitmask: $int) -> Self {
                 Self(mask_impl::Mask::from_bitmask_integer(bitmask))
             }
@@ -83,10 +85,12 @@ where
 {
     const BYTES: usize = bitmask_len(LANES);
 
+    #[inline]
     fn to_bitmask_array(self) -> [u8; Self::BYTES] {
         self.0.to_bitmask_array()
     }
 
+    #[inline]
     fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self {
         Mask(mask_impl::Mask::from_bitmask_array(bitmask))
     }

From ee9a23facb7871218f5f0bf596f77e27586187a9 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 16 Oct 2022 13:52:08 -0400
Subject: [PATCH 11/70] Update readme

---
 README.md | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 791051f69aeba..4c1b4062100cc 100644
--- a/README.md
+++ b/README.md
@@ -24,20 +24,10 @@ or by setting up `rustup default nightly` or else with `cargo +nightly {build,te
 ```bash
 cargo new hellosimd
 ```
-to create a new crate. Edit `hellosimd/Cargo.toml` to be 
-```toml
-[package]
-name = "hellosimd"
-version = "0.1.0"
-edition = "2018"
-[dependencies]
-core_simd = { git = "https://github.com/rust-lang/portable-simd" }
-```
-
-and finally write this in `src/main.rs`:
+to create a new crate. Finally write this in `src/main.rs`:
 ```rust
 #![feature(portable_simd)]
-use core_simd::*;
+use std::simd::f32x4;
 fn main() {
     let a = f32x4::splat(10.0);
     let b = f32x4::from_array([1.0, 2.0, 3.0, 4.0]);
@@ -45,24 +35,22 @@ fn main() {
 }
 ```
 
-Explanation: We import all the bindings from the crate with the first line. Then, we construct our SIMD vectors with methods like `splat` or `from_array`. Finally, we can use operators on them like `+` and the appropriate SIMD instructions will be carried out. When we run `cargo run` you should get `[11.0, 12.0, 13.0, 14.0]`.
-
-## Code Organization
+Explanation: We construct our SIMD vectors with methods like `splat` or `from_array`. Next, we can use operators like `+` on them, and the appropriate SIMD instructions will be carried out. When we run `cargo run` you should get `[11.0, 12.0, 13.0, 14.0]`.
 
-Currently the crate is organized so that each element type is a file, and then the 64-bit, 128-bit, 256-bit, and 512-bit vectors using those types are contained in said file.
+## Supported vectors
 
-All types are then exported as a single, flat module.
+Currently, vectors may have up to 64 elements, but aliases are provided only up to 512-bit vectors.
 
 Depending on the size of the primitive type, the number of lanes the vector will have varies. For example, 128-bit vectors have four `f32` lanes and two `f64` lanes.
 
 The supported element types are as follows:
 * **Floating Point:** `f32`, `f64`
-* **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `i128`, `isize`
-* **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `u128`, `usize`
-* **Masks:** `mask8`, `mask16`, `mask32`, `mask64`, `mask128`, `masksize`
+* **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `isize` (`i128` excluded)
+* **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `usize` (`u128` excluded)
+* **Masks:** 8-bit, 16-bit, 32-bit, 64-bit, and `usize`-sized masks
 
 Floating point, signed integers, and unsigned integers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to.
-The `mask` types are "truthy" values, but they use the number of bits in their name instead of just 1 bit like a normal `bool` uses.
+The mask types are "truthy" values, like `bool`, but have an unspecified layout in the vector type and cannot be constructed outside of a vector.
 
 [simd-guide]: ./beginners-guide.md
 [zulip-project-portable-simd]: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd

From f236f5745a0058bd85e044fe3252b87676843018 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 16 Oct 2022 18:08:17 -0400
Subject: [PATCH 12/70] Update README.md

Co-authored-by: Jacob Lifshay <programmerjake@gmail.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4c1b4062100cc..80313157ea2c9 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ The supported element types are as follows:
 * **Masks:** 8-bit, 16-bit, 32-bit, 64-bit, and `usize`-sized masks
 
 Floating point, signed integers, and unsigned integers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to.
-The mask types are "truthy" values, like `bool`, but have an unspecified layout in the vector type and cannot be constructed outside of a vector.
+The mask types have elements that are "truthy" values, like `bool`, but have an unspecified layout because different architectures prefer different layouts for mask types.
 
 [simd-guide]: ./beginners-guide.md
 [zulip-project-portable-simd]: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd

From 61a6f1854f453bb1003b08358b9478eba7fd6ad8 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 16 Oct 2022 21:38:13 -0400
Subject: [PATCH 13/70] Specify aliases in one place, and make it more uniform
 which ones are defined

---
 crates/core_simd/src/alias.rs        | 227 +++++++++++++++++++++++++++
 crates/core_simd/src/masks.rs        | 126 ---------------
 crates/core_simd/src/mod.rs          |   2 +
 crates/core_simd/src/vector.rs       |   8 -
 crates/core_simd/src/vector/float.rs |  24 ---
 crates/core_simd/src/vector/int.rs   |  63 --------
 crates/core_simd/src/vector/uint.rs  |  63 --------
 7 files changed, 229 insertions(+), 284 deletions(-)
 create mode 100644 crates/core_simd/src/alias.rs
 delete mode 100644 crates/core_simd/src/vector/float.rs
 delete mode 100644 crates/core_simd/src/vector/int.rs
 delete mode 100644 crates/core_simd/src/vector/uint.rs

diff --git a/crates/core_simd/src/alias.rs b/crates/core_simd/src/alias.rs
new file mode 100644
index 0000000000000..b4d5f45208a1f
--- /dev/null
+++ b/crates/core_simd/src/alias.rs
@@ -0,0 +1,227 @@
+macro_rules! number {
+    { 1 } => { "one" };
+    { 2 } => { "two" };
+    { 4 } => { "four" };
+    { 8 } => { "eight" };
+    { $x:literal } => { stringify!($x) };
+}
+
+macro_rules! plural {
+    { 1 } => { "" };
+    { $x:literal } => { "s" };
+}
+
+macro_rules! alias {
+    {
+        $(
+            $element:ty = {
+                $($alias:ident $elements:tt)*
+            }
+        )*
+    } => {
+        $(
+            $(
+            #[doc = concat!("A SIMD vector with ", number!($elements), " element", plural!($elements), " of type [`", stringify!($element), "`].")]
+            #[allow(non_camel_case_types)]
+            pub type $alias = $crate::simd::Simd<$element, $elements>;
+            )*
+        )*
+    }
+}
+
+macro_rules! mask_alias {
+    {
+        $(
+            $element:ty : $size:literal = {
+                $($alias:ident $elements:tt)*
+            }
+        )*
+    } => {
+        $(
+            $(
+            #[doc = concat!("A SIMD mask with ", number!($elements), " element", plural!($elements), " for vectors with ", $size, " element types.")]
+            ///
+            #[doc = concat!(
+                "The layout of this type is unspecified, and may change between platforms and/or Rust versions, and code should not assume that it is equivalent to `[",
+                stringify!($element), "; ", $elements, "]`."
+            )]
+            #[allow(non_camel_case_types)]
+            pub type $alias = $crate::simd::Mask<$element, $elements>;
+            )*
+        )*
+    }
+}
+
+alias! {
+    i8 = {
+        i8x1 1
+        i8x2 2
+        i8x4 4
+        i8x8 8
+        i8x16 16
+        i8x32 32
+        i8x64 64
+    }
+
+    i16 = {
+        i16x1 1
+        i16x2 2
+        i16x4 4
+        i16x8 8
+        i16x16 16
+        i16x32 32
+        i16x64 64
+    }
+
+    i32 = {
+        i32x1 1
+        i32x2 2
+        i32x4 4
+        i32x8 8
+        i32x16 16
+        i32x32 32
+        i32x64 64
+    }
+
+    i64 = {
+        i64x1 1
+        i64x2 2
+        i64x4 4
+        i64x8 8
+        i64x16 16
+        i64x32 32
+        i64x64 64
+    }
+
+    isize = {
+        isizex1 1
+        isizex2 2
+        isizex4 4
+        isizex8 8
+        isizex16 16
+        isizex32 32
+        isizex64 64
+    }
+
+    u8 = {
+        u8x1 1
+        u8x2 2
+        u8x4 4
+        u8x8 8
+        u8x16 16
+        u8x32 32
+        u8x64 64
+    }
+
+    u16 = {
+        u16x1 1
+        u16x2 2
+        u16x4 4
+        u16x8 8
+        u16x16 16
+        u16x32 32
+        u16x64 64
+    }
+
+    u32 = {
+        u32x1 1
+        u32x2 2
+        u32x4 4
+        u32x8 8
+        u32x16 16
+        u32x32 32
+        u32x64 64
+    }
+
+    u64 = {
+        u64x1 1
+        u64x2 2
+        u64x4 4
+        u64x8 8
+        u64x16 16
+        u64x32 32
+        u64x64 64
+    }
+
+    usize = {
+        usizex1 1
+        usizex2 2
+        usizex4 4
+        usizex8 8
+        usizex16 16
+        usizex32 32
+        usizex64 64
+    }
+
+    f32 = {
+        f32x1 1
+        f32x2 2
+        f32x4 4
+        f32x8 8
+        f32x16 16
+        f32x32 32
+        f32x64 64
+    }
+
+    f64 = {
+        f64x1 1
+        f64x2 2
+        f64x4 4
+        f64x8 8
+        f64x16 16
+        f64x32 32
+        f64x64 64
+    }
+}
+
+mask_alias! {
+    i8 : "8-bit" = {
+        mask8x1 1
+        mask8x2 2
+        mask8x4 4
+        mask8x8 8
+        mask8x16 16
+        mask8x32 32
+        mask8x64 64
+    }
+
+    i16 : "16-bit" = {
+        mask16x1 1
+        mask16x2 2
+        mask16x4 4
+        mask16x8 8
+        mask16x16 16
+        mask16x32 32
+        mask16x64 64
+    }
+
+    i32 : "32-bit" = {
+        mask32x1 1
+        mask32x2 2
+        mask32x4 4
+        mask32x8 8
+        mask32x16 16
+        mask32x32 32
+        mask32x64 64
+    }
+
+    i64 : "64-bit" = {
+        mask64x1 1
+        mask64x2 2
+        mask64x4 4
+        mask64x8 8
+        mask64x16 16
+        mask64x32 32
+        mask64x64 64
+    }
+
+    isize : "pointer-sized" = {
+        masksizex1 1
+        masksizex2 2
+        masksizex4 4
+        masksizex8 8
+        masksizex16 16
+        masksizex32 32
+        masksizex64 64
+    }
+}
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index 7fd50fed4478c..e58df80fca8b5 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -530,132 +530,6 @@ where
     }
 }
 
-/// A mask for SIMD vectors with eight elements of 8 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i8; 8]`.
-pub type mask8x8 = Mask<i8, 8>;
-
-/// A mask for SIMD vectors with 16 elements of 8 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i8; 16]`.
-pub type mask8x16 = Mask<i8, 16>;
-
-/// A mask for SIMD vectors with 32 elements of 8 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i8; 32]`.
-pub type mask8x32 = Mask<i8, 32>;
-
-/// A mask for SIMD vectors with 64 elements of 8 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i8; 64]`.
-pub type mask8x64 = Mask<i8, 64>;
-
-/// A mask for SIMD vectors with four elements of 16 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i16; 4]`.
-pub type mask16x4 = Mask<i16, 4>;
-
-/// A mask for SIMD vectors with eight elements of 16 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i16; 8]`.
-pub type mask16x8 = Mask<i16, 8>;
-
-/// A mask for SIMD vectors with 16 elements of 16 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i16; 16]`.
-pub type mask16x16 = Mask<i16, 16>;
-
-/// A mask for SIMD vectors with 32 elements of 16 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i16; 32]`.
-pub type mask16x32 = Mask<i16, 32>;
-
-/// A mask for SIMD vectors with two elements of 32 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i32; 2]`.
-pub type mask32x2 = Mask<i32, 2>;
-
-/// A mask for SIMD vectors with four elements of 32 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i32; 4]`.
-pub type mask32x4 = Mask<i32, 4>;
-
-/// A mask for SIMD vectors with eight elements of 32 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i32; 8]`.
-pub type mask32x8 = Mask<i32, 8>;
-
-/// A mask for SIMD vectors with 16 elements of 32 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i32; 16]`.
-pub type mask32x16 = Mask<i32, 16>;
-
-/// A mask for SIMD vectors with two elements of 64 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i64; 2]`.
-pub type mask64x2 = Mask<i64, 2>;
-
-/// A mask for SIMD vectors with four elements of 64 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i64; 4]`.
-pub type mask64x4 = Mask<i64, 4>;
-
-/// A mask for SIMD vectors with eight elements of 64 bits.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[i64; 8]`.
-pub type mask64x8 = Mask<i64, 8>;
-
-/// A mask for SIMD vectors with two elements of pointer width.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[isize; 2]`.
-pub type masksizex2 = Mask<isize, 2>;
-
-/// A mask for SIMD vectors with four elements of pointer width.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[isize; 4]`.
-pub type masksizex4 = Mask<isize, 4>;
-
-/// A mask for SIMD vectors with eight elements of pointer width.
-///
-/// The layout of this type is unspecified, and may change between platforms
-/// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[isize; 8]`.
-pub type masksizex8 = Mask<isize, 8>;
-
 macro_rules! impl_from {
     { $from:ty  => $($to:ty),* } => {
         $(
diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index b472aa3abe210..9909d63987423 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -6,6 +6,7 @@ pub(crate) mod intrinsics;
 #[cfg(feature = "generic_const_exprs")]
 mod to_bytes;
 
+mod alias;
 mod elements;
 mod eq;
 mod fmt;
@@ -22,6 +23,7 @@ mod vendor;
 pub mod simd {
     pub(crate) use crate::core_simd::intrinsics;
 
+    pub use crate::core_simd::alias::*;
     pub use crate::core_simd::elements::*;
     pub use crate::core_simd::eq::*;
     pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index e8e8f6899d349..7f0e8350cf866 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,11 +1,3 @@
-mod float;
-mod int;
-mod uint;
-
-pub use float::*;
-pub use int::*;
-pub use uint::*;
-
 // Vectors of pointers are not for public use at the current time.
 pub(crate) mod ptr;
 
diff --git a/crates/core_simd/src/vector/float.rs b/crates/core_simd/src/vector/float.rs
deleted file mode 100644
index f836c99b1e2dc..0000000000000
--- a/crates/core_simd/src/vector/float.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-#![allow(non_camel_case_types)]
-
-use crate::simd::Simd;
-
-/// A 64-bit SIMD vector with two elements of type `f32`.
-pub type f32x2 = Simd<f32, 2>;
-
-/// A 128-bit SIMD vector with four elements of type `f32`.
-pub type f32x4 = Simd<f32, 4>;
-
-/// A 256-bit SIMD vector with eight elements of type `f32`.
-pub type f32x8 = Simd<f32, 8>;
-
-/// A 512-bit SIMD vector with 16 elements of type `f32`.
-pub type f32x16 = Simd<f32, 16>;
-
-/// A 128-bit SIMD vector with two elements of type `f64`.
-pub type f64x2 = Simd<f64, 2>;
-
-/// A 256-bit SIMD vector with four elements of type `f64`.
-pub type f64x4 = Simd<f64, 4>;
-
-/// A 512-bit SIMD vector with eight elements of type `f64`.
-pub type f64x8 = Simd<f64, 8>;
diff --git a/crates/core_simd/src/vector/int.rs b/crates/core_simd/src/vector/int.rs
deleted file mode 100644
index 20e56c7dc6443..0000000000000
--- a/crates/core_simd/src/vector/int.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-#![allow(non_camel_case_types)]
-
-use crate::simd::Simd;
-
-/// A SIMD vector with two elements of type `isize`.
-pub type isizex2 = Simd<isize, 2>;
-
-/// A SIMD vector with four elements of type `isize`.
-pub type isizex4 = Simd<isize, 4>;
-
-/// A SIMD vector with eight elements of type `isize`.
-pub type isizex8 = Simd<isize, 8>;
-
-/// A 32-bit SIMD vector with two elements of type `i16`.
-pub type i16x2 = Simd<i16, 2>;
-
-/// A 64-bit SIMD vector with four elements of type `i16`.
-pub type i16x4 = Simd<i16, 4>;
-
-/// A 128-bit SIMD vector with eight elements of type `i16`.
-pub type i16x8 = Simd<i16, 8>;
-
-/// A 256-bit SIMD vector with 16 elements of type `i16`.
-pub type i16x16 = Simd<i16, 16>;
-
-/// A 512-bit SIMD vector with 32 elements of type `i16`.
-pub type i16x32 = Simd<i16, 32>;
-
-/// A 64-bit SIMD vector with two elements of type `i32`.
-pub type i32x2 = Simd<i32, 2>;
-
-/// A 128-bit SIMD vector with four elements of type `i32`.
-pub type i32x4 = Simd<i32, 4>;
-
-/// A 256-bit SIMD vector with eight elements of type `i32`.
-pub type i32x8 = Simd<i32, 8>;
-
-/// A 512-bit SIMD vector with 16 elements of type `i32`.
-pub type i32x16 = Simd<i32, 16>;
-
-/// A 128-bit SIMD vector with two elements of type `i64`.
-pub type i64x2 = Simd<i64, 2>;
-
-/// A 256-bit SIMD vector with four elements of type `i64`.
-pub type i64x4 = Simd<i64, 4>;
-
-/// A 512-bit SIMD vector with eight elements of type `i64`.
-pub type i64x8 = Simd<i64, 8>;
-
-/// A 32-bit SIMD vector with four elements of type `i8`.
-pub type i8x4 = Simd<i8, 4>;
-
-/// A 64-bit SIMD vector with eight elements of type `i8`.
-pub type i8x8 = Simd<i8, 8>;
-
-/// A 128-bit SIMD vector with 16 elements of type `i8`.
-pub type i8x16 = Simd<i8, 16>;
-
-/// A 256-bit SIMD vector with 32 elements of type `i8`.
-pub type i8x32 = Simd<i8, 32>;
-
-/// A 512-bit SIMD vector with 64 elements of type `i8`.
-pub type i8x64 = Simd<i8, 64>;
diff --git a/crates/core_simd/src/vector/uint.rs b/crates/core_simd/src/vector/uint.rs
deleted file mode 100644
index b4a69c44363f1..0000000000000
--- a/crates/core_simd/src/vector/uint.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-#![allow(non_camel_case_types)]
-
-use crate::simd::Simd;
-
-/// A SIMD vector with two elements of type `usize`.
-pub type usizex2 = Simd<usize, 2>;
-
-/// A SIMD vector with four elements of type `usize`.
-pub type usizex4 = Simd<usize, 4>;
-
-/// A SIMD vector with eight elements of type `usize`.
-pub type usizex8 = Simd<usize, 8>;
-
-/// A 32-bit SIMD vector with two elements of type `u16`.
-pub type u16x2 = Simd<u16, 2>;
-
-/// A 64-bit SIMD vector with four elements of type `u16`.
-pub type u16x4 = Simd<u16, 4>;
-
-/// A 128-bit SIMD vector with eight elements of type `u16`.
-pub type u16x8 = Simd<u16, 8>;
-
-/// A 256-bit SIMD vector with 16 elements of type `u16`.
-pub type u16x16 = Simd<u16, 16>;
-
-/// A 512-bit SIMD vector with 32 elements of type `u16`.
-pub type u16x32 = Simd<u16, 32>;
-
-/// A 64-bit SIMD vector with two elements of type `u32`.
-pub type u32x2 = Simd<u32, 2>;
-
-/// A 128-bit SIMD vector with four elements of type `u32`.
-pub type u32x4 = Simd<u32, 4>;
-
-/// A 256-bit SIMD vector with eight elements of type `u32`.
-pub type u32x8 = Simd<u32, 8>;
-
-/// A 512-bit SIMD vector with 16 elements of type `u32`.
-pub type u32x16 = Simd<u32, 16>;
-
-/// A 128-bit SIMD vector with two elements of type `u64`.
-pub type u64x2 = Simd<u64, 2>;
-
-/// A 256-bit SIMD vector with four elements of type `u64`.
-pub type u64x4 = Simd<u64, 4>;
-
-/// A 512-bit SIMD vector with eight elements of type `u64`.
-pub type u64x8 = Simd<u64, 8>;
-
-/// A 32-bit SIMD vector with four elements of type `u8`.
-pub type u8x4 = Simd<u8, 4>;
-
-/// A 64-bit SIMD vector with eight elements of type `u8`.
-pub type u8x8 = Simd<u8, 8>;
-
-/// A 128-bit SIMD vector with 16 elements of type `u8`.
-pub type u8x16 = Simd<u8, 16>;
-
-/// A 256-bit SIMD vector with 32 elements of type `u8`.
-pub type u8x32 = Simd<u8, 32>;
-
-/// A 512-bit SIMD vector with 64 elements of type `u8`.
-pub type u8x64 = Simd<u8, 64>;

From 402b50a2728ec4dd9a6da2e57b25cce3ffb48f06 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 16 Oct 2022 23:46:18 -0400
Subject: [PATCH 14/70] Improve variable names

---
 crates/core_simd/src/alias.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/crates/core_simd/src/alias.rs b/crates/core_simd/src/alias.rs
index b4d5f45208a1f..23f121c46197c 100644
--- a/crates/core_simd/src/alias.rs
+++ b/crates/core_simd/src/alias.rs
@@ -14,16 +14,16 @@ macro_rules! plural {
 macro_rules! alias {
     {
         $(
-            $element:ty = {
-                $($alias:ident $elements:tt)*
+            $element_ty:ty = {
+                $($alias:ident $num_elements:tt)*
             }
         )*
     } => {
         $(
             $(
-            #[doc = concat!("A SIMD vector with ", number!($elements), " element", plural!($elements), " of type [`", stringify!($element), "`].")]
+            #[doc = concat!("A SIMD vector with ", number!($num_elements), " element", plural!($num_elements), " of type [`", stringify!($element_ty), "`].")]
             #[allow(non_camel_case_types)]
-            pub type $alias = $crate::simd::Simd<$element, $elements>;
+            pub type $alias = $crate::simd::Simd<$element_ty, $num_elements>;
             )*
         )*
     }
@@ -32,21 +32,21 @@ macro_rules! alias {
 macro_rules! mask_alias {
     {
         $(
-            $element:ty : $size:literal = {
-                $($alias:ident $elements:tt)*
+            $element_ty:ty : $size:literal = {
+                $($alias:ident $num_elements:tt)*
             }
         )*
     } => {
         $(
             $(
-            #[doc = concat!("A SIMD mask with ", number!($elements), " element", plural!($elements), " for vectors with ", $size, " element types.")]
+            #[doc = concat!("A SIMD mask with ", number!($num_elements), " element", plural!($num_elements), " for vectors with ", $size, " element types.")]
             ///
             #[doc = concat!(
                 "The layout of this type is unspecified, and may change between platforms and/or Rust versions, and code should not assume that it is equivalent to `[",
-                stringify!($element), "; ", $elements, "]`."
+                stringify!($element_ty), "; ", $num_elements, "]`."
             )]
             #[allow(non_camel_case_types)]
-            pub type $alias = $crate::simd::Mask<$element, $elements>;
+            pub type $alias = $crate::simd::Mask<$element_ty, $num_elements>;
             )*
         )*
     }

From d3cfd7c5c9dba01a8f31b10cef4a1985ae1dc53f Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Tue, 21 Jun 2022 23:17:13 -0400
Subject: [PATCH 15/70] Add vectors of pointers

---
 crates/core_simd/src/cast.rs               |  45 ++++++++
 crates/core_simd/src/elements.rs           |   4 +
 crates/core_simd/src/elements/const_ptr.rs |  59 +++++++++++
 crates/core_simd/src/elements/mut_ptr.rs   |  57 +++++++++++
 crates/core_simd/src/eq.rs                 |  42 ++++++++
 crates/core_simd/src/mod.rs                |   2 +
 crates/core_simd/src/ord.rs                | 114 +++++++++++++++++++++
 crates/core_simd/src/vector.rs             |  19 +++-
 8 files changed, 339 insertions(+), 3 deletions(-)
 create mode 100644 crates/core_simd/src/cast.rs
 create mode 100644 crates/core_simd/src/elements/const_ptr.rs
 create mode 100644 crates/core_simd/src/elements/mut_ptr.rs

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
new file mode 100644
index 0000000000000..e04a9042b1bf3
--- /dev/null
+++ b/crates/core_simd/src/cast.rs
@@ -0,0 +1,45 @@
+use crate::simd::SimdElement;
+
+/// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
+pub trait SimdCast<Target: SimdElement>: SimdElement {}
+
+macro_rules! into_number {
+    { $($type:ty),* } => {
+        $(
+        impl SimdCast<i8> for $type {}
+        impl SimdCast<i16> for $type {}
+        impl SimdCast<i32> for $type {}
+        impl SimdCast<i64> for $type {}
+        impl SimdCast<isize> for $type {}
+
+        impl SimdCast<u8> for $type {}
+        impl SimdCast<u16> for $type {}
+        impl SimdCast<u32> for $type {}
+        impl SimdCast<u64> for $type {}
+        impl SimdCast<usize> for $type {}
+
+        impl SimdCast<f32> for $type {}
+        impl SimdCast<f64> for $type {}
+        )*
+    }
+}
+
+into_number! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, f32, f64 }
+
+macro_rules! into_pointer {
+    { $($type:ty),* } => {
+        $(
+        impl<T> SimdCast<$type> for *const T {}
+        impl<T> SimdCast<$type> for *mut T {}
+        impl<T> SimdCast<*const T> for $type {}
+        impl<T> SimdCast<*mut T> for $type {}
+        )*
+    }
+}
+
+into_pointer! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
+
+impl<T, U> SimdCast<*const T> for *const U {}
+impl<T, U> SimdCast<*const T> for *mut U {}
+impl<T, U> SimdCast<*mut T> for *const U {}
+impl<T, U> SimdCast<*mut T> for *mut U {}
diff --git a/crates/core_simd/src/elements.rs b/crates/core_simd/src/elements.rs
index 701eb66b248af..dc7f52a4d576c 100644
--- a/crates/core_simd/src/elements.rs
+++ b/crates/core_simd/src/elements.rs
@@ -1,11 +1,15 @@
+mod const_ptr;
 mod float;
 mod int;
+mod mut_ptr;
 mod uint;
 
 mod sealed {
     pub trait Sealed {}
 }
 
+pub use const_ptr::*;
 pub use float::*;
 pub use int::*;
+pub use mut_ptr::*;
 pub use uint::*;
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
new file mode 100644
index 0000000000000..ab6b5b8b5f4df
--- /dev/null
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -0,0 +1,59 @@
+use super::sealed::Sealed;
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+
+/// Operations on SIMD vectors of constant pointers.
+pub trait SimdConstPtr: Copy + Sealed {
+    /// Vector type representing the pointers as bits.
+    type Bits;
+
+    /// Vector of mutable pointers to the same type.
+    type MutPtr;
+
+    /// Mask type used for manipulating this SIMD vector type.
+    type Mask;
+
+    /// Returns `true` for each lane that is null.
+    fn is_null(self) -> Self::Mask;
+
+    /// Changes constness without changing the type.
+    fn as_mut(self) -> Self::MutPtr;
+
+    /// Cast pointers to raw bits.
+    fn to_bits(self) -> Self::Bits;
+
+    /// Cast raw bits to pointers.
+    fn from_bits(bits: Self::Bits) -> Self;
+}
+
+impl<T, const LANES: usize> Sealed for Simd<*const T, LANES> where
+    LaneCount<LANES>: SupportedLaneCount
+{
+}
+
+impl<T, const LANES: usize> SimdConstPtr for Simd<*const T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Bits = Simd<usize, LANES>;
+    type MutPtr = Simd<*mut T, LANES>;
+    type Mask = Mask<isize, LANES>;
+
+    fn is_null(self) -> Self::Mask {
+        Simd::splat(core::ptr::null()).simd_eq(self)
+    }
+
+    fn as_mut(self) -> Self::MutPtr {
+        // Converting between pointers is safe
+        unsafe { intrinsics::simd_as(self) }
+    }
+
+    fn to_bits(self) -> Self::Bits {
+        // Casting pointers to usize is safe
+        unsafe { intrinsics::simd_as(self) }
+    }
+
+    fn from_bits(bits: Self::Bits) -> Self {
+        // Casting usize to pointers is safe
+        unsafe { intrinsics::simd_as(bits) }
+    }
+}
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
new file mode 100644
index 0000000000000..b49f9fda7e44a
--- /dev/null
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -0,0 +1,57 @@
+use super::sealed::Sealed;
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+
+/// Operations on SIMD vectors of mutable pointers.
+pub trait SimdMutPtr: Copy + Sealed {
+    /// Vector type representing the pointers as bits.
+    type Bits;
+
+    /// Vector of constant pointers to the same type.
+    type ConstPtr;
+
+    /// Mask type used for manipulating this SIMD vector type.
+    type Mask;
+
+    /// Returns `true` for each lane that is null.
+    fn is_null(self) -> Self::Mask;
+
+    /// Changes constness without changing the type.
+    fn as_const(self) -> Self::ConstPtr;
+
+    /// Cast pointers to raw bits.
+    fn to_bits(self) -> Self::Bits;
+
+    /// Cast raw bits to pointers.
+    fn from_bits(bits: Self::Bits) -> Self;
+}
+
+impl<T, const LANES: usize> Sealed for Simd<*mut T, LANES> where LaneCount<LANES>: SupportedLaneCount
+{}
+
+impl<T, const LANES: usize> SimdMutPtr for Simd<*mut T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Bits = Simd<usize, LANES>;
+    type ConstPtr = Simd<*const T, LANES>;
+    type Mask = Mask<isize, LANES>;
+
+    fn is_null(self) -> Self::Mask {
+        Simd::splat(core::ptr::null_mut()).simd_eq(self)
+    }
+
+    fn as_const(self) -> Self::ConstPtr {
+        // Converting between pointers is safe
+        unsafe { intrinsics::simd_as(self) }
+    }
+
+    fn to_bits(self) -> Self::Bits {
+        // Casting pointers to usize is safe
+        unsafe { intrinsics::simd_as(self) }
+    }
+
+    fn from_bits(bits: Self::Bits) -> Self {
+        // Casting usize to pointers is safe
+        unsafe { intrinsics::simd_as(bits) }
+    }
+}
diff --git a/crates/core_simd/src/eq.rs b/crates/core_simd/src/eq.rs
index c7111f720a8ac..149380746e713 100644
--- a/crates/core_simd/src/eq.rs
+++ b/crates/core_simd/src/eq.rs
@@ -71,3 +71,45 @@ macro_rules! impl_mask {
 }
 
 impl_mask! { i8, i16, i32, i64, isize }
+
+impl<T, const LANES: usize> SimdPartialEq for Simd<*const T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Mask = Mask<isize, LANES>;
+
+    #[inline]
+    fn simd_eq(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+    }
+
+    #[inline]
+    fn simd_ne(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+    }
+}
+
+impl<T, const LANES: usize> SimdPartialEq for Simd<*mut T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Mask = Mask<isize, LANES>;
+
+    #[inline]
+    fn simd_eq(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+    }
+
+    #[inline]
+    fn simd_ne(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+    }
+}
diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index 9909d63987423..ece026a448b73 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -7,6 +7,7 @@ pub(crate) mod intrinsics;
 mod to_bytes;
 
 mod alias;
+mod cast;
 mod elements;
 mod eq;
 mod fmt;
@@ -24,6 +25,7 @@ pub mod simd {
     pub(crate) use crate::core_simd::intrinsics;
 
     pub use crate::core_simd::alias::*;
+    pub use crate::core_simd::cast::*;
     pub use crate::core_simd::elements::*;
     pub use crate::core_simd::eq::*;
     pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs
index 9a87bc2e34460..95a1ecaeeda75 100644
--- a/crates/core_simd/src/ord.rs
+++ b/crates/core_simd/src/ord.rs
@@ -211,3 +211,117 @@ macro_rules! impl_mask {
 }
 
 impl_mask! { i8, i16, i32, i64, isize }
+
+impl<T, const LANES: usize> SimdPartialOrd for Simd<*const T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn simd_lt(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+    }
+
+    #[inline]
+    fn simd_le(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+    }
+
+    #[inline]
+    fn simd_gt(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+    }
+
+    #[inline]
+    fn simd_ge(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+    }
+}
+
+impl<T, const LANES: usize> SimdOrd for Simd<*const T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn simd_max(self, other: Self) -> Self {
+        self.simd_lt(other).select(other, self)
+    }
+
+    #[inline]
+    fn simd_min(self, other: Self) -> Self {
+        self.simd_gt(other).select(other, self)
+    }
+
+    #[inline]
+    fn simd_clamp(self, min: Self, max: Self) -> Self {
+        assert!(
+            min.simd_le(max).all(),
+            "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+        );
+        self.simd_max(min).simd_min(max)
+    }
+}
+
+impl<T, const LANES: usize> SimdPartialOrd for Simd<*mut T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn simd_lt(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+    }
+
+    #[inline]
+    fn simd_le(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+    }
+
+    #[inline]
+    fn simd_gt(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+    }
+
+    #[inline]
+    fn simd_ge(self, other: Self) -> Self::Mask {
+        // Safety: `self` is a vector, and the result of the comparison
+        // is always a valid mask.
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+    }
+}
+
+impl<T, const LANES: usize> SimdOrd for Simd<*mut T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn simd_max(self, other: Self) -> Self {
+        self.simd_lt(other).select(other, self)
+    }
+
+    #[inline]
+    fn simd_min(self, other: Self) -> Self {
+        self.simd_gt(other).select(other, self)
+    }
+
+    #[inline]
+    fn simd_clamp(self, min: Self, max: Self) -> Self {
+        assert!(
+            min.simd_le(max).all(),
+            "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+        );
+        self.simd_max(min).simd_min(max)
+    }
+}
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 7f0e8350cf866..cbc8ced5a84e8 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -2,7 +2,7 @@
 pub(crate) mod ptr;
 
 use crate::simd::{
-    intrinsics, LaneCount, Mask, MaskElement, SimdPartialOrd, SupportedLaneCount, Swizzle,
+    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdPartialOrd, SupportedLaneCount, Swizzle,
 };
 
 /// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
@@ -211,7 +211,10 @@ where
     #[must_use]
     #[inline]
     #[cfg(not(bootstrap))]
-    pub fn cast<U: SimdElement>(self) -> Simd<U, LANES> {
+    pub fn cast<U: SimdElement>(self) -> Simd<U, LANES>
+    where
+        T: SimdCast<U>,
+    {
         // Safety: The input argument is a vector of a valid SIMD element type.
         unsafe { intrinsics::simd_as(self) }
     }
@@ -234,7 +237,7 @@ where
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
     where
-        T: core::convert::FloatToInt<I>,
+        T: core::convert::FloatToInt<I> + SimdCast<I>,
         I: SimdElement,
     {
         // Safety: `self` is a vector, and `FloatToInt` ensures the type can be casted to
@@ -739,3 +742,13 @@ impl Sealed for f64 {}
 unsafe impl SimdElement for f64 {
     type Mask = i64;
 }
+
+impl<T> Sealed for *const T {}
+unsafe impl<T> SimdElement for *const T {
+    type Mask = isize;
+}
+
+impl<T> Sealed for *mut T {}
+unsafe impl<T> SimdElement for *mut T {
+    type Mask = isize;
+}

From 7e96f5dbea3fd2291f0e835a21ed0c41f6ef086e Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Tue, 21 Jun 2022 23:20:06 -0400
Subject: [PATCH 16/70] Use safe casts

---
 crates/core_simd/src/elements/const_ptr.rs | 11 ++++-------
 crates/core_simd/src/elements/mut_ptr.rs   | 11 ++++-------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index ab6b5b8b5f4df..62365eace89a5 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
@@ -43,17 +43,14 @@ where
     }
 
     fn as_mut(self) -> Self::MutPtr {
-        // Converting between pointers is safe
-        unsafe { intrinsics::simd_as(self) }
+        self.cast()
     }
 
     fn to_bits(self) -> Self::Bits {
-        // Casting pointers to usize is safe
-        unsafe { intrinsics::simd_as(self) }
+        self.cast()
     }
 
     fn from_bits(bits: Self::Bits) -> Self {
-        // Casting usize to pointers is safe
-        unsafe { intrinsics::simd_as(bits) }
+        bits.cast()
     }
 }
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index b49f9fda7e44a..8c68d42628f94 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
@@ -41,17 +41,14 @@ where
     }
 
     fn as_const(self) -> Self::ConstPtr {
-        // Converting between pointers is safe
-        unsafe { intrinsics::simd_as(self) }
+        self.cast()
     }
 
     fn to_bits(self) -> Self::Bits {
-        // Casting pointers to usize is safe
-        unsafe { intrinsics::simd_as(self) }
+        self.cast()
     }
 
     fn from_bits(bits: Self::Bits) -> Self {
-        // Casting usize to pointers is safe
-        unsafe { intrinsics::simd_as(bits) }
+        bits.cast()
     }
 }

From 4076ba8a77326c70645f6c4a4351b0d84c5c898f Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 23 Jun 2022 01:21:58 -0400
Subject: [PATCH 17/70] Implement scatter/gather with new pointer vector and
 add tests

---
 crates/core_simd/src/cast.rs               | 132 +++++++++++++++++----
 crates/core_simd/src/elements/const_ptr.rs |  30 +++--
 crates/core_simd/src/elements/mut_ptr.rs   |  30 +++--
 crates/core_simd/src/eq.rs                 |  20 ++--
 crates/core_simd/src/ord.rs                |  36 ++----
 crates/core_simd/src/vector.rs             |  13 +-
 crates/core_simd/src/vector/ptr.rs         |  51 --------
 crates/core_simd/tests/pointers.rs         |  43 +++++++
 crates/test_helpers/src/biteq.rs           |  20 ++++
 crates/test_helpers/src/lib.rs             |  63 ++++++----
 10 files changed, 277 insertions(+), 161 deletions(-)
 delete mode 100644 crates/core_simd/src/vector/ptr.rs
 create mode 100644 crates/core_simd/tests/pointers.rs

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index e04a9042b1bf3..d62d3f6635d53 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -1,25 +1,41 @@
-use crate::simd::SimdElement;
+use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount};
 
 /// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
-pub trait SimdCast<Target: SimdElement>: SimdElement {}
+pub trait SimdCast<Target: SimdElement>: SimdElement {
+    #[doc(hidden)]
+    fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<Target, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount;
+}
 
 macro_rules! into_number {
+    { $from:ty, $to:ty } => {
+        impl SimdCast<$to> for $from {
+            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$to, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                // Safety: simd_as can handle numeric conversions
+                unsafe { intrinsics::simd_as(x) }
+            }
+        }
+    };
     { $($type:ty),* } => {
         $(
-        impl SimdCast<i8> for $type {}
-        impl SimdCast<i16> for $type {}
-        impl SimdCast<i32> for $type {}
-        impl SimdCast<i64> for $type {}
-        impl SimdCast<isize> for $type {}
-
-        impl SimdCast<u8> for $type {}
-        impl SimdCast<u16> for $type {}
-        impl SimdCast<u32> for $type {}
-        impl SimdCast<u64> for $type {}
-        impl SimdCast<usize> for $type {}
-
-        impl SimdCast<f32> for $type {}
-        impl SimdCast<f64> for $type {}
+        into_number! { $type, i8 }
+        into_number! { $type, i16 }
+        into_number! { $type, i32 }
+        into_number! { $type, i64 }
+        into_number! { $type, isize }
+
+        into_number! { $type, u8 }
+        into_number! { $type, u16 }
+        into_number! { $type, u32 }
+        into_number! { $type, u64 }
+        into_number! { $type, usize }
+
+        into_number! { $type, f32 }
+        into_number! { $type, f64 }
         )*
     }
 }
@@ -29,17 +45,85 @@ into_number! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, f32, f64 }
 macro_rules! into_pointer {
     { $($type:ty),* } => {
         $(
-        impl<T> SimdCast<$type> for *const T {}
-        impl<T> SimdCast<$type> for *mut T {}
-        impl<T> SimdCast<*const T> for $type {}
-        impl<T> SimdCast<*mut T> for $type {}
+        impl<T> SimdCast<$type> for *const T {
+            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$type, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                // Safety: transmuting pointers to isize is safe
+                let x: Simd<isize, LANES> = unsafe { core::mem::transmute_copy(&x) };
+                x.cast()
+            }
+        }
+        impl<T> SimdCast<$type> for *mut T {
+            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$type, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                // Safety: transmuting pointers to isize is safe
+                let x: Simd<isize, LANES> = unsafe { core::mem::transmute_copy(&x) };
+                x.cast()
+            }
+        }
+        impl<T> SimdCast<*const T> for $type {
+            fn cast<const LANES: usize>(x: Simd<$type, LANES>) -> Simd<*const T, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                let x: Simd<isize, LANES> = x.cast();
+                // Safety: transmuting isize to pointers is safe
+                unsafe { core::mem::transmute_copy(&x) }
+            }
+        }
+        impl<T> SimdCast<*mut T> for $type {
+            fn cast<const LANES: usize>(x: Simd<$type, LANES>) -> Simd<*mut T, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                let x: Simd<isize, LANES> = x.cast();
+                // Safety: transmuting isize to pointers is safe
+                unsafe { core::mem::transmute_copy(&x) }
+            }
+        }
         )*
     }
 }
 
 into_pointer! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
 
-impl<T, U> SimdCast<*const T> for *const U {}
-impl<T, U> SimdCast<*const T> for *mut U {}
-impl<T, U> SimdCast<*mut T> for *const U {}
-impl<T, U> SimdCast<*mut T> for *mut U {}
+impl<T, U> SimdCast<*const T> for *const U {
+    fn cast<const LANES: usize>(x: Simd<*const U, LANES>) -> Simd<*const T, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: transmuting pointers is safe
+        unsafe { core::mem::transmute_copy(&x) }
+    }
+}
+impl<T, U> SimdCast<*const T> for *mut U {
+    fn cast<const LANES: usize>(x: Simd<*mut U, LANES>) -> Simd<*const T, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: transmuting pointers is safe
+        unsafe { core::mem::transmute_copy(&x) }
+    }
+}
+impl<T, U> SimdCast<*mut T> for *const U {
+    fn cast<const LANES: usize>(x: Simd<*const U, LANES>) -> Simd<*mut T, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: transmuting pointers is safe
+        unsafe { core::mem::transmute_copy(&x) }
+    }
+}
+impl<T, U> SimdCast<*mut T> for *mut U {
+    fn cast<const LANES: usize>(x: Simd<*mut U, LANES>) -> Simd<*mut T, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: transmuting pointers is safe
+        unsafe { core::mem::transmute_copy(&x) }
+    }
+}
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 62365eace89a5..c4a254f5ab1fa 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -3,8 +3,8 @@ use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
-    /// Vector type representing the pointers as bits.
-    type Bits;
+    /// Vector of usize with the same number of lanes.
+    type Usize;
 
     /// Vector of mutable pointers to the same type.
     type MutPtr;
@@ -18,11 +18,15 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Changes constness without changing the type.
     fn as_mut(self) -> Self::MutPtr;
 
-    /// Cast pointers to raw bits.
-    fn to_bits(self) -> Self::Bits;
+    /// Gets the "address" portion of the pointer.
+    ///
+    /// Equivalent to calling [`pointer::addr`] on each lane.
+    fn addr(self) -> Self::Usize;
 
-    /// Cast raw bits to pointers.
-    fn from_bits(bits: Self::Bits) -> Self;
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    fn wrapping_add(self, count: Self::Usize) -> Self;
 }
 
 impl<T, const LANES: usize> Sealed for Simd<*const T, LANES> where
@@ -34,23 +38,29 @@ impl<T, const LANES: usize> SimdConstPtr for Simd<*const T, LANES>
 where
     LaneCount<LANES>: SupportedLaneCount,
 {
-    type Bits = Simd<usize, LANES>;
+    type Usize = Simd<usize, LANES>;
     type MutPtr = Simd<*mut T, LANES>;
     type Mask = Mask<isize, LANES>;
 
+    #[inline]
     fn is_null(self) -> Self::Mask {
         Simd::splat(core::ptr::null()).simd_eq(self)
     }
 
+    #[inline]
     fn as_mut(self) -> Self::MutPtr {
         self.cast()
     }
 
-    fn to_bits(self) -> Self::Bits {
+    #[inline]
+    fn addr(self) -> Self::Usize {
         self.cast()
     }
 
-    fn from_bits(bits: Self::Bits) -> Self {
-        bits.cast()
+    #[inline]
+    fn wrapping_add(self, count: Self::Usize) -> Self {
+        let addr = self.addr() + (count * Simd::splat(core::mem::size_of::<T>()));
+        // Safety: transmuting usize to pointers is safe, even if accessing those pointers isn't.
+        unsafe { core::mem::transmute_copy(&addr) }
     }
 }
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 8c68d42628f94..5920960c49cea 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -3,8 +3,8 @@ use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
-    /// Vector type representing the pointers as bits.
-    type Bits;
+    /// Vector of usize with the same number of lanes.
+    type Usize;
 
     /// Vector of constant pointers to the same type.
     type ConstPtr;
@@ -18,11 +18,15 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Changes constness without changing the type.
     fn as_const(self) -> Self::ConstPtr;
 
-    /// Cast pointers to raw bits.
-    fn to_bits(self) -> Self::Bits;
+    /// Gets the "address" portion of the pointer.
+    ///
+    /// Equivalent to calling [`pointer::addr`] on each lane.
+    fn addr(self) -> Self::Usize;
 
-    /// Cast raw bits to pointers.
-    fn from_bits(bits: Self::Bits) -> Self;
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    fn wrapping_add(self, count: Self::Usize) -> Self;
 }
 
 impl<T, const LANES: usize> Sealed for Simd<*mut T, LANES> where LaneCount<LANES>: SupportedLaneCount
@@ -32,23 +36,29 @@ impl<T, const LANES: usize> SimdMutPtr for Simd<*mut T, LANES>
 where
     LaneCount<LANES>: SupportedLaneCount,
 {
-    type Bits = Simd<usize, LANES>;
+    type Usize = Simd<usize, LANES>;
     type ConstPtr = Simd<*const T, LANES>;
     type Mask = Mask<isize, LANES>;
 
+    #[inline]
     fn is_null(self) -> Self::Mask {
         Simd::splat(core::ptr::null_mut()).simd_eq(self)
     }
 
+    #[inline]
     fn as_const(self) -> Self::ConstPtr {
         self.cast()
     }
 
-    fn to_bits(self) -> Self::Bits {
+    #[inline]
+    fn addr(self) -> Self::Usize {
         self.cast()
     }
 
-    fn from_bits(bits: Self::Bits) -> Self {
-        bits.cast()
+    #[inline]
+    fn wrapping_add(self, count: Self::Usize) -> Self {
+        let addr = self.addr() + (count * Simd::splat(core::mem::size_of::<T>()));
+        // Safety: transmuting usize to pointers is safe, even if accessing those pointers isn't.
+        unsafe { core::mem::transmute_copy(&addr) }
     }
 }
diff --git a/crates/core_simd/src/eq.rs b/crates/core_simd/src/eq.rs
index 149380746e713..80763c0727278 100644
--- a/crates/core_simd/src/eq.rs
+++ b/crates/core_simd/src/eq.rs
@@ -1,4 +1,6 @@
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::{
+    intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdElement, SimdMutPtr, SupportedLaneCount,
+};
 
 /// Parallel `PartialEq`.
 pub trait SimdPartialEq {
@@ -80,16 +82,12 @@ where
 
     #[inline]
     fn simd_eq(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+        self.addr().simd_eq(other.addr())
     }
 
     #[inline]
     fn simd_ne(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+        self.addr().simd_ne(other.addr())
     }
 }
 
@@ -101,15 +99,11 @@ where
 
     #[inline]
     fn simd_eq(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+        self.addr().simd_eq(other.addr())
     }
 
     #[inline]
     fn simd_ne(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+        self.addr().simd_ne(other.addr())
     }
 }
diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs
index 95a1ecaeeda75..1ae9cd061fb2d 100644
--- a/crates/core_simd/src/ord.rs
+++ b/crates/core_simd/src/ord.rs
@@ -1,4 +1,6 @@
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{
+    intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdMutPtr, SimdPartialEq, SupportedLaneCount,
+};
 
 /// Parallel `PartialOrd`.
 pub trait SimdPartialOrd: SimdPartialEq {
@@ -218,30 +220,22 @@ where
 {
     #[inline]
     fn simd_lt(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+        self.addr().simd_lt(other.addr())
     }
 
     #[inline]
     fn simd_le(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+        self.addr().simd_le(other.addr())
     }
 
     #[inline]
     fn simd_gt(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+        self.addr().simd_gt(other.addr())
     }
 
     #[inline]
     fn simd_ge(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+        self.addr().simd_ge(other.addr())
     }
 }
 
@@ -275,30 +269,22 @@ where
 {
     #[inline]
     fn simd_lt(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+        self.addr().simd_lt(other.addr())
     }
 
     #[inline]
     fn simd_le(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+        self.addr().simd_le(other.addr())
     }
 
     #[inline]
     fn simd_gt(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+        self.addr().simd_gt(other.addr())
     }
 
     #[inline]
     fn simd_ge(self, other: Self) -> Self::Mask {
-        // Safety: `self` is a vector, and the result of the comparison
-        // is always a valid mask.
-        unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+        self.addr().simd_ge(other.addr())
     }
 }
 
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index cbc8ced5a84e8..145394a519d12 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,8 +1,6 @@
-// Vectors of pointers are not for public use at the current time.
-pub(crate) mod ptr;
-
 use crate::simd::{
-    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdPartialOrd, SupportedLaneCount, Swizzle,
+    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdConstPtr, SimdMutPtr, SimdPartialOrd,
+    SupportedLaneCount, Swizzle,
 };
 
 /// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
@@ -215,8 +213,7 @@ where
     where
         T: SimdCast<U>,
     {
-        // Safety: The input argument is a vector of a valid SIMD element type.
-        unsafe { intrinsics::simd_as(self) }
+        SimdCast::cast(self)
     }
 
     /// Rounds toward zero and converts to the same-width integer type, assuming that
@@ -352,7 +349,7 @@ where
         idxs: Simd<usize, LANES>,
         or: Self,
     ) -> Self {
-        let base_ptr = crate::simd::ptr::SimdConstPtr::splat(slice.as_ptr());
+        let base_ptr = Simd::<*const T, LANES>::splat(slice.as_ptr());
         // Ferris forgive me, I have done pointer arithmetic here.
         let ptrs = base_ptr.wrapping_add(idxs);
         // Safety: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah
@@ -460,7 +457,7 @@ where
         // 3. &mut [T] which will become our base ptr.
         unsafe {
             // Now Entering ☢️ *mut T Zone
-            let base_ptr = crate::simd::ptr::SimdMutPtr::splat(slice.as_mut_ptr());
+            let base_ptr = Simd::<*mut T, LANES>::splat(slice.as_mut_ptr());
             // Ferris forgive me, I have done pointer arithmetic here.
             let ptrs = base_ptr.wrapping_add(idxs);
             // The ptrs have been bounds-masked to prevent memory-unsafe writes insha'allah
diff --git a/crates/core_simd/src/vector/ptr.rs b/crates/core_simd/src/vector/ptr.rs
deleted file mode 100644
index fa756344db91a..0000000000000
--- a/crates/core_simd/src/vector/ptr.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-//! Private implementation details of public gather/scatter APIs.
-use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount};
-
-/// A vector of *const T.
-#[derive(Debug, Copy, Clone)]
-#[repr(simd)]
-pub(crate) struct SimdConstPtr<T, const LANES: usize>([*const T; LANES]);
-
-impl<T, const LANES: usize> SimdConstPtr<T, LANES>
-where
-    LaneCount<LANES>: SupportedLaneCount,
-    T: Sized,
-{
-    #[inline]
-    #[must_use]
-    pub fn splat(ptr: *const T) -> Self {
-        Self([ptr; LANES])
-    }
-
-    #[inline]
-    #[must_use]
-    pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
-        // Safety: this intrinsic doesn't have a precondition
-        unsafe { intrinsics::simd_arith_offset(self, addend) }
-    }
-}
-
-/// A vector of *mut T. Be very careful around potential aliasing.
-#[derive(Debug, Copy, Clone)]
-#[repr(simd)]
-pub(crate) struct SimdMutPtr<T, const LANES: usize>([*mut T; LANES]);
-
-impl<T, const LANES: usize> SimdMutPtr<T, LANES>
-where
-    LaneCount<LANES>: SupportedLaneCount,
-    T: Sized,
-{
-    #[inline]
-    #[must_use]
-    pub fn splat(ptr: *mut T) -> Self {
-        Self([ptr; LANES])
-    }
-
-    #[inline]
-    #[must_use]
-    pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
-        // Safety: this intrinsic doesn't have a precondition
-        unsafe { intrinsics::simd_arith_offset(self, addend) }
-    }
-}
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
new file mode 100644
index 0000000000000..df26c462f93d0
--- /dev/null
+++ b/crates/core_simd/tests/pointers.rs
@@ -0,0 +1,43 @@
+#![feature(portable_simd, strict_provenance)]
+
+use core_simd::{Simd, SimdConstPtr, SimdMutPtr};
+
+macro_rules! common_tests {
+    { $constness:ident } => {
+        test_helpers::test_lanes! {
+            fn is_null<const LANES: usize>() {
+                test_helpers::test_unary_mask_elementwise(
+                    &Simd::<*$constness (), LANES>::is_null,
+                    &<*$constness ()>::is_null,
+                    &|_| true,
+                );
+            }
+
+            fn addr<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &Simd::<*$constness (), LANES>::addr,
+                    &<*$constness ()>::addr,
+                    &|_| true,
+                );
+            }
+
+            fn wrapping_add<const LANES: usize>() {
+                test_helpers::test_binary_elementwise(
+                    &Simd::<*$constness (), LANES>::wrapping_add,
+                    &<*$constness ()>::wrapping_add,
+                    &|_, _| true,
+                );
+            }
+        }
+    }
+}
+
+mod const_ptr {
+    use super::*;
+    common_tests! { const }
+}
+
+mod mut_ptr {
+    use super::*;
+    common_tests! { mut }
+}
diff --git a/crates/test_helpers/src/biteq.rs b/crates/test_helpers/src/biteq.rs
index 00350e22418d0..7d91260d838a6 100644
--- a/crates/test_helpers/src/biteq.rs
+++ b/crates/test_helpers/src/biteq.rs
@@ -55,6 +55,26 @@ macro_rules! impl_float_biteq {
 
 impl_float_biteq! { f32, f64 }
 
+impl<T> BitEq for *const T {
+    fn biteq(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+impl<T> BitEq for *mut T {
+    fn biteq(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
 impl<T: BitEq, const N: usize> BitEq for [T; N] {
     fn biteq(&self, other: &Self) -> bool {
         self.iter()
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 650eadd12bfdf..5f2a928b5e4bb 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -38,6 +38,28 @@ impl_num! { usize }
 impl_num! { f32 }
 impl_num! { f64 }
 
+impl<T> DefaultStrategy for *const T {
+    type Strategy = proptest::strategy::Map<proptest::num::isize::Any, fn(isize) -> *const T>;
+    fn default_strategy() -> Self::Strategy {
+        fn map<T>(x: isize) -> *const T {
+            x as _
+        }
+        use proptest::strategy::Strategy;
+        proptest::num::isize::ANY.prop_map(map)
+    }
+}
+
+impl<T> DefaultStrategy for *mut T {
+    type Strategy = proptest::strategy::Map<proptest::num::isize::Any, fn(isize) -> *mut T>;
+    fn default_strategy() -> Self::Strategy {
+        fn map<T>(x: isize) -> *mut T {
+            x as _
+        }
+        use proptest::strategy::Strategy;
+        proptest::num::isize::ANY.prop_map(map)
+    }
+}
+
 #[cfg(not(target_arch = "wasm32"))]
 impl DefaultStrategy for u128 {
     type Strategy = proptest::num::u128::Any;
@@ -135,21 +157,21 @@ pub fn test_unary_elementwise<Scalar, ScalarResult, Vector, VectorResult, const
     fs: &dyn Fn(Scalar) -> ScalarResult,
     check: &dyn Fn([Scalar; LANES]) -> bool,
 ) where
-    Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy,
-    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Scalar: Copy + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
     Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy,
     VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
 {
     test_1(&|x: [Scalar; LANES]| {
         proptest::prop_assume!(check(x));
         let result_1: [ScalarResult; LANES] = fv(x.into()).into();
-        let result_2: [ScalarResult; LANES] = {
-            let mut result = [ScalarResult::default(); LANES];
-            for (i, o) in x.iter().zip(result.iter_mut()) {
-                *o = fs(*i);
-            }
-            result
-        };
+        let result_2: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .map(fs)
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
         crate::prop_assert_biteq!(result_1, result_2);
         Ok(())
     });
@@ -162,7 +184,7 @@ pub fn test_unary_mask_elementwise<Scalar, Vector, Mask, const LANES: usize>(
     fs: &dyn Fn(Scalar) -> bool,
     check: &dyn Fn([Scalar; LANES]) -> bool,
 ) where
-    Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar: Copy + core::fmt::Debug + DefaultStrategy,
     Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy,
     Mask: Into<[bool; LANES]> + From<[bool; LANES]> + Copy,
 {
@@ -196,9 +218,9 @@ pub fn test_binary_elementwise<
     fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult,
     check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES]) -> bool,
 ) where
-    Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy,
-    Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy,
-    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Scalar1: Copy + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
     Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
     Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
     VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
@@ -206,13 +228,14 @@ pub fn test_binary_elementwise<
     test_2(&|x: [Scalar1; LANES], y: [Scalar2; LANES]| {
         proptest::prop_assume!(check(x, y));
         let result_1: [ScalarResult; LANES] = fv(x.into(), y.into()).into();
-        let result_2: [ScalarResult; LANES] = {
-            let mut result = [ScalarResult::default(); LANES];
-            for ((i1, i2), o) in x.iter().zip(y.iter()).zip(result.iter_mut()) {
-                *o = fs(*i1, *i2);
-            }
-            result
-        };
+        let result_2: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .zip(y.iter().copied())
+            .map(|(x, y)| fs(x, y))
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
         crate::prop_assert_biteq!(result_1, result_2);
         Ok(())
     });

From 6b3c599ba29e46fd7011cf1f01ec6c4cfda395cf Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 23 Jun 2022 01:40:51 -0400
Subject: [PATCH 18/70] Add missing safety comment

---
 crates/core_simd/src/vector.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 145394a519d12..2fc090254d74f 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -741,11 +741,15 @@ unsafe impl SimdElement for f64 {
 }
 
 impl<T> Sealed for *const T {}
+
+// Safety: const pointers are valid SIMD element types, and are supported by this API
 unsafe impl<T> SimdElement for *const T {
     type Mask = isize;
 }
 
 impl<T> Sealed for *mut T {}
+
+// Safety: mut pointers are valid SIMD element types, and are supported by this API
 unsafe impl<T> SimdElement for *mut T {
     type Mask = isize;
 }

From f10e591de1d321b57af68502a78eef6f8f80c05c Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 24 Jun 2022 00:13:36 -0400
Subject: [PATCH 19/70] Fix wrapping pointer arithmetic

---
 crates/core_simd/src/elements/const_ptr.rs | 33 ++++++++++++++++++----
 crates/core_simd/src/elements/mut_ptr.rs   | 33 ++++++++++++++++++----
 crates/core_simd/src/intrinsics.rs         |  3 ++
 crates/core_simd/tests/pointers.rs         | 16 +++++++++++
 4 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index c4a254f5ab1fa..d10bd1481d06d 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -1,11 +1,14 @@
 use super::sealed::Sealed;
-use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
-    /// Vector of usize with the same number of lanes.
+    /// Vector of `usize` with the same number of lanes.
     type Usize;
 
+    /// Vector of `isize` with the same number of lanes.
+    type Isize;
+
     /// Vector of mutable pointers to the same type.
     type MutPtr;
 
@@ -23,10 +26,20 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each lane.
     fn addr(self) -> Self::Usize;
 
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
+    fn wrapping_offset(self, offset: Self::Isize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
     fn wrapping_add(self, count: Self::Usize) -> Self;
+
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
+    fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 
 impl<T, const LANES: usize> Sealed for Simd<*const T, LANES> where
@@ -39,6 +52,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     type Usize = Simd<usize, LANES>;
+    type Isize = Simd<isize, LANES>;
     type MutPtr = Simd<*mut T, LANES>;
     type Mask = Mask<isize, LANES>;
 
@@ -57,10 +71,19 @@ where
         self.cast()
     }
 
+    #[inline]
+    fn wrapping_offset(self, count: Self::Isize) -> Self {
+        // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets
+        unsafe { intrinsics::simd_arith_offset(self, count) }
+    }
+
     #[inline]
     fn wrapping_add(self, count: Self::Usize) -> Self {
-        let addr = self.addr() + (count * Simd::splat(core::mem::size_of::<T>()));
-        // Safety: transmuting usize to pointers is safe, even if accessing those pointers isn't.
-        unsafe { core::mem::transmute_copy(&addr) }
+        self.wrapping_offset(count.cast())
+    }
+
+    #[inline]
+    fn wrapping_sub(self, count: Self::Usize) -> Self {
+        self.wrapping_offset(-count.cast::<isize>())
     }
 }
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 5920960c49cea..4fc6202e14ef0 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -1,11 +1,14 @@
 use super::sealed::Sealed;
-use crate::simd::{LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
-    /// Vector of usize with the same number of lanes.
+    /// Vector of `usize` with the same number of lanes.
     type Usize;
 
+    /// Vector of `isize` with the same number of lanes.
+    type Isize;
+
     /// Vector of constant pointers to the same type.
     type ConstPtr;
 
@@ -23,10 +26,20 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each lane.
     fn addr(self) -> Self::Usize;
 
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
+    fn wrapping_offset(self, offset: Self::Isize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
     fn wrapping_add(self, count: Self::Usize) -> Self;
+
+    /// Calculates the offset from a pointer using wrapping arithmetic.
+    ///
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
+    fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 
 impl<T, const LANES: usize> Sealed for Simd<*mut T, LANES> where LaneCount<LANES>: SupportedLaneCount
@@ -37,6 +50,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
 {
     type Usize = Simd<usize, LANES>;
+    type Isize = Simd<isize, LANES>;
     type ConstPtr = Simd<*const T, LANES>;
     type Mask = Mask<isize, LANES>;
 
@@ -55,10 +69,19 @@ where
         self.cast()
     }
 
+    #[inline]
+    fn wrapping_offset(self, count: Self::Isize) -> Self {
+        // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets
+        unsafe { intrinsics::simd_arith_offset(self, count) }
+    }
+
     #[inline]
     fn wrapping_add(self, count: Self::Usize) -> Self {
-        let addr = self.addr() + (count * Simd::splat(core::mem::size_of::<T>()));
-        // Safety: transmuting usize to pointers is safe, even if accessing those pointers isn't.
-        unsafe { core::mem::transmute_copy(&addr) }
+        self.wrapping_offset(count.cast())
+    }
+
+    #[inline]
+    fn wrapping_sub(self, count: Self::Usize) -> Self {
+        self.wrapping_offset(-count.cast::<isize>())
     }
 }
diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 6047890a09393..41128cd148196 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -151,4 +151,7 @@ extern "platform-intrinsic" {
     pub(crate) fn simd_select<M, T>(m: M, yes: T, no: T) -> T;
     #[allow(unused)]
     pub(crate) fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
+
+    // equivalent to wrapping_offset
+    pub(crate) fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
 }
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index df26c462f93d0..2c20362119ef6 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -21,6 +21,14 @@ macro_rules! common_tests {
                 );
             }
 
+            fn wrapping_offset<const LANES: usize>() {
+                test_helpers::test_binary_elementwise(
+                    &Simd::<*$constness (), LANES>::wrapping_offset,
+                    &<*$constness ()>::wrapping_offset,
+                    &|_, _| true,
+                );
+            }
+
             fn wrapping_add<const LANES: usize>() {
                 test_helpers::test_binary_elementwise(
                     &Simd::<*$constness (), LANES>::wrapping_add,
@@ -28,6 +36,14 @@ macro_rules! common_tests {
                     &|_, _| true,
                 );
             }
+
+            fn wrapping_sub<const LANES: usize>() {
+                test_helpers::test_binary_elementwise(
+                    &Simd::<*$constness (), LANES>::wrapping_sub,
+                    &<*$constness ()>::wrapping_sub,
+                    &|_, _| true,
+                );
+            }
         }
     }
 }

From da25087f790e9c15fcf633a39e5de307608c9251 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 24 Jun 2022 01:26:24 -0400
Subject: [PATCH 20/70] Test a more useful pointer

---
 crates/core_simd/tests/pointers.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 2c20362119ef6..8eb0bd84042bd 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -7,40 +7,40 @@ macro_rules! common_tests {
         test_helpers::test_lanes! {
             fn is_null<const LANES: usize>() {
                 test_helpers::test_unary_mask_elementwise(
-                    &Simd::<*$constness (), LANES>::is_null,
-                    &<*$constness ()>::is_null,
+                    &Simd::<*$constness u32, LANES>::is_null,
+                    &<*$constness u32>::is_null,
                     &|_| true,
                 );
             }
 
             fn addr<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
-                    &Simd::<*$constness (), LANES>::addr,
-                    &<*$constness ()>::addr,
+                    &Simd::<*$constness u32, LANES>::addr,
+                    &<*$constness u32>::addr,
                     &|_| true,
                 );
             }
 
             fn wrapping_offset<const LANES: usize>() {
                 test_helpers::test_binary_elementwise(
-                    &Simd::<*$constness (), LANES>::wrapping_offset,
-                    &<*$constness ()>::wrapping_offset,
+                    &Simd::<*$constness u32, LANES>::wrapping_offset,
+                    &<*$constness u32>::wrapping_offset,
                     &|_, _| true,
                 );
             }
 
             fn wrapping_add<const LANES: usize>() {
                 test_helpers::test_binary_elementwise(
-                    &Simd::<*$constness (), LANES>::wrapping_add,
-                    &<*$constness ()>::wrapping_add,
+                    &Simd::<*$constness u32, LANES>::wrapping_add,
+                    &<*$constness u32>::wrapping_add,
                     &|_, _| true,
                 );
             }
 
             fn wrapping_sub<const LANES: usize>() {
                 test_helpers::test_binary_elementwise(
-                    &Simd::<*$constness (), LANES>::wrapping_sub,
-                    &<*$constness ()>::wrapping_sub,
+                    &Simd::<*$constness u32, LANES>::wrapping_sub,
+                    &<*$constness u32>::wrapping_sub,
                     &|_, _| true,
                 );
             }

From e7cc021189f1d18974057d60223bdbb5abd4dc15 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 25 Jun 2022 00:00:20 -0400
Subject: [PATCH 21/70] Fix casts

---
 crates/core_simd/src/cast.rs               | 158 +++++++--------------
 crates/core_simd/src/elements/const_ptr.rs |  33 ++++-
 crates/core_simd/src/elements/mut_ptr.rs   |  30 +++-
 crates/core_simd/src/vector.rs             |   5 +-
 4 files changed, 115 insertions(+), 111 deletions(-)

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index d62d3f6635d53..ddcc786afa445 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -1,129 +1,79 @@
 use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount};
 
 /// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
-pub trait SimdCast<Target: SimdElement>: SimdElement {
+pub unsafe trait SimdCast<Target: SimdElement>: SimdElement {
     #[doc(hidden)]
     fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<Target, LANES>
     where
-        LaneCount<LANES>: SupportedLaneCount;
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: implementing this trait indicates that the types are supported by `simd_as`
+        unsafe { intrinsics::simd_as(x) }
+    }
+
+    #[doc(hidden)]
+    unsafe fn cast_unchecked<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<Target, LANES>
+    where
+        LaneCount<LANES>: SupportedLaneCount,
+    {
+        // Safety: implementing this trait indicates that the types are supported by `simd_cast`
+        // The caller is responsible for the conversion invariants.
+        unsafe { intrinsics::simd_cast(x) }
+    }
 }
 
 macro_rules! into_number {
-    { $from:ty, $to:ty } => {
-        impl SimdCast<$to> for $from {
-            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$to, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                // Safety: simd_as can handle numeric conversions
-                unsafe { intrinsics::simd_as(x) }
-            }
-        }
+    { unsafe $from:ty as $to:ty } => {
+        // Safety: casting between numbers is supported by `simd_cast` and `simd_as`
+        unsafe impl SimdCast<$to> for $from {}
     };
-    { $($type:ty),* } => {
+    { unsafe $($type:ty),* } => {
         $(
-        into_number! { $type, i8 }
-        into_number! { $type, i16 }
-        into_number! { $type, i32 }
-        into_number! { $type, i64 }
-        into_number! { $type, isize }
+        into_number! { unsafe $type as i8 }
+        into_number! { unsafe $type as i16 }
+        into_number! { unsafe $type as i32 }
+        into_number! { unsafe $type as i64 }
+        into_number! { unsafe $type as isize }
 
-        into_number! { $type, u8 }
-        into_number! { $type, u16 }
-        into_number! { $type, u32 }
-        into_number! { $type, u64 }
-        into_number! { $type, usize }
+        into_number! { unsafe $type as u8 }
+        into_number! { unsafe $type as u16 }
+        into_number! { unsafe $type as u32 }
+        into_number! { unsafe $type as u64 }
+        into_number! { unsafe $type as usize }
 
-        into_number! { $type, f32 }
-        into_number! { $type, f64 }
+        into_number! { unsafe $type as f32 }
+        into_number! { unsafe $type as f64 }
         )*
     }
 }
 
-into_number! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, f32, f64 }
+into_number! { unsafe i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, f32, f64 }
 
+// TODO uncomment pending PR to rustc
+/*
 macro_rules! into_pointer {
-    { $($type:ty),* } => {
+    { unsafe $($type:ty),* } => {
         $(
-        impl<T> SimdCast<$type> for *const T {
-            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$type, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                // Safety: transmuting isize to pointers is safe
-                let x: Simd<isize, LANES> = unsafe { core::mem::transmute_copy(&x) };
-                x.cast()
-            }
-        }
-        impl<T> SimdCast<$type> for *mut T {
-            fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<$type, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                // Safety: transmuting isize to pointers is safe
-                let x: Simd<isize, LANES> = unsafe { core::mem::transmute_copy(&x) };
-                x.cast()
-            }
-        }
-        impl<T> SimdCast<*const T> for $type {
-            fn cast<const LANES: usize>(x: Simd<$type, LANES>) -> Simd<*const T, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                let x: Simd<isize, LANES> = x.cast();
-                // Safety: transmuting isize to pointers is safe
-                unsafe { core::mem::transmute_copy(&x) }
-            }
-        }
-        impl<T> SimdCast<*mut T> for $type {
-            fn cast<const LANES: usize>(x: Simd<$type, LANES>) -> Simd<*mut T, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-            {
-                let x: Simd<isize, LANES> = x.cast();
-                // Safety: transmuting isize to pointers is safe
-                unsafe { core::mem::transmute_copy(&x) }
-            }
-        }
+        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
+        unsafe impl<T> SimdCast<$type> for *const T {}
+        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
+        unsafe impl<T> SimdCast<$type> for *mut T {}
+        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
+        unsafe impl<T> SimdCast<*const T> for $type {}
+        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
+        unsafe impl<T> SimdCast<*mut T> for $type {}
         )*
     }
 }
 
-into_pointer! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
+into_pointer! { unsafe i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
 
-impl<T, U> SimdCast<*const T> for *const U {
-    fn cast<const LANES: usize>(x: Simd<*const U, LANES>) -> Simd<*const T, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: transmuting pointers is safe
-        unsafe { core::mem::transmute_copy(&x) }
-    }
-}
-impl<T, U> SimdCast<*const T> for *mut U {
-    fn cast<const LANES: usize>(x: Simd<*mut U, LANES>) -> Simd<*const T, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: transmuting pointers is safe
-        unsafe { core::mem::transmute_copy(&x) }
-    }
-}
-impl<T, U> SimdCast<*mut T> for *const U {
-    fn cast<const LANES: usize>(x: Simd<*const U, LANES>) -> Simd<*mut T, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: transmuting pointers is safe
-        unsafe { core::mem::transmute_copy(&x) }
-    }
-}
-impl<T, U> SimdCast<*mut T> for *mut U {
-    fn cast<const LANES: usize>(x: Simd<*mut U, LANES>) -> Simd<*mut T, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: transmuting pointers is safe
-        unsafe { core::mem::transmute_copy(&x) }
-    }
-}
+// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
+unsafe impl<T, U> SimdCast<*const T> for *const U {}
+// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
+unsafe impl<T, U> SimdCast<*const T> for *mut U {}
+// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
+unsafe impl<T, U> SimdCast<*mut T> for *const U {}
+// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
+unsafe impl<T, U> SimdCast<*mut T> for *mut U {}
+*/
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index d10bd1481d06d..5a5faad23c81d 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -23,9 +23,23 @@ pub trait SimdConstPtr: Copy + Sealed {
 
     /// Gets the "address" portion of the pointer.
     ///
+    /// This method discards pointer semantic metadata, so the result cannot be
+    /// directly cast into a valid pointer.
+    ///
+    /// This method semantically discards *provenance* and
+    /// *address-space* information. To properly restore that information, use [`Self::with_addr`].
+    ///
     /// Equivalent to calling [`pointer::addr`] on each lane.
     fn addr(self) -> Self::Usize;
 
+    /// Creates a new pointer with the given address.
+    ///
+    /// This performs the same operation as a cast, but copies the *address-space* and
+    /// *provenance* of `self` to the new pointer.
+    ///
+    /// Equivalent to calling [`pointer::with_addr`] on each lane.
+    fn with_addr(self, addr: Self::Usize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
@@ -63,12 +77,27 @@ where
 
     #[inline]
     fn as_mut(self) -> Self::MutPtr {
-        self.cast()
+        unimplemented!()
+        //self.cast()
     }
 
     #[inline]
     fn addr(self) -> Self::Usize {
-        self.cast()
+        // Safety: Since `addr` discards provenance, this is safe.
+        unsafe { core::mem::transmute_copy(&self) }
+
+        //TODO switch to casts when available
+        //self.cast()
+    }
+
+    #[inline]
+    fn with_addr(self, addr: Self::Usize) -> Self {
+        unimplemented!()
+        /*
+        self.cast::<*const u8>()
+            .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
+            .cast()
+        */
     }
 
     #[inline]
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 4fc6202e14ef0..d7b05af0eac5d 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -23,9 +23,20 @@ pub trait SimdMutPtr: Copy + Sealed {
 
     /// Gets the "address" portion of the pointer.
     ///
+    /// This method discards pointer semantic metadata, so the result cannot be
+    /// directly cast into a valid pointer.
+    ///
     /// Equivalent to calling [`pointer::addr`] on each lane.
     fn addr(self) -> Self::Usize;
 
+    /// Creates a new pointer with the given address.
+    ///
+    /// This performs the same operation as a cast, but copies the *address-space* and
+    /// *provenance* of `self` to the new pointer.
+    ///
+    /// Equivalent to calling [`pointer::with_addr`] on each lane.
+    fn with_addr(self, addr: Self::Usize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
@@ -61,12 +72,27 @@ where
 
     #[inline]
     fn as_const(self) -> Self::ConstPtr {
-        self.cast()
+        unimplemented!()
+        //self.cast()
     }
 
     #[inline]
     fn addr(self) -> Self::Usize {
-        self.cast()
+        // Safety: Since `addr` discards provenance, this is safe.
+        unsafe { core::mem::transmute_copy(&self) }
+
+        //TODO switch to casts when available
+        //self.cast()
+    }
+
+    #[inline]
+    fn with_addr(self, addr: Self::Usize) -> Self {
+        unimplemented!()
+        /*
+        self.cast::<*mut u8>()
+            .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
+            .cast()
+        */
     }
 
     #[inline]
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 2fc090254d74f..3987b7a747b65 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -237,9 +237,8 @@ where
         T: core::convert::FloatToInt<I> + SimdCast<I>,
         I: SimdElement,
     {
-        // Safety: `self` is a vector, and `FloatToInt` ensures the type can be casted to
-        // an integer.
-        unsafe { intrinsics::simd_cast(self) }
+        // Safety: the caller is responsible for the invariants
+        unsafe { SimdCast::cast_unchecked(self) }
     }
 
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.

From 8a5a5732a1527fbdffbc825ae630d911fc130e2e Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 26 Jun 2022 10:07:48 -0400
Subject: [PATCH 22/70] Clarify addr and with_addr implementations

---
 crates/core_simd/src/elements/const_ptr.rs | 14 +++++++++-----
 crates/core_simd/src/elements/mut_ptr.rs   | 14 +++++++++-----
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 5a5faad23c81d..3485d31e44dff 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -83,17 +83,21 @@ where
 
     #[inline]
     fn addr(self) -> Self::Usize {
-        // Safety: Since `addr` discards provenance, this is safe.
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Pointer-to-integer transmutes are valid (if you are okay with losing the
+        // provenance).
         unsafe { core::mem::transmute_copy(&self) }
-
-        //TODO switch to casts when available
-        //self.cast()
     }
 
     #[inline]
-    fn with_addr(self, addr: Self::Usize) -> Self {
+    fn with_addr(self, _addr: Self::Usize) -> Self {
         unimplemented!()
         /*
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        //
+        // In the mean-time, this operation is defined to be "as if" it was
+        // a wrapping_offset, so we can emulate it as such. This should properly
+        // restore pointer provenance even under today's compiler.
         self.cast::<*const u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
             .cast()
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index d7b05af0eac5d..39fe9f3562119 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -78,17 +78,21 @@ where
 
     #[inline]
     fn addr(self) -> Self::Usize {
-        // Safety: Since `addr` discards provenance, this is safe.
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Pointer-to-integer transmutes are valid (if you are okay with losing the
+        // provenance).
         unsafe { core::mem::transmute_copy(&self) }
-
-        //TODO switch to casts when available
-        //self.cast()
     }
 
     #[inline]
-    fn with_addr(self, addr: Self::Usize) -> Self {
+    fn with_addr(self, _addr: Self::Usize) -> Self {
         unimplemented!()
         /*
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        //
+        // In the mean-time, this operation is defined to be "as if" it was
+        // a wrapping_offset, so we can emulate it as such. This should properly
+        // restore pointer provenance even under today's compiler.
         self.cast::<*mut u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
             .cast()

From 176cc81324d008bd58e28136aa8e60b537caa3ce Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 4 Aug 2022 19:31:50 -0400
Subject: [PATCH 23/70] Update for new intrinsics

---
 crates/core_simd/src/cast.rs               | 98 +++++-----------------
 crates/core_simd/src/elements/const_ptr.rs | 31 +++++--
 crates/core_simd/src/elements/mut_ptr.rs   | 28 +++++--
 crates/core_simd/src/intrinsics.rs         | 23 +++++
 crates/core_simd/src/vector.rs             | 30 +++++--
 5 files changed, 110 insertions(+), 100 deletions(-)

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index ddcc786afa445..d14b0de5d5e7e 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -1,79 +1,23 @@
-use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::SimdElement;
 
 /// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
-pub unsafe trait SimdCast<Target: SimdElement>: SimdElement {
-    #[doc(hidden)]
-    fn cast<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<Target, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: implementing this trait indicates that the types are supported by `simd_as`
-        unsafe { intrinsics::simd_as(x) }
-    }
-
-    #[doc(hidden)]
-    unsafe fn cast_unchecked<const LANES: usize>(x: Simd<Self, LANES>) -> Simd<Target, LANES>
-    where
-        LaneCount<LANES>: SupportedLaneCount,
-    {
-        // Safety: implementing this trait indicates that the types are supported by `simd_cast`
-        // The caller is responsible for the conversion invariants.
-        unsafe { intrinsics::simd_cast(x) }
-    }
-}
-
-macro_rules! into_number {
-    { unsafe $from:ty as $to:ty } => {
-        // Safety: casting between numbers is supported by `simd_cast` and `simd_as`
-        unsafe impl SimdCast<$to> for $from {}
-    };
-    { unsafe $($type:ty),* } => {
-        $(
-        into_number! { unsafe $type as i8 }
-        into_number! { unsafe $type as i16 }
-        into_number! { unsafe $type as i32 }
-        into_number! { unsafe $type as i64 }
-        into_number! { unsafe $type as isize }
-
-        into_number! { unsafe $type as u8 }
-        into_number! { unsafe $type as u16 }
-        into_number! { unsafe $type as u32 }
-        into_number! { unsafe $type as u64 }
-        into_number! { unsafe $type as usize }
-
-        into_number! { unsafe $type as f32 }
-        into_number! { unsafe $type as f64 }
-        )*
-    }
-}
-
-into_number! { unsafe i8, i16, i32, i64, isize, u8, u16, u32, u64, usize, f32, f64 }
-
-// TODO uncomment pending PR to rustc
-/*
-macro_rules! into_pointer {
-    { unsafe $($type:ty),* } => {
-        $(
-        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
-        unsafe impl<T> SimdCast<$type> for *const T {}
-        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
-        unsafe impl<T> SimdCast<$type> for *mut T {}
-        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
-        unsafe impl<T> SimdCast<*const T> for $type {}
-        // Safety: casting between numbers and pointers is supported by `simd_cast` and `simd_as`
-        unsafe impl<T> SimdCast<*mut T> for $type {}
-        )*
-    }
-}
-
-into_pointer! { unsafe i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
-
-// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
-unsafe impl<T, U> SimdCast<*const T> for *const U {}
-// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
-unsafe impl<T, U> SimdCast<*const T> for *mut U {}
-// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
-unsafe impl<T, U> SimdCast<*mut T> for *const U {}
-// Safety: casting between pointers is supported by `simd_cast` and `simd_as`
-unsafe impl<T, U> SimdCast<*mut T> for *mut U {}
-*/
+pub unsafe trait SimdCast: SimdElement {}
+
+unsafe impl SimdCast for i8 {}
+unsafe impl SimdCast for i16 {}
+unsafe impl SimdCast for i32 {}
+unsafe impl SimdCast for i64 {}
+unsafe impl SimdCast for isize {}
+unsafe impl SimdCast for u8 {}
+unsafe impl SimdCast for u16 {}
+unsafe impl SimdCast for u32 {}
+unsafe impl SimdCast for u64 {}
+unsafe impl SimdCast for usize {}
+unsafe impl SimdCast for f32 {}
+unsafe impl SimdCast for f64 {}
+
+/// Supporting trait for `Simd::cast_ptr`.  Typically doesn't need to be used directly.
+pub unsafe trait SimdCastPtr: SimdElement {}
+
+unsafe impl<T> SimdCastPtr for *const T {}
+unsafe impl<T> SimdCastPtr for *mut T {}
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 3485d31e44dff..27b41019dc8cf 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -40,6 +40,15 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each lane.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
+    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
+    /// in [`from_exposed_addr`].
+    fn expose_addr(self) -> Self::Usize;
+
+    /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
+    ///
+    /// Equivalent to calling [`pointer::from_exposed_addr`] on each lane.
+    fn from_exposed_addr(addr: Self::Usize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
@@ -77,8 +86,7 @@ where
 
     #[inline]
     fn as_mut(self) -> Self::MutPtr {
-        unimplemented!()
-        //self.cast()
+        unsafe { intrinsics::simd_cast_ptr(self) }
     }
 
     #[inline]
@@ -90,18 +98,25 @@ where
     }
 
     #[inline]
-    fn with_addr(self, _addr: Self::Usize) -> Self {
-        unimplemented!()
-        /*
+    fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
         //
         // In the mean-time, this operation is defined to be "as if" it was
         // a wrapping_offset, so we can emulate it as such. This should properly
         // restore pointer provenance even under today's compiler.
-        self.cast::<*const u8>()
+        self.cast_ptr::<*const u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
-            .cast()
-        */
+            .cast_ptr()
+    }
+
+    #[inline]
+    fn expose_addr(self) -> Self::Usize {
+        unsafe { intrinsics::simd_expose_addr(self) }
+    }
+
+    #[inline]
+    fn from_exposed_addr(addr: Self::Usize) -> Self {
+        unsafe { intrinsics::simd_from_exposed_addr(addr) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 39fe9f3562119..59a8b6293b5d8 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -37,6 +37,15 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each lane.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
+    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
+    /// in [`from_exposed_addr`].
+    fn expose_addr(self) -> Self::Usize;
+
+    /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
+    ///
+    /// Equivalent to calling [`pointer::from_exposed_addr`] on each lane.
+    fn from_exposed_addr(addr: Self::Usize) -> Self;
+
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
     /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
@@ -85,18 +94,25 @@ where
     }
 
     #[inline]
-    fn with_addr(self, _addr: Self::Usize) -> Self {
-        unimplemented!()
-        /*
+    fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
         //
         // In the mean-time, this operation is defined to be "as if" it was
         // a wrapping_offset, so we can emulate it as such. This should properly
         // restore pointer provenance even under today's compiler.
-        self.cast::<*mut u8>()
+        self.cast_ptr::<*mut u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
-            .cast()
-        */
+            .cast_ptr()
+    }
+
+    #[inline]
+    fn expose_addr(self) -> Self::Usize {
+        unsafe { intrinsics::simd_expose_addr(self) }
+    }
+
+    #[inline]
+    fn from_exposed_addr(addr: Self::Usize) -> Self {
+        unsafe { intrinsics::simd_from_exposed_addr(addr) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 41128cd148196..c0fbae2db0841 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -154,4 +154,27 @@ extern "platform-intrinsic" {
 
     // equivalent to wrapping_offset
     pub(crate) fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
+
+    /*
+    /// equivalent to `T as U` semantics, specifically for pointers
+    pub(crate) fn simd_cast_ptr<T, U>(ptr: T) -> U;
+
+    /// expose a pointer as an address
+    pub(crate) fn simd_expose_addr<T, U>(ptr: T) -> U;
+
+    /// convert an exposed address back to a pointer
+    pub(crate) fn simd_from_exposed_addr<T, U>(addr: T) -> U;
+    */
+}
+
+pub(crate) unsafe fn simd_cast_ptr<T, U>(_ptr: T) -> U {
+    unimplemented!()
+}
+
+pub(crate) unsafe fn simd_expose_addr<T, U>(_ptr: T) -> U {
+    unimplemented!()
+}
+
+pub(crate) unsafe fn simd_from_exposed_addr<T, U>(_addr: T) -> U {
+    unimplemented!()
 }
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 3987b7a747b65..3c435c4c80593 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,6 +1,6 @@
 use crate::simd::{
-    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdConstPtr, SimdMutPtr, SimdPartialOrd,
-    SupportedLaneCount, Swizzle,
+    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdCastPtr, SimdConstPtr, SimdMutPtr,
+    SimdPartialOrd, SupportedLaneCount, Swizzle,
 };
 
 /// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
@@ -209,11 +209,23 @@ where
     #[must_use]
     #[inline]
     #[cfg(not(bootstrap))]
-    pub fn cast<U: SimdElement>(self) -> Simd<U, LANES>
+    pub fn cast<U: SimdCast>(self) -> Simd<U, LANES>
     where
-        T: SimdCast<U>,
+        T: SimdCast,
     {
-        SimdCast::cast(self)
+        // Safety: supported types are guaranteed by SimdCast
+        unsafe { intrinsics::simd_as(self) }
+    }
+
+    /// Lanewise casts pointers to another pointer type.
+    #[must_use]
+    #[inline]
+    pub fn cast_ptr<U: SimdCastPtr>(self) -> Simd<U, LANES>
+    where
+        T: SimdCastPtr,
+    {
+        // Safety: supported types are guaranteed by SimdCastPtr
+        unsafe { intrinsics::simd_cast_ptr(self) }
     }
 
     /// Rounds toward zero and converts to the same-width integer type, assuming that
@@ -234,11 +246,11 @@ where
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
     where
-        T: core::convert::FloatToInt<I> + SimdCast<I>,
-        I: SimdElement,
+        T: core::convert::FloatToInt<I> + SimdCast,
+        I: SimdCast,
     {
-        // Safety: the caller is responsible for the invariants
-        unsafe { SimdCast::cast_unchecked(self) }
+        // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants
+        unsafe { intrinsics::simd_cast(self) }
     }
 
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.

From dadf98a290e4f52d02a469f97931b90e953a98cf Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 4 Aug 2022 19:38:56 -0400
Subject: [PATCH 24/70] Remove duplicate intrinsic

---
 crates/core_simd/src/intrinsics.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index c0fbae2db0841..45f01fa0f77b0 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -61,9 +61,6 @@ extern "platform-intrinsic" {
     /// xor
     pub(crate) fn simd_xor<T>(x: T, y: T) -> T;
 
-    /// getelementptr (without inbounds)
-    pub(crate) fn simd_arith_offset<T, U>(ptrs: T, offsets: U) -> T;
-
     /// fptoui/fptosi/uitofp/sitofp
     /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5
     /// but the truncated value must fit in the target type or the result is poison.
@@ -152,7 +149,8 @@ extern "platform-intrinsic" {
     #[allow(unused)]
     pub(crate) fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
 
-    // equivalent to wrapping_offset
+    /// getelementptr (without inbounds)
+    /// equivalent to wrapping_offset
     pub(crate) fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
 
     /*

From e5db1ecc8209e90982cc4603514028ef2210e592 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 4 Aug 2022 19:46:39 -0400
Subject: [PATCH 25/70] Fix documentation

---
 crates/core_simd/src/elements/const_ptr.rs | 6 +++---
 crates/core_simd/src/elements/mut_ptr.rs   | 4 ++--
 crates/core_simd/src/lib.rs                | 3 ++-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 27b41019dc8cf..0a3d4ec40870d 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -27,7 +27,7 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// directly cast into a valid pointer.
     ///
     /// This method semantically discards *provenance* and
-    /// *address-space* information. To properly restore that information, use [`with_addr`].
+    /// *address-space* information. To properly restore that information, use [`Self::with_addr`].
     ///
     /// Equivalent to calling [`pointer::addr`] on each lane.
     fn addr(self) -> Self::Usize;
@@ -41,12 +41,12 @@ pub trait SimdConstPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`from_exposed_addr`].
+    /// in [`Self::from_exposed_addr`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`pointer::from_exposed_addr`] on each lane.
+    /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each lane.
     fn from_exposed_addr(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 59a8b6293b5d8..e6aa9808f3732 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -38,12 +38,12 @@ pub trait SimdMutPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`from_exposed_addr`].
+    /// in [`Self::from_exposed_addr`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`pointer::from_exposed_addr`] on each lane.
+    /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each lane.
     fn from_exposed_addr(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 715f258f617df..05ac3e9338b86 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -7,7 +7,8 @@
     repr_simd,
     simd_ffi,
     staged_api,
-    stdsimd
+    stdsimd,
+    strict_provenance
 )]
 #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))]
 #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))]

From 0fcc4069c12a4cffa69397388a0be42d45afdd49 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 4 Aug 2022 20:17:16 -0400
Subject: [PATCH 26/70] Fix pointer mutability casts and safety lints

---
 crates/core_simd/src/cast.rs               | 22 ++++++++++++++++++++++
 crates/core_simd/src/elements/const_ptr.rs |  4 +++-
 crates/core_simd/src/elements/mut_ptr.rs   |  5 +++--
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index d14b0de5d5e7e..33878581e0b5f 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -1,23 +1,45 @@
 use crate::simd::SimdElement;
 
 /// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
+///
+/// # Safety
+/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast` or
+/// `simd_as` intrinsics.
 pub unsafe trait SimdCast: SimdElement {}
 
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for i8 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for i16 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for i32 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for i64 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for isize {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for u8 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for u16 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for u32 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for u64 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for usize {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for f32 {}
+// Safety: primitive number types can be cast to other primitive number types
 unsafe impl SimdCast for f64 {}
 
 /// Supporting trait for `Simd::cast_ptr`.  Typically doesn't need to be used directly.
+///
+/// # Safety
+/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast_ptr`
+/// intrinsic.
 pub unsafe trait SimdCastPtr: SimdElement {}
 
+// Safety: pointers can be cast to other pointer types
 unsafe impl<T> SimdCastPtr for *const T {}
+// Safety: pointers can be cast to other pointer types
 unsafe impl<T> SimdCastPtr for *mut T {}
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 0a3d4ec40870d..7c856fd4332b8 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -86,7 +86,7 @@ where
 
     #[inline]
     fn as_mut(self) -> Self::MutPtr {
-        unsafe { intrinsics::simd_cast_ptr(self) }
+        self.cast_ptr()
     }
 
     #[inline]
@@ -111,11 +111,13 @@ where
 
     #[inline]
     fn expose_addr(self) -> Self::Usize {
+        // Safety: `self` is a pointer vector
         unsafe { intrinsics::simd_expose_addr(self) }
     }
 
     #[inline]
     fn from_exposed_addr(addr: Self::Usize) -> Self {
+        // Safety: `addr` is an address vector and `Self` is a pointer vector
         unsafe { intrinsics::simd_from_exposed_addr(addr) }
     }
 
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index e6aa9808f3732..5e904d24a42e3 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -81,8 +81,7 @@ where
 
     #[inline]
     fn as_const(self) -> Self::ConstPtr {
-        unimplemented!()
-        //self.cast()
+        self.cast_ptr()
     }
 
     #[inline]
@@ -107,11 +106,13 @@ where
 
     #[inline]
     fn expose_addr(self) -> Self::Usize {
+        // Safety: `self` is a pointer vector
         unsafe { intrinsics::simd_expose_addr(self) }
     }
 
     #[inline]
     fn from_exposed_addr(addr: Self::Usize) -> Self {
+        // Safety: `addr` is an address vector and `Self` is a pointer vector
         unsafe { intrinsics::simd_from_exposed_addr(addr) }
     }
 

From a79718ffa8cdfb5ee7ab3d9281b162fe37eb0606 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 18 Sep 2022 16:48:51 -0400
Subject: [PATCH 27/70] Use new intrinsics

---
 crates/core_simd/src/intrinsics.rs | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index 45f01fa0f77b0..d5466822b93c6 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -153,7 +153,6 @@ extern "platform-intrinsic" {
     /// equivalent to wrapping_offset
     pub(crate) fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
 
-    /*
     /// equivalent to `T as U` semantics, specifically for pointers
     pub(crate) fn simd_cast_ptr<T, U>(ptr: T) -> U;
 
@@ -162,17 +161,4 @@ extern "platform-intrinsic" {
 
     /// convert an exposed address back to a pointer
     pub(crate) fn simd_from_exposed_addr<T, U>(addr: T) -> U;
-    */
-}
-
-pub(crate) unsafe fn simd_cast_ptr<T, U>(_ptr: T) -> U {
-    unimplemented!()
-}
-
-pub(crate) unsafe fn simd_expose_addr<T, U>(_ptr: T) -> U {
-    unimplemented!()
-}
-
-pub(crate) unsafe fn simd_from_exposed_addr<T, U>(_addr: T) -> U {
-    unimplemented!()
 }

From 078cb58e766c20a2705f22f7a6f9bc0cf451e16d Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 18 Sep 2022 22:47:34 -0400
Subject: [PATCH 28/70] Apply suggestions from code review

Co-authored-by: Jacob Lifshay <programmerjake@gmail.com>
---
 crates/core_simd/src/elements/const_ptr.rs | 2 +-
 crates/core_simd/src/elements/mut_ptr.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 7c856fd4332b8..f7227a56d58dc 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -61,7 +61,7 @@ pub trait SimdConstPtr: Copy + Sealed {
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
     fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index 5e904d24a42e3..e2fd438ef8fb7 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -58,7 +58,7 @@ pub trait SimdMutPtr: Copy + Sealed {
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
     fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 

From 469c620bded61d265ef020b2442b1f639b2d8c10 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 21 Oct 2022 21:43:48 -0400
Subject: [PATCH 29/70] Account for pointer metadata in pointer bounds

---
 crates/core_simd/src/cast.rs   | 16 +++++++++++++---
 crates/core_simd/src/lib.rs    |  3 ++-
 crates/core_simd/src/vector.rs | 23 +++++++++++++++++------
 3 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index 33878581e0b5f..65a3f845ffca7 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -37,9 +37,19 @@ unsafe impl SimdCast for f64 {}
 /// # Safety
 /// Implementing this trait asserts that the type is a valid vector element for the `simd_cast_ptr`
 /// intrinsic.
-pub unsafe trait SimdCastPtr: SimdElement {}
+pub unsafe trait SimdCastPtr<T> {}
 
 // Safety: pointers can be cast to other pointer types
-unsafe impl<T> SimdCastPtr for *const T {}
+unsafe impl<T, U> SimdCastPtr<T> for *const U
+where
+    U: core::ptr::Pointee,
+    T: core::ptr::Pointee<Metadata = U::Metadata>,
+{
+}
 // Safety: pointers can be cast to other pointer types
-unsafe impl<T> SimdCastPtr for *mut T {}
+unsafe impl<T, U> SimdCastPtr<T> for *mut U
+where
+    U: core::ptr::Pointee,
+    T: core::ptr::Pointee<Metadata = U::Metadata>,
+{
+}
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 05ac3e9338b86..828731629692c 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -8,7 +8,8 @@
     simd_ffi,
     staged_api,
     stdsimd,
-    strict_provenance
+    strict_provenance,
+    ptr_metadata
 )]
 #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))]
 #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))]
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 3c435c4c80593..c5d68f1b921f2 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -220,9 +220,10 @@ where
     /// Lanewise casts pointers to another pointer type.
     #[must_use]
     #[inline]
-    pub fn cast_ptr<U: SimdCastPtr>(self) -> Simd<U, LANES>
+    pub fn cast_ptr<U>(self) -> Simd<U, LANES>
     where
-        T: SimdCastPtr,
+        T: SimdCastPtr<U>,
+        U: SimdElement,
     {
         // Safety: supported types are guaranteed by SimdCastPtr
         unsafe { intrinsics::simd_cast_ptr(self) }
@@ -753,14 +754,24 @@ unsafe impl SimdElement for f64 {
 
 impl<T> Sealed for *const T {}
 
-// Safety: const pointers are valid SIMD element types, and are supported by this API
-unsafe impl<T> SimdElement for *const T {
+// Safety: (thin) const pointers are valid SIMD element types, and are supported by this API
+//
+// Fat pointers may be supported in the future.
+unsafe impl<T> SimdElement for *const T
+where
+    T: core::ptr::Pointee<Metadata = ()>,
+{
     type Mask = isize;
 }
 
 impl<T> Sealed for *mut T {}
 
-// Safety: mut pointers are valid SIMD element types, and are supported by this API
-unsafe impl<T> SimdElement for *mut T {
+// Safety: (thin) mut pointers are valid SIMD element types, and are supported by this API
+//
+// Fat pointers may be supported in the future.
+unsafe impl<T> SimdElement for *mut T
+where
+    T: core::ptr::Pointee<Metadata = ()>,
+{
     type Mask = isize;
 }

From de30820035cb42d05f49575811a9f33661985e67 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 29 Oct 2022 21:39:08 -0400
Subject: [PATCH 30/70] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 80313157ea2c9..e8ac600debe67 100644
--- a/README.md
+++ b/README.md
@@ -47,9 +47,10 @@ The supported element types are as follows:
 * **Floating Point:** `f32`, `f64`
 * **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `isize` (`i128` excluded)
 * **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `usize` (`u128` excluded)
+* **Pointers:** `*const T` and `*mut T` (zero-sized metadata only)
 * **Masks:** 8-bit, 16-bit, 32-bit, 64-bit, and `usize`-sized masks
 
-Floating point, signed integers, and unsigned integers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to.
+Floating point, signed integers, unsigned integers, and pointers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to.
 The mask types have elements that are "truthy" values, like `bool`, but have an unspecified layout because different architectures prefer different layouts for mask types.
 
 [simd-guide]: ./beginners-guide.md

From 572122a95da6f8aaf513f53c426732f4c0a91325 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 9 Nov 2022 21:28:38 -0500
Subject: [PATCH 31/70] Add missing pointer tests and rename pointer cast fns
 to match scalars

---
 crates/core_simd/src/elements/const_ptr.rs |  6 ++-
 crates/core_simd/src/elements/mut_ptr.rs   |  6 ++-
 crates/core_simd/tests/pointers.rs         | 52 ++++++++++++++++++++++
 3 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index f7227a56d58dc..0ef9802b5e219 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -19,7 +19,9 @@ pub trait SimdConstPtr: Copy + Sealed {
     fn is_null(self) -> Self::Mask;
 
     /// Changes constness without changing the type.
-    fn as_mut(self) -> Self::MutPtr;
+    ///
+    /// Equivalent to calling [`pointer::cast_mut`] on each lane.
+    fn cast_mut(self) -> Self::MutPtr;
 
     /// Gets the "address" portion of the pointer.
     ///
@@ -85,7 +87,7 @@ where
     }
 
     #[inline]
-    fn as_mut(self) -> Self::MutPtr {
+    fn cast_mut(self) -> Self::MutPtr {
         self.cast_ptr()
     }
 
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index e2fd438ef8fb7..d87986b4a091c 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -19,7 +19,9 @@ pub trait SimdMutPtr: Copy + Sealed {
     fn is_null(self) -> Self::Mask;
 
     /// Changes constness without changing the type.
-    fn as_const(self) -> Self::ConstPtr;
+    ///
+    /// Equivalent to calling [`pointer::cast_const`] on each lane.
+    fn cast_const(self) -> Self::ConstPtr;
 
     /// Gets the "address" portion of the pointer.
     ///
@@ -80,7 +82,7 @@ where
     }
 
     #[inline]
-    fn as_const(self) -> Self::ConstPtr {
+    fn cast_const(self) -> Self::ConstPtr {
         self.cast_ptr()
     }
 
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 8eb0bd84042bd..2b0008624ad8a 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -21,6 +21,22 @@ macro_rules! common_tests {
                 );
             }
 
+            fn with_addr<const LANES: usize>() {
+                test_helpers::test_binary_elementwise(
+                    &Simd::<*$constness u32, LANES>::with_addr,
+                    &<*$constness u32>::with_addr,
+                    &|_, _| true,
+                );
+            }
+
+            fn expose_addr<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &Simd::<*$constness u32, LANES>::expose_addr,
+                    &<*$constness u32>::expose_addr,
+                    &|_| true,
+                );
+            }
+
             fn wrapping_offset<const LANES: usize>() {
                 test_helpers::test_binary_elementwise(
                     &Simd::<*$constness u32, LANES>::wrapping_offset,
@@ -51,9 +67,45 @@ macro_rules! common_tests {
 mod const_ptr {
     use super::*;
     common_tests! { const }
+
+    test_helpers::test_lanes! {
+        fn cast_mut<const LANES: usize>() {
+            test_helpers::test_unary_elementwise(
+                &Simd::<*const u32, LANES>::cast_mut,
+                &<*const u32>::cast_mut,
+                &|_| true,
+            );
+        }
+
+        fn from_exposed_addr<const LANES: usize>() {
+            test_helpers::test_unary_elementwise(
+                &Simd::<*const u32, LANES>::from_exposed_addr,
+                &core::ptr::from_exposed_addr::<u32>,
+                &|_| true,
+            );
+        }
+    }
 }
 
 mod mut_ptr {
     use super::*;
     common_tests! { mut }
+
+    test_helpers::test_lanes! {
+        fn cast_const<const LANES: usize>() {
+            test_helpers::test_unary_elementwise(
+                &Simd::<*mut u32, LANES>::cast_const,
+                &<*mut u32>::cast_const,
+                &|_| true,
+            );
+        }
+
+        fn from_exposed_addr<const LANES: usize>() {
+            test_helpers::test_unary_elementwise(
+                &Simd::<*mut u32, LANES>::from_exposed_addr,
+                &core::ptr::from_exposed_addr_mut::<u32>,
+                &|_| true,
+            );
+        }
+    }
 }

From 7ac1fbbcb14c05f778cf1c550e2b30f00606bb97 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 11 Nov 2022 17:32:48 -0500
Subject: [PATCH 32/70] impl TryFrom<&[T]> for Simd

---
 crates/core_simd/src/vector.rs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index c5d68f1b921f2..0095ed1648f21 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -650,6 +650,30 @@ where
     }
 }
 
+impl<T, const LANES: usize> TryFrom<&[T]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    type Error = core::array::TryFromSliceError;
+
+    fn try_from(slice: &[T]) -> Result<Self, Self::Error> {
+        Ok(Self::from_array(slice.try_into()?))
+    }
+}
+
+impl<T, const LANES: usize> TryFrom<&mut [T]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    type Error = core::array::TryFromSliceError;
+
+    fn try_from(slice: &mut [T]) -> Result<Self, Self::Error> {
+        Ok(Self::from_array(slice.try_into()?))
+    }
+}
+
 mod sealed {
     pub trait Sealed {}
 }

From 9dc690c48265bae58ca6e307d8f35a1f74b921e3 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 11 Nov 2022 18:10:51 -0500
Subject: [PATCH 33/70] Add TryFrom<&[T]> tests

---
 crates/core_simd/tests/try_from_slice.rs | 25 ++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 crates/core_simd/tests/try_from_slice.rs

diff --git a/crates/core_simd/tests/try_from_slice.rs b/crates/core_simd/tests/try_from_slice.rs
new file mode 100644
index 0000000000000..189c18c6039a0
--- /dev/null
+++ b/crates/core_simd/tests/try_from_slice.rs
@@ -0,0 +1,25 @@
+#![feature(portable_simd)]
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+use core_simd::i32x4;
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn try_from_slice() {
+    // Equal length
+    assert_eq!(
+        i32x4::try_from([1, 2, 3, 4].as_slice()).unwrap(),
+        i32x4::from_array([1, 2, 3, 4])
+    );
+
+    // Slice length > vector length
+    assert!(i32x4::try_from([1, 2, 3, 4, 5].as_slice()).is_err());
+
+    // Slice length < vector length
+    assert!(i32x4::try_from([1, 2, 3].as_slice()).is_err());
+}

From fd53445d05874d7662682b00d81cf073cfdbe505 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 11 Nov 2022 19:48:27 -0500
Subject: [PATCH 34/70] Add pointer scatter/gather

---
 crates/core_simd/src/vector.rs | 68 ++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index c5d68f1b921f2..850a517c7990c 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -364,8 +364,44 @@ where
         let base_ptr = Simd::<*const T, LANES>::splat(slice.as_ptr());
         // Ferris forgive me, I have done pointer arithmetic here.
         let ptrs = base_ptr.wrapping_add(idxs);
-        // Safety: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah
-        unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) }
+        // Safety: The caller is responsible for determining the indices are okay to read
+        unsafe { Self::gather_select_ptr(ptrs, enable, or) }
+    }
+
+    /// Read pointers elementwise into a SIMD vector vector.
+    ///
+    /// # Safety
+    ///
+    /// Each read must satisfy the same conditions as [`core::ptr::read`].
+    #[must_use]
+    #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
+    pub unsafe fn gather_ptr(source: Simd<*const T, LANES>) -> Self
+    where
+        T: Default,
+    {
+        // TODO: add an intrinsic that doesn't use a passthru vector, and remove the T: Default bound
+        // Safety: The caller is responsible for upholding all invariants
+        unsafe { Self::gather_select_ptr(source, Mask::splat(true), Self::default()) }
+    }
+
+    /// Conditionally read pointers elementwise into a SIMD vector vector.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If a lane is disabled, the lane is selected from the `or` vector and no read is performed.
+    ///
+    /// # Safety
+    ///
+    /// Enabled lanes must satisfy the same conditions as [`core::ptr::read`].
+    #[must_use]
+    #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
+    pub unsafe fn gather_select_ptr(
+        source: Simd<*const T, LANES>,
+        enable: Mask<isize, LANES>,
+        or: Self,
+    ) -> Self {
+        // Safety: The caller is responsible for upholding all invariants
+        unsafe { intrinsics::simd_gather(or, source, enable.to_int()) }
     }
 
     /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
@@ -473,10 +509,36 @@ where
             // Ferris forgive me, I have done pointer arithmetic here.
             let ptrs = base_ptr.wrapping_add(idxs);
             // The ptrs have been bounds-masked to prevent memory-unsafe writes insha'allah
-            intrinsics::simd_scatter(self, ptrs, enable.to_int())
+            self.scatter_select_ptr(ptrs, enable);
             // Cleared ☢️ *mut T Zone
         }
     }
+
+    /// Write pointers elementwise into a SIMD vector vector.
+    ///
+    /// # Safety
+    ///
+    /// Each write must satisfy the same conditions as [`core::ptr::write`].
+    #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
+    pub unsafe fn scatter_ptr(self, dest: Simd<*mut T, LANES>) {
+        // Safety: The caller is responsible for upholding all invariants
+        unsafe { self.scatter_select_ptr(dest, Mask::splat(true)) }
+    }
+
+    /// Conditionally write pointers elementwise into a SIMD vector vector.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If a lane is disabled, the writing that lane is skipped.
+    ///
+    /// # Safety
+    ///
+    /// Enabled lanes must satisfy the same conditions as [`core::ptr::write`].
+    #[inline]
+    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
+    pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, LANES>, enable: Mask<isize, LANES>) {
+        // Safety: The caller is responsible for upholding all invariants
+        unsafe { intrinsics::simd_scatter(self, dest, enable.to_int()) }
+    }
 }
 
 impl<T, const LANES: usize> Copy for Simd<T, LANES>

From bef4c41fc0051444034ad9b488b06b2d512bfd17 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Fri, 11 Nov 2022 21:31:05 -0500
Subject: [PATCH 35/70] Add test examples

---
 crates/core_simd/src/vector.rs | 54 ++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 850a517c7990c..52ed54905199f 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -373,6 +373,19 @@ where
     /// # Safety
     ///
     /// Each read must satisfy the same conditions as [`core::ptr::read`].
+    ///
+    /// # Example
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, SimdConstPtr};
+    /// let values = [6, 2, 4, 9];
+    /// let offsets = Simd::from_array([1, 0, 0, 3]);
+    /// let source = Simd::splat(values.as_ptr()).wrapping_add(offsets);
+    /// let gathered = unsafe { Simd::gather_ptr(source) };
+    /// assert_eq!(gathered, Simd::from_array([2, 6, 6, 9]));
+    /// ```
     #[must_use]
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
@@ -392,6 +405,20 @@ where
     /// # Safety
     ///
     /// Enabled lanes must satisfy the same conditions as [`core::ptr::read`].
+    ///
+    /// # Example
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Mask, Simd, SimdConstPtr};
+    /// let values = [6, 2, 4, 9];
+    /// let enable = Mask::from_array([true, true, false, true]);
+    /// let offsets = Simd::from_array([1, 0, 0, 3]);
+    /// let source = Simd::splat(values.as_ptr()).wrapping_add(offsets);
+    /// let gathered = unsafe { Simd::gather_select_ptr(source, enable, Simd::splat(0)) };
+    /// assert_eq!(gathered, Simd::from_array([2, 6, 0, 9]));
+    /// ```
     #[must_use]
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
@@ -519,6 +546,19 @@ where
     /// # Safety
     ///
     /// Each write must satisfy the same conditions as [`core::ptr::write`].
+    ///
+    /// # Example
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, SimdMutPtr};
+    /// let mut values = [0; 4];
+    /// let offset = Simd::from_array([3, 2, 1, 0]);
+    /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset);
+    /// unsafe { Simd::from_array([6, 3, 5, 7]).scatter_ptr(ptrs); }
+    /// assert_eq!(values, [7, 5, 3, 6]);
+    /// ```
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn scatter_ptr(self, dest: Simd<*mut T, LANES>) {
@@ -533,6 +573,20 @@ where
     /// # Safety
     ///
     /// Enabled lanes must satisfy the same conditions as [`core::ptr::write`].
+    ///
+    /// # Example
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Mask, Simd, SimdMutPtr};
+    /// let mut values = [0; 4];
+    /// let offset = Simd::from_array([3, 2, 1, 0]);
+    /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset);
+    /// let enable = Mask::from_array([true, true, false, false]);
+    /// unsafe { Simd::from_array([6, 3, 5, 7]).scatter_select_ptr(ptrs, enable); }
+    /// assert_eq!(values, [0, 0, 3, 6]);
+    /// ```
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, LANES>, enable: Mask<isize, LANES>) {

From c247915eb88af33302b2dc393fa7b488ee680a5f Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 12 Nov 2022 22:39:54 -0500
Subject: [PATCH 36/70] Update crates/core_simd/src/vector.rs

Co-authored-by: Jubilee <46493976+workingjubilee@users.noreply.github.com>
---
 crates/core_simd/src/vector.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 52ed54905199f..f25505f7c599f 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -568,7 +568,7 @@ where
 
     /// Conditionally write pointers elementwise into a SIMD vector vector.
     /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If a lane is disabled, the writing that lane is skipped.
+    /// If a lane is disabled, the write to that lane is skipped.
     ///
     /// # Safety
     ///

From 7e614f0438324b60af24554699977757228d7acd Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 12 Nov 2022 22:41:44 -0500
Subject: [PATCH 37/70] Fix typo typo

---
 crates/core_simd/src/vector.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index f25505f7c599f..0ddc3e1b3951c 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -368,7 +368,7 @@ where
         unsafe { Self::gather_select_ptr(ptrs, enable, or) }
     }
 
-    /// Read pointers elementwise into a SIMD vector vector.
+    /// Read pointers elementwise into a SIMD vector.
     ///
     /// # Safety
     ///
@@ -398,7 +398,7 @@ where
         unsafe { Self::gather_select_ptr(source, Mask::splat(true), Self::default()) }
     }
 
-    /// Conditionally read pointers elementwise into a SIMD vector vector.
+    /// Conditionally read pointers elementwise into a SIMD vector.
     /// The mask `enable`s all `true` lanes and disables all `false` lanes.
     /// If a lane is disabled, the lane is selected from the `or` vector and no read is performed.
     ///
@@ -541,7 +541,7 @@ where
         }
     }
 
-    /// Write pointers elementwise into a SIMD vector vector.
+    /// Write pointers elementwise into a SIMD vector.
     ///
     /// # Safety
     ///
@@ -566,7 +566,7 @@ where
         unsafe { self.scatter_select_ptr(dest, Mask::splat(true)) }
     }
 
-    /// Conditionally write pointers elementwise into a SIMD vector vector.
+    /// Conditionally write pointers elementwise into a SIMD vector.
     /// The mask `enable`s all `true` lanes and disables all `false` lanes.
     /// If a lane is disabled, the write to that lane is skipped.
     ///

From db8b23cea5ac9b45fafef65d95702f41cc02d486 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 27 Nov 2022 23:44:20 -0500
Subject: [PATCH 38/70] Remove reexport of simd::*

---
 crates/core_simd/src/lib.rs                   |  1 -
 crates/core_simd/tests/autoderef.rs           |  2 +-
 .../tests/mask_ops_impl/mask_macros.rs        |  2 +-
 crates/core_simd/tests/masks.rs               | 59 ++++++++++---------
 crates/core_simd/tests/ops_macros.rs          | 14 ++---
 crates/core_simd/tests/pointers.rs            |  2 +-
 crates/core_simd/tests/round.rs               |  2 +-
 crates/core_simd/tests/swizzle.rs             |  2 +-
 crates/core_simd/tests/to_bytes.rs            |  2 +-
 crates/core_simd/tests/try_from_slice.rs      |  2 +-
 crates/test_helpers/src/lib.rs                |  4 +-
 11 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 828731629692c..a6359d1e0be5c 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -21,4 +21,3 @@
 #[path = "mod.rs"]
 mod core_simd;
 pub use self::core_simd::simd;
-pub use simd::*;
diff --git a/crates/core_simd/tests/autoderef.rs b/crates/core_simd/tests/autoderef.rs
index 9359da16ee5c7..3181826ef59a6 100644
--- a/crates/core_simd/tests/autoderef.rs
+++ b/crates/core_simd/tests/autoderef.rs
@@ -1,6 +1,6 @@
 // Test that we handle all our "auto-deref" cases correctly.
 #![feature(portable_simd)]
-use core_simd::f32x4;
+use core_simd::simd::f32x4;
 
 #[cfg(target_arch = "wasm32")]
 use wasm_bindgen_test::*;
diff --git a/crates/core_simd/tests/mask_ops_impl/mask_macros.rs b/crates/core_simd/tests/mask_ops_impl/mask_macros.rs
index 795f9e27c4458..faafa5fa51f18 100644
--- a/crates/core_simd/tests/mask_ops_impl/mask_macros.rs
+++ b/crates/core_simd/tests/mask_ops_impl/mask_macros.rs
@@ -2,7 +2,7 @@ macro_rules! mask_tests {
     { $vector:ident, $lanes:literal } => {
         #[cfg(test)]
         mod $vector {
-            use core_simd::$vector as Vector;
+            use core_simd::simd::$vector as Vector;
             const LANES: usize = $lanes;
 
             #[cfg(target_arch = "wasm32")]
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 673d0db93fee5..9f8bad1c36c08 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -13,11 +13,13 @@ macro_rules! test_mask_api {
             #[cfg(target_arch = "wasm32")]
             use wasm_bindgen_test::*;
 
+            use core_simd::simd::Mask;
+
             #[test]
             #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
             fn set_and_test() {
                 let values = [true, false, false, true, false, false, true, false];
-                let mut mask = core_simd::Mask::<$type, 8>::splat(false);
+                let mut mask = Mask::<$type, 8>::splat(false);
                 for (lane, value) in values.iter().copied().enumerate() {
                     mask.set(lane, value);
                 }
@@ -29,7 +31,7 @@ macro_rules! test_mask_api {
             #[test]
             #[should_panic]
             fn set_invalid_lane() {
-                let mut mask = core_simd::Mask::<$type, 8>::splat(false);
+                let mut mask = Mask::<$type, 8>::splat(false);
                 mask.set(8, true);
                 let _ = mask;
             }
@@ -37,24 +39,24 @@ macro_rules! test_mask_api {
             #[test]
             #[should_panic]
             fn test_invalid_lane() {
-                let mask = core_simd::Mask::<$type, 8>::splat(false);
+                let mask = Mask::<$type, 8>::splat(false);
                 let _ = mask.test(8);
             }
 
             #[test]
             fn any() {
-                assert!(!core_simd::Mask::<$type, 8>::splat(false).any());
-                assert!(core_simd::Mask::<$type, 8>::splat(true).any());
-                let mut v = core_simd::Mask::<$type, 8>::splat(false);
+                assert!(!Mask::<$type, 8>::splat(false).any());
+                assert!(Mask::<$type, 8>::splat(true).any());
+                let mut v = Mask::<$type, 8>::splat(false);
                 v.set(2, true);
                 assert!(v.any());
             }
 
             #[test]
             fn all() {
-                assert!(!core_simd::Mask::<$type, 8>::splat(false).all());
-                assert!(core_simd::Mask::<$type, 8>::splat(true).all());
-                let mut v = core_simd::Mask::<$type, 8>::splat(false);
+                assert!(!Mask::<$type, 8>::splat(false).all());
+                assert!(Mask::<$type, 8>::splat(true).all());
+                let mut v = Mask::<$type, 8>::splat(false);
                 v.set(2, true);
                 assert!(!v.all());
             }
@@ -62,57 +64,57 @@ macro_rules! test_mask_api {
             #[test]
             fn roundtrip_int_conversion() {
                 let values = [true, false, false, true, false, false, true, false];
-                let mask = core_simd::Mask::<$type, 8>::from_array(values);
+                let mask = Mask::<$type, 8>::from_array(values);
                 let int = mask.to_int();
                 assert_eq!(int.to_array(), [-1, 0, 0, -1, 0, 0, -1, 0]);
-                assert_eq!(core_simd::Mask::<$type, 8>::from_int(int), mask);
+                assert_eq!(Mask::<$type, 8>::from_int(int), mask);
             }
 
             #[test]
             fn roundtrip_bitmask_conversion() {
-                use core_simd::ToBitMask;
+                use core_simd::simd::ToBitMask;
                 let values = [
                     true, false, false, true, false, false, true, false,
                     true, true, false, false, false, false, false, true,
                 ];
-                let mask = core_simd::Mask::<$type, 16>::from_array(values);
+                let mask = Mask::<$type, 16>::from_array(values);
                 let bitmask = mask.to_bitmask();
                 assert_eq!(bitmask, 0b1000001101001001);
-                assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask);
+                assert_eq!(Mask::<$type, 16>::from_bitmask(bitmask), mask);
             }
 
             #[test]
             fn roundtrip_bitmask_conversion_short() {
-                use core_simd::ToBitMask;
+                use core_simd::simd::ToBitMask;
 
                 let values = [
                     false, false, false, true,
                 ];
-                let mask = core_simd::Mask::<$type, 4>::from_array(values);
+                let mask = Mask::<$type, 4>::from_array(values);
                 let bitmask = mask.to_bitmask();
                 assert_eq!(bitmask, 0b1000);
-                assert_eq!(core_simd::Mask::<$type, 4>::from_bitmask(bitmask), mask);
+                assert_eq!(Mask::<$type, 4>::from_bitmask(bitmask), mask);
 
                 let values = [true, false];
-                let mask = core_simd::Mask::<$type, 2>::from_array(values);
+                let mask = Mask::<$type, 2>::from_array(values);
                 let bitmask = mask.to_bitmask();
                 assert_eq!(bitmask, 0b01);
-                assert_eq!(core_simd::Mask::<$type, 2>::from_bitmask(bitmask), mask);
+                assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask);
             }
 
             #[test]
             fn cast() {
-                fn cast_impl<T: core_simd::MaskElement>()
+                fn cast_impl<T: core_simd::simd::MaskElement>()
                 where
-                    core_simd::Mask<$type, 8>: Into<core_simd::Mask<T, 8>>,
+                    Mask<$type, 8>: Into<Mask<T, 8>>,
                 {
                     let values = [true, false, false, true, false, false, true, false];
-                    let mask = core_simd::Mask::<$type, 8>::from_array(values);
+                    let mask = Mask::<$type, 8>::from_array(values);
 
                     let cast_mask = mask.cast::<T>();
                     assert_eq!(values, cast_mask.to_array());
 
-                    let into_mask: core_simd::Mask<T, 8> = mask.into();
+                    let into_mask: Mask<T, 8> = mask.into();
                     assert_eq!(values, into_mask.to_array());
                 }
 
@@ -126,15 +128,15 @@ macro_rules! test_mask_api {
             #[cfg(feature = "generic_const_exprs")]
             #[test]
             fn roundtrip_bitmask_array_conversion() {
-                use core_simd::ToBitMaskArray;
+                use core_simd::simd::ToBitMaskArray;
                 let values = [
                     true, false, false, true, false, false, true, false,
                     true, true, false, false, false, false, false, true,
                 ];
-                let mask = core_simd::Mask::<$type, 16>::from_array(values);
+                let mask = Mask::<$type, 16>::from_array(values);
                 let bitmask = mask.to_bitmask_array();
                 assert_eq!(bitmask, [0b01001001, 0b10000011]);
-                assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask_array(bitmask), mask);
+                assert_eq!(Mask::<$type, 16>::from_bitmask_array(bitmask), mask);
             }
         }
     }
@@ -150,9 +152,10 @@ mod mask_api {
 
 #[test]
 fn convert() {
+    use core_simd::simd::Mask;
     let values = [true, false, false, true, false, false, true, false];
     assert_eq!(
-        core_simd::Mask::<i8, 8>::from_array(values),
-        core_simd::Mask::<i32, 8>::from_array(values).into()
+        Mask::<i8, 8>::from_array(values),
+        Mask::<i32, 8>::from_array(values).into()
     );
 }
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index f759394d07582..3a02f3f01e1cf 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -7,7 +7,7 @@ macro_rules! impl_unary_op_test {
         test_helpers::test_lanes! {
             fn $fn<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
-                    &<core_simd::Simd<$scalar, LANES> as core::ops::$trait>::$fn,
+                    &<core_simd::simd::Simd<$scalar, LANES> as core::ops::$trait>::$fn,
                     &$scalar_fn,
                     &|_| true,
                 );
@@ -27,7 +27,7 @@ macro_rules! impl_binary_op_test {
     { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr } => {
         mod $fn {
             use super::*;
-            use core_simd::Simd;
+            use core_simd::simd::Simd;
 
             test_helpers::test_lanes! {
                 fn normal<const LANES: usize>() {
@@ -64,7 +64,7 @@ macro_rules! impl_binary_checked_op_test {
     { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr, $check_fn:expr } => {
         mod $fn {
             use super::*;
-            use core_simd::Simd;
+            use core_simd::simd::Simd;
 
             test_helpers::test_lanes! {
                 fn normal<const LANES: usize>() {
@@ -173,7 +173,7 @@ macro_rules! impl_signed_tests {
     { $scalar:tt } => {
         mod $scalar {
             use core_simd::simd::SimdInt;
-            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
             impl_common_integer_tests! { Vector, Scalar }
@@ -314,7 +314,7 @@ macro_rules! impl_unsigned_tests {
     { $scalar:tt } => {
         mod $scalar {
             use core_simd::simd::SimdUint;
-            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
             impl_common_integer_tests! { Vector, Scalar }
@@ -348,8 +348,8 @@ macro_rules! impl_unsigned_tests {
 macro_rules! impl_float_tests {
     { $scalar:tt, $int_scalar:tt } => {
         mod $scalar {
-            use core_simd::SimdFloat;
-            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            use core_simd::simd::SimdFloat;
+            type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
             impl_unary_op_test!(Scalar, Neg::neg);
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 2b0008624ad8a..0ae8f83b8b97d 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -1,6 +1,6 @@
 #![feature(portable_simd, strict_provenance)]
 
-use core_simd::{Simd, SimdConstPtr, SimdMutPtr};
+use core_simd::simd::{Simd, SimdConstPtr, SimdMutPtr};
 
 macro_rules! common_tests {
     { $constness:ident } => {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 484fd5bf47d1b..8b9638ad46671 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -5,7 +5,7 @@ macro_rules! float_rounding_test {
         mod $scalar {
             use std_float::StdFloat;
 
-            type Vector<const LANES: usize> = core_simd::Simd<$scalar, LANES>;
+            type Vector<const LANES: usize> = core_simd::simd::Simd<$scalar, LANES>;
             type Scalar = $scalar;
             type IntScalar = $int_scalar;
 
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 33a7becb42128..8cd7c33e823fb 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -1,5 +1,5 @@
 #![feature(portable_simd)]
-use core_simd::{Simd, Swizzle};
+use core_simd::simd::{Simd, Swizzle};
 
 #[cfg(target_arch = "wasm32")]
 use wasm_bindgen_test::*;
diff --git a/crates/core_simd/tests/to_bytes.rs b/crates/core_simd/tests/to_bytes.rs
index debb4335e2c96..be0ee4349c579 100644
--- a/crates/core_simd/tests/to_bytes.rs
+++ b/crates/core_simd/tests/to_bytes.rs
@@ -2,7 +2,7 @@
 #![allow(incomplete_features)]
 #![cfg(feature = "generic_const_exprs")]
 
-use core_simd::Simd;
+use core_simd::simd::Simd;
 
 #[test]
 fn byte_convert() {
diff --git a/crates/core_simd/tests/try_from_slice.rs b/crates/core_simd/tests/try_from_slice.rs
index 189c18c6039a0..859e3b94f2cd4 100644
--- a/crates/core_simd/tests/try_from_slice.rs
+++ b/crates/core_simd/tests/try_from_slice.rs
@@ -6,7 +6,7 @@ use wasm_bindgen_test::*;
 #[cfg(target_arch = "wasm32")]
 wasm_bindgen_test_configure!(run_in_browser);
 
-use core_simd::i32x4;
+use core_simd::simd::i32x4;
 
 #[test]
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 5f2a928b5e4bb..b26cdc311a215 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -401,7 +401,7 @@ macro_rules! test_lanes {
 
                 fn implementation<const $lanes: usize>()
                 where
-                    core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount,
+                    core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount,
                 $body
 
                 #[cfg(target_arch = "wasm32")]
@@ -508,7 +508,7 @@ macro_rules! test_lanes_panic {
 
                 fn implementation<const $lanes: usize>()
                 where
-                    core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount,
+                    core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount,
                 $body
 
                 $crate::test_lanes_helper!(

From 54b6f6923e281ba68d13269b43faa927c6df83d5 Mon Sep 17 00:00:00 2001
From: Thom Chiovoloni <thom@shift.click>
Date: Mon, 28 Nov 2022 06:03:32 -0800
Subject: [PATCH 39/70] Avoid a scalar loop in `Simd::from_slice`

---
 crates/core_simd/src/lib.rs    |  1 +
 crates/core_simd/src/vector.rs | 11 ++++-------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 828731629692c..34b79e630a482 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,5 +1,6 @@
 #![no_std]
 #![feature(
+    const_ptr_read,
     convert_float_to_int,
     decl_macro,
     intra_doc_pointers,
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index d109087eaa655..51b0d999a8137 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -174,13 +174,10 @@ where
             slice.len() >= LANES,
             "slice length must be at least the number of lanes"
         );
-        let mut array = [slice[0]; LANES];
-        let mut i = 0;
-        while i < LANES {
-            array[i] = slice[i];
-            i += 1;
-        }
-        Self(array)
+        // Safety:
+        // - We've checked the length is sufficient.
+        // - `T` and `Simd<T, N>` are Copy types.
+        unsafe { slice.as_ptr().cast::<Self>().read_unaligned() }
     }
 
     /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type.

From df3a63906c44b23de7065d60c20bf99e2571ccc8 Mon Sep 17 00:00:00 2001
From: miguel raz <miguelraz@gmail.com>
Date: Fri, 4 Jun 2021 14:24:47 -0500
Subject: [PATCH 40/70] add dot_product example

---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 crates/core_simd/examples/dot_product.rs

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
new file mode 100644
index 0000000000000..812b0b23eebff
--- /dev/null
+++ b/crates/core_simd/examples/dot_product.rs
@@ -0,0 +1,31 @@
+// Code taken from the `packed_simd` crate
+// Run this code with `cargo test --example dot_product`
+#![feature(array_chunks)]
+use core_simd::*;
+
+pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| (a * b).horizontal_sum())
+        .sum()
+}
+
+fn main() {
+    // Empty main to make cargo happy
+}
+
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn test() {
+        use super::*;
+        let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
+        let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
+
+        assert_eq!(0.0, dot_prod(&a, &b));
+    }
+}

From c08a4d1f10473bfbdddf3d2eefc40e1194a633a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= <miguelraz@gmail.com>
Date: Sat, 26 Mar 2022 14:04:37 -0600
Subject: [PATCH 41/70] add more basic dot products and comments, README

---
 crates/core_simd/examples/README.md      | 19 ++++++++++++++++
 crates/core_simd/examples/dot_product.rs | 29 +++++++++++++++++++++---
 2 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 crates/core_simd/examples/README.md

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
new file mode 100644
index 0000000000000..b37dffa8eaab3
--- /dev/null
+++ b/crates/core_simd/examples/README.md
@@ -0,0 +1,19 @@
+### `stdsimd` examples
+
+This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate.
+
+The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and iteration of performance designs.
+
+Run the tests with the command 
+
+```
+cargo run --example dot_product
+```
+
+and the benchmarks via the command
+
+```
+cargo run --example --benchmark ???
+```
+
+and measure the timings on your local system.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 812b0b23eebff..3e415fc4471dc 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -3,7 +3,27 @@
 #![feature(array_chunks)]
 use core_simd::*;
 
-pub fn dot_prod(a: &[f32], b: &[f32]) -> f32 {
+/// This is your barebones dot product implementation: 
+/// Take 2 vectors, multiply them element wise and *then*
+/// add up the result. In the next example we will see if there
+///  is any difference to adding as we go along multiplying.
+pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    a.iter()
+    .zip(b.iter())
+    .map(|a, b| a * b)
+    .sum()
+}
+
+pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    a.iter()
+    .zip(b.iter())
+    .fold(0.0, |a, b| a * b)
+}
+
+pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     // TODO handle remainder when a.len() % 4 != 0
@@ -21,11 +41,14 @@ fn main() {
 #[cfg(test)]
 mod tests {
     #[test]
-    fn test() {
+    fn smoke_test() {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
 
-        assert_eq!(0.0, dot_prod(&a, &b));
+        assert_eq!(0.0, dot_prod_0(&a, &b));
+        assert_eq!(0.0, dot_prod_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_0(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_1(&a, &b));
     }
 }

From 4615805ec2ce44c37792df3b5b179a795f57542b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= <miguelraz@gmail.com>
Date: Sat, 26 Mar 2022 16:10:25 -0600
Subject: [PATCH 42/70] add remainder dot_product and cleanup

cleanup dot_product and README.md
---
 crates/core_simd/examples/README.md      |   8 +-
 crates/core_simd/examples/dot_product.rs | 106 ++++++++++++++++++++---
 2 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/crates/core_simd/examples/README.md b/crates/core_simd/examples/README.md
index b37dffa8eaab3..82747f1b5a6f9 100644
--- a/crates/core_simd/examples/README.md
+++ b/crates/core_simd/examples/README.md
@@ -10,10 +10,4 @@ Run the tests with the command
 cargo run --example dot_product
 ```
 
-and the benchmarks via the command
-
-```
-cargo run --example --benchmark ???
-```
-
-and measure the timings on your local system.
+and verify the code for `dot_product.rs` on your machine.
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 3e415fc4471dc..ed210192e2a4b 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -1,39 +1,113 @@
 // Code taken from the `packed_simd` crate
 // Run this code with `cargo test --example dot_product`
+//use std::iter::zip;
+
 #![feature(array_chunks)]
+#![feature(slice_as_chunks)]
+// Add these imports to use the stdsimd library
+#![feature(portable_simd)]
 use core_simd::*;
 
-/// This is your barebones dot product implementation: 
-/// Take 2 vectors, multiply them element wise and *then*
-/// add up the result. In the next example we will see if there
-///  is any difference to adding as we go along multiplying.
+// This is your barebones dot product implementation:
+// Take 2 vectors, multiply them element wise and *then*
+// go along the resulting array and add up the result.
+// In the next example we will see if there
+//  is any difference to adding and multiplying in tandem.
 pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
-    a.iter()
-    .zip(b.iter())
-    .map(|a, b| a * b)
-    .sum()
+    a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
 }
 
+// When dealing with SIMD, it is very important to think about the amount
+// of data movement and when it happens. We're going over simple computation examples here, and yet
+// it is not trivial to understand what may or may not contribute to performance
+// changes. Eventually, you will need tools to inspect the generated assembly and confirm your
+// hypothesis and benchmarks - we will mention them later on.
+// With the use of `fold`, we're doing a multiplication,
+// and then adding it to the sum, one element from both vectors at a time.
 pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
     a.iter()
-    .zip(b.iter())
-    .fold(0.0, |a, b| a * b)
+        .zip(b.iter())
+        .fold(0.0, |a, zipped| a + zipped.0 * zipped.1)
 }
 
+// We now move on to the SIMD implementations: notice the following constructs:
+// `array_chunks::<4>`: mapping this over the vector will let us construct SIMD vectors
+// `f32x4::from_array`: construct the SIMD vector from an array
+// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
+// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
+// and then add those with `sum()`. This is suboptimal.
+// TODO: ASCII diagrams
 pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
-
     // TODO handle remainder when a.len() % 4 != 0
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .map(|(a, b)| (a * b).horizontal_sum())
+        .map(|(a, b)| (a * b).reduce_sum())
         .sum()
 }
 
+// There are some simple ways to improve the previous code:
+// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into,
+// so that there is only one `reduce_sum()` reduction when the last `f32x4` has been processed.
+// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduction
+// happen in the same step.
+// If the arrays are large, minimizing the data shuffling will lead to great perf.
+// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4
+// can become a problem.
+pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
+        .reduce_sum()
+}
+
+// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are
+// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for.
+use std_float::StdFloat;
+pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+    // TODO handle remainder when a.len() % 4 != 0
+    let mut res = f32x4::splat(0.0);
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .for_each(|(a, b)| {
+            res = a.mul_add(b, res);
+        });
+    res.reduce_sum()
+}
+
+// Finally, we will write the same operation but handling the loop remainder.
+const LANES: usize = 4;
+pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
+    assert_eq!(a.len(), b.len());
+
+    let (a_extra, a_chunks) = a.as_rchunks();
+    let (b_extra, b_chunks) = b.as_rchunks();
+
+    // These are always true, but for emphasis:
+    assert_eq!(a_chunks.len(), b_chunks.len());
+    assert_eq!(a_extra.len(), b_extra.len());
+
+    let mut sums = [0.0; LANES];
+    for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
+        *d = x * y;
+    }
+
+    let mut sums = f32x4::from_array(sums);
+    std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
+        sums += f32x4::from_array(*x) * f32x4::from_array(*y);
+    });
+
+    sums.reduce_sum()
+}
 fn main() {
     // Empty main to make cargo happy
 }
@@ -45,10 +119,18 @@ mod tests {
         use super::*;
         let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
         let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
+        let x: Vec<f32> = [0.5; 1003].to_vec();
+        let y: Vec<f32> = [2.0; 1003].to_vec();
 
+        // Basic check
         assert_eq!(0.0, dot_prod_0(&a, &b));
         assert_eq!(0.0, dot_prod_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_0(&a, &b));
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_2(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+
+        // We can handle vectors that are non-multiples of 4
+        assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
     }
 }

From 4ddfd2f3f8c547fa7c42a0f9a5979665262a30c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= <miguelraz@gmail.com>
Date: Tue, 29 Mar 2022 16:52:54 -0600
Subject: [PATCH 43/70] non allocating fold simd

allocating fold with std::ops::Add::add
---
 crates/core_simd/examples/dot_product.rs | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index ed210192e2a4b..75d628ee39223 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -108,6 +108,37 @@ pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
 
     sums.reduce_sum()
 }
+
+// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
+// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
+// next example.
+pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
+    let mut sum = a
+        .array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .map(|(a, b)| a * b)
+        .fold(f32x4::splat(0.0), std::ops::Add::add)
+        .reduce_sum();
+    let remain = a.len() - (a.len() % 4);
+    sum += a[remain..]
+        .iter()
+        .zip(&b[remain..])
+        .map(|(a, b)| a * b)
+        .sum::<f32>();
+    sum
+}
+
+// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
+// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
+pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
+    a.array_chunks::<4>()
+        .map(|&a| f32x4::from_array(a))
+        .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
+        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .reduce_sum()
+}
+
 fn main() {
     // Empty main to make cargo happy
 }

From aeac9ed37339c463a6a155b12135b7f167611e26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= <miguelraz@gmail.com>
Date: Tue, 29 Mar 2022 17:36:47 -0600
Subject: [PATCH 44/70] proper mul_add arg order, added tests

---
 crates/core_simd/examples/dot_product.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 75d628ee39223..84824c2e5c4a9 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -135,7 +135,7 @@ pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))
         .zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
-        .fold(f32x4::splat(0.), |acc, (a, b)| acc.mul_add(a, b))
+        .fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc))
         .reduce_sum()
 }
 
@@ -160,6 +160,8 @@ mod tests {
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_2(&a, &b));
         assert_eq!(0.0, dot_prod_simd_3(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_4(&a, &b));
+        assert_eq!(0.0, dot_prod_simd_5(&a, &b));
 
         // We can handle vectors that are non-multiples of 4
         assert_eq!(1003.0, dot_prod_simd_3(&x, &y));

From 64247a327d30a2d5fe7ad3d98f527bff1cc8fb85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= <miguelraz@gmail.com>
Date: Wed, 30 Mar 2022 17:45:59 -0600
Subject: [PATCH 45/70] add _scalar names for dot_product examples

---
 crates/core_simd/examples/dot_product.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 84824c2e5c4a9..936741a2ceb63 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -13,7 +13,7 @@ use core_simd::*;
 // go along the resulting array and add up the result.
 // In the next example we will see if there
 //  is any difference to adding and multiplying in tandem.
-pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
+pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
 
     a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
@@ -26,7 +26,7 @@ pub fn dot_prod_0(a: &[f32], b: &[f32]) -> f32 {
 // hypothesis and benchmarks - we will mention them later on.
 // With the use of `fold`, we're doing a multiplication,
 // and then adding it to the sum, one element from both vectors at a time.
-pub fn dot_prod_1(a: &[f32], b: &[f32]) -> f32 {
+pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
     assert_eq!(a.len(), b.len());
     a.iter()
         .zip(b.iter())
@@ -154,8 +154,8 @@ mod tests {
         let y: Vec<f32> = [2.0; 1003].to_vec();
 
         // Basic check
-        assert_eq!(0.0, dot_prod_0(&a, &b));
-        assert_eq!(0.0, dot_prod_1(&a, &b));
+        assert_eq!(0.0, dot_prod_scalar_0(&a, &b));
+        assert_eq!(0.0, dot_prod_scalar_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_0(&a, &b));
         assert_eq!(0.0, dot_prod_simd_1(&a, &b));
         assert_eq!(0.0, dot_prod_simd_2(&a, &b));

From da3bd6d3a04f84ebc7fc6314f2e1f8a74e379018 Mon Sep 17 00:00:00 2001
From: The Atelier <workingjubilee@gmail.com>
Date: Sat, 3 Dec 2022 18:40:07 -0800
Subject: [PATCH 46/70] Update dot_product example import

---
 crates/core_simd/examples/dot_product.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 936741a2ceb63..391f08f55a07a 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -6,7 +6,7 @@
 #![feature(slice_as_chunks)]
 // Add these imports to use the stdsimd library
 #![feature(portable_simd)]
-use core_simd::*;
+use core_simd::simd::*;
 
 // This is your barebones dot product implementation:
 // Take 2 vectors, multiply them element wise and *then*

From e3ef226f7b33e7257d0e549046bed44cabfd5585 Mon Sep 17 00:00:00 2001
From: Yang Hau <yuanyanghau@gmail.com>
Date: Mon, 23 Jan 2023 11:00:35 +0700
Subject: [PATCH 47/70] Fix the typo

---
 crates/core_simd/src/masks/to_bitmask.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
index 46914dfe0d9b4..fc7d6b781f2f5 100644
--- a/crates/core_simd/src/masks/to_bitmask.rs
+++ b/crates/core_simd/src/masks/to_bitmask.rs
@@ -72,7 +72,7 @@ impl_integer_intrinsic! {
     impl ToBitMask<BitMask=u64> for Mask<_, 64>
 }
 
-/// Returns the minimum numnber of bytes in a bitmask with `lanes` lanes.
+/// Returns the minimum number of bytes in a bitmask with `lanes` lanes.
 #[cfg(feature = "generic_const_exprs")]
 pub const fn bitmask_len(lanes: usize) -> usize {
     (lanes + 7) / 8

From 0fd7c8e138db1362e3cba9cdb40403dc7a83364b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 19 Feb 2023 12:21:27 -0500
Subject: [PATCH 48/70] Add copy_to_slice

---
 crates/core_simd/src/vector.rs | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 51b0d999a8137..870c2eefee153 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -159,7 +159,7 @@ where
     ///
     /// Panics if the slice's length is less than the vector's `Simd::LANES`.
     ///
-    /// # Examples
+    /// # Example
     ///
     /// ```
     /// # #![feature(portable_simd)]
@@ -180,6 +180,35 @@ where
         unsafe { slice.as_ptr().cast::<Self>().read_unaligned() }
     }
 
+    /// Writes a SIMD vector to the first `LANES` elements of a slice.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the slice's length is less than the vector's `Simd::LANES`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
+    /// let mut dest = vec![0; 6];
+    /// let v = u32x4::from_array([1, 2, 3, 4]);
+    /// v.copy_to_slice(&mut dest);
+    /// assert_eq!(&dest, &[1, 2, 3, 4, 0, 0]);
+    /// ```
+    pub fn copy_to_slice(self, slice: &mut [T]) {
+        assert!(
+            slice.len() >= LANES,
+            "slice length must be at least the number of lanes"
+        );
+        // Safety:
+        // - We've checked the length is sufficient
+        // - `T` and `Simd<T, N>` are Copy types.
+        unsafe { slice.as_mut_ptr().cast::<Self>().write_unaligned(self) }
+    }
+
     /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type.
     ///
     /// This follows the semantics of Rust's `as` conversion for casting

From 36829ddca7de02b4d8bad31bdfb0fbc83664017b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 19 Feb 2023 15:35:36 -0500
Subject: [PATCH 49/70] Check that vectors aren't padded

---
 crates/core_simd/src/vector.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 870c2eefee153..3e39f1d623ce4 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -174,6 +174,7 @@ where
             slice.len() >= LANES,
             "slice length must be at least the number of lanes"
         );
+        assert!(core::mem::size_of::<Self>() == LANES * core::mem::size_of::<T>());
         // Safety:
         // - We've checked the length is sufficient.
         // - `T` and `Simd<T, N>` are Copy types.
@@ -203,6 +204,7 @@ where
             slice.len() >= LANES,
             "slice length must be at least the number of lanes"
         );
+        assert!(core::mem::size_of::<Self>() == LANES * core::mem::size_of::<T>());
         // Safety:
         // - We've checked the length is sufficient
         // - `T` and `Simd<T, N>` are Copy types.

From 65b5210bdbb3a7af57e5c39d41424ba260ee3fbc Mon Sep 17 00:00:00 2001
From: bjorn3 <17426603+bjorn3@users.noreply.github.com>
Date: Sun, 26 Mar 2023 12:56:35 +0200
Subject: [PATCH 50/70] Skip building wasm-bindgen-test on non-wasm targets

This reduces compilation time
---
 crates/core_simd/Cargo.toml | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index 7435e24edd30f..d1a3a515a7e81 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -15,11 +15,9 @@ std = []
 generic_const_exprs = []
 all_lane_counts = []
 
-[target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen]
-version = "0.2"
-
-[dev-dependencies.wasm-bindgen-test]
-version = "0.3"
+[target.'cfg(target_arch = "wasm32")'.dev-dependencies]
+wasm-bindgen = "0.2"
+wasm-bindgen-test = "0.3"
 
 [dev-dependencies.proptest]
 version = "0.10"

From 90f2af774ae3149ad52ec6bb2d48649b72844a2c Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 26 Mar 2023 16:11:05 -0400
Subject: [PATCH 51/70] Fix lint

---
 crates/test_helpers/src/array.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crates/test_helpers/src/array.rs b/crates/test_helpers/src/array.rs
index 5ffc922697694..984a427320deb 100644
--- a/crates/test_helpers/src/array.rs
+++ b/crates/test_helpers/src/array.rs
@@ -41,6 +41,7 @@ where
 
     fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
         let tree: [S::Tree; LANES] = unsafe {
+            #[allow(clippy::uninit_assumed_init)]
             let mut tree: [MaybeUninit<S::Tree>; LANES] = MaybeUninit::uninit().assume_init();
             for t in tree.iter_mut() {
                 *t = MaybeUninit::new(self.strategy.new_tree(runner)?)
@@ -60,6 +61,7 @@ impl<T: ValueTree, const LANES: usize> ValueTree for ArrayValueTree<[T; LANES]>
 
     fn current(&self) -> Self::Value {
         unsafe {
+            #[allow(clippy::uninit_assumed_init)]
             let mut value: [MaybeUninit<T::Value>; LANES] = MaybeUninit::uninit().assume_init();
             for (tree_elem, value_elem) in self.tree.iter().zip(value.iter_mut()) {
                 *value_elem = MaybeUninit::new(tree_elem.current());

From ceb26115928c5c69b10268fd2f9e500865c142d6 Mon Sep 17 00:00:00 2001
From: Jubilee <46493976+workingjubilee@users.noreply.github.com>
Date: Sun, 9 Apr 2023 21:26:40 -0700
Subject: [PATCH 52/70] Remove formats `[T; N]` does not impl
 (rust-lang/portable-simd#337)

Remove these extra formatting traits, as they are
inconsistent with how arrays and slices format,
and it can cause unnecessary code bloat in binaries.
We can revisit this if people ever agree on doing these
formatters for the other slice-y types.

Prefer to dispatch to the `impl fmt::Debug for [T]`,
to reduce the chances of monomorphizing twice.
Inlining it seems like a good idea for similar reasons?
---
 crates/core_simd/src/fmt.rs | 50 ++++++++++++-------------------------
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/crates/core_simd/src/fmt.rs b/crates/core_simd/src/fmt.rs
index dbd9839c4bfe9..b7317969cbb49 100644
--- a/crates/core_simd/src/fmt.rs
+++ b/crates/core_simd/src/fmt.rs
@@ -1,39 +1,21 @@
 use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::fmt;
 
-macro_rules! impl_fmt_trait {
-    { $($trait:ident,)* } => {
-        $(
-            impl<T, const LANES: usize> fmt::$trait for Simd<T, LANES>
-            where
-                LaneCount<LANES>: SupportedLaneCount,
-                T: SimdElement + fmt::$trait,
-            {
-                fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-                    #[repr(transparent)]
-                    struct Wrapper<'a, T: fmt::$trait>(&'a T);
-
-                    impl<T: fmt::$trait> fmt::Debug for Wrapper<'_, T> {
-                        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-                            self.0.fmt(f)
-                        }
-                    }
-
-                    f.debug_list()
-                        .entries(self.as_array().iter().map(|x| Wrapper(x)))
-                        .finish()
-                }
-            }
-        )*
+impl<T, const LANES: usize> fmt::Debug for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + fmt::Debug,
+{
+    /// A `Simd<T, N>` has a debug format like the one for `[T]`:
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd::Simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd::Simd;
+    /// let floats = Simd::<f32, 4>::splat(-1.0);
+    /// assert_eq!(format!("{:?}", [-1.0; 4]), format!("{:?}", floats));
+    /// ```
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        <[T] as fmt::Debug>::fmt(self.as_array(), f)
     }
 }
-
-impl_fmt_trait! {
-    Debug,
-    Binary,
-    LowerExp,
-    UpperExp,
-    Octal,
-    LowerHex,
-    UpperHex,
-}

From afad9c3f644ddbfef3301f617cb9d23ca4e71fe0 Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling@gmail.com>
Date: Sat, 22 Apr 2023 21:12:35 +0000
Subject: [PATCH 53/70] Don't use direct field access in `Simd` functions

---
 crates/core_simd/src/lib.rs    |  2 ++
 crates/core_simd/src/vector.rs | 26 ++++++++++++++++++--------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 927b1654f8e8e..a372e2e40c475 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,6 +1,8 @@
 #![no_std]
 #![feature(
     const_ptr_read,
+    const_refs_to_cell,
+    const_transmute_copy,
     convert_float_to_int,
     decl_macro,
     intra_doc_pointers,
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 3e39f1d623ce4..c1af4af5f57e7 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -135,22 +135,32 @@ where
     /// assert_eq!(v.as_array(), &[0, 1, 2, 3]);
     /// ```
     pub const fn as_array(&self) -> &[T; LANES] {
-        &self.0
+        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+        // is always valid and `Simd<T, LANES>` never has a lower alignment
+        // than `[T; LANES]`.
+        unsafe { &*(self as *const Self as *const [T; LANES]) }
     }
 
     /// Returns a mutable array reference containing the entire SIMD vector.
     pub fn as_mut_array(&mut self) -> &mut [T; LANES] {
-        &mut self.0
+        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+        // is always valid and `Simd<T, LANES>` never has a lower alignment
+        // than `[T; LANES]`.
+        unsafe { &mut *(self as *mut Self as *mut [T; LANES]) }
     }
 
     /// Converts an array to a SIMD vector.
     pub const fn from_array(array: [T; LANES]) -> Self {
-        Self(array)
+        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+        // is always valid.
+        unsafe { core::mem::transmute_copy(&array) }
     }
 
     /// Converts a SIMD vector to an array.
     pub const fn to_array(self) -> [T; LANES] {
-        self.0
+        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+        // is always valid.
+        unsafe { core::mem::transmute_copy(&self) }
     }
 
     /// Converts a slice to a SIMD vector containing `slice[..LANES]`.
@@ -735,7 +745,7 @@ where
 {
     #[inline]
     fn as_ref(&self) -> &[T; LANES] {
-        &self.0
+        self.as_array()
     }
 }
 
@@ -746,7 +756,7 @@ where
 {
     #[inline]
     fn as_mut(&mut self) -> &mut [T; LANES] {
-        &mut self.0
+        self.as_mut_array()
     }
 }
 
@@ -758,7 +768,7 @@ where
 {
     #[inline]
     fn as_ref(&self) -> &[T] {
-        &self.0
+        self.as_array()
     }
 }
 
@@ -769,7 +779,7 @@ where
 {
     #[inline]
     fn as_mut(&mut self) -> &mut [T] {
-        &mut self.0
+        self.as_mut_array()
     }
 }
 

From 52833ccbe88ed98b73d0ccd7299f2a667439bb4b Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling@gmail.com>
Date: Sat, 22 Apr 2023 23:02:45 +0000
Subject: [PATCH 54/70] Add notes to avoid direct field accesses

---
 crates/core_simd/src/vector.rs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index c1af4af5f57e7..eee105ff5fc67 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -76,6 +76,11 @@ use crate::simd::{
 /// [`read`]: pointer::read
 /// [`write`]: pointer::write
 /// [as_simd]: slice::as_simd
+//
+// NOTE: Accessing the inner array directly in any way (e.g. by using the `.0` field syntax) or
+// directly constructing an instance of the type (i.e. `let vector = Simd(array)`) should be
+// avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also
+// causes rustc to emit illegal LLVM IR in some cases.
 #[repr(simd)]
 pub struct Simd<T, const LANES: usize>([T; LANES])
 where
@@ -138,6 +143,9 @@ where
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
         // is always valid and `Simd<T, LANES>` never has a lower alignment
         // than `[T; LANES]`.
+        //
+        // NOTE: This deliberately doesn't just use `&self.0`, see the comment
+        // on the struct definition for details.
         unsafe { &*(self as *const Self as *const [T; LANES]) }
     }
 
@@ -146,6 +154,9 @@ where
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
         // is always valid and `Simd<T, LANES>` never has a lower alignment
         // than `[T; LANES]`.
+        //
+        // NOTE: This deliberately doesn't just use `&mut self.0`, see the comment
+        // on the struct definition for details.
         unsafe { &mut *(self as *mut Self as *mut [T; LANES]) }
     }
 
@@ -153,6 +164,9 @@ where
     pub const fn from_array(array: [T; LANES]) -> Self {
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
         // is always valid.
+        //
+        // NOTE: This deliberately doesn't just use `Self(array)`, see the comment
+        // on the struct definition for details.
         unsafe { core::mem::transmute_copy(&array) }
     }
 
@@ -160,6 +174,9 @@ where
     pub const fn to_array(self) -> [T; LANES] {
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
         // is always valid.
+        //
+        // NOTE: This deliberately doesn't just use `self.0`, see the comment
+        // on the struct definition for details.
         unsafe { core::mem::transmute_copy(&self) }
     }
 

From f1b86baf8453733c72e196ce2c08b4d85e94d81a Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling@gmail.com>
Date: Sat, 22 Apr 2023 23:22:39 +0000
Subject: [PATCH 55/70] Use pointer reads for better codegen in debug mode

---
 crates/core_simd/src/lib.rs    |  1 -
 crates/core_simd/src/vector.rs | 18 ++++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index a372e2e40c475..e054d483ca5d2 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -2,7 +2,6 @@
 #![feature(
     const_ptr_read,
     const_refs_to_cell,
-    const_transmute_copy,
     convert_float_to_int,
     decl_macro,
     intra_doc_pointers,
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index eee105ff5fc67..a38d701588ce0 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -163,21 +163,31 @@ where
     /// Converts an array to a SIMD vector.
     pub const fn from_array(array: [T; LANES]) -> Self {
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
-        // is always valid.
+        // is always valid. We need to use `read_unaligned` here, since
+        // the array may have a lower alignment than the vector.
+        //
+        // FIXME: We currently use a pointer read instead of `transmute_copy` because
+        // it results in better codegen with optimizations disabled, but we should
+        // probably just use `transmute` once that works on const generic types.
         //
         // NOTE: This deliberately doesn't just use `Self(array)`, see the comment
         // on the struct definition for details.
-        unsafe { core::mem::transmute_copy(&array) }
+        unsafe { (&array as *const [T; LANES] as *const Self).read_unaligned() }
     }
 
     /// Converts a SIMD vector to an array.
     pub const fn to_array(self) -> [T; LANES] {
         // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
-        // is always valid.
+        // is always valid. No need to use `read_unaligned` here, since
+        // the vector never has a lower alignment than the array.
+        //
+        // FIXME: We currently use a pointer read instead of `transmute_copy` because
+        // it results in better codegen with optimizations disabled, but we should
+        // probably just use `transmute` once that works on const generic types.
         //
         // NOTE: This deliberately doesn't just use `self.0`, see the comment
         // on the struct definition for details.
-        unsafe { core::mem::transmute_copy(&self) }
+        unsafe { (&self as *const Self as *const [T; LANES]).read() }
     }
 
     /// Converts a slice to a SIMD vector containing `slice[..LANES]`.

From 71d4c368509536f7277e9a1cb6e6286ba6de7911 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Fri, 17 Mar 2023 17:56:45 -0700
Subject: [PATCH 56/70] lane -> element for core::simd::Simd

A while ago we began saying T, N instead of T, LANES in reference to Simd.
At some point that leaked in to us checking in code with const N: usize.
After a while, we had a discussion and agreed that "lanes", while common,
is unnecessary jargon for Rust learners who aren't familiar with SIMD, and
is fully interchangeable with terms for arrays like element and index.

But we never acted on that. Let's update the main type's docs, at least.
The example tweaks also enable removing a slated-for-removal nightly fn.
---
 crates/core_simd/src/vector.rs | 390 +++++++++++++++++----------------
 1 file changed, 199 insertions(+), 191 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index a38d701588ce0..154b467752b6d 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -3,48 +3,55 @@ use crate::simd::{
     SimdPartialOrd, SupportedLaneCount, Swizzle,
 };
 
-/// A SIMD vector of `LANES` elements of type `T`. `Simd<T, N>` has the same shape as [`[T; N]`](array), but operates like `T`.
+/// A SIMD vector with the shape of `[T; N]` but the operations of `T`.
 ///
-/// Two vectors of the same type and length will, by convention, support the operators (+, *, etc.) that `T` does.
-/// These take the lanes at each index on the left-hand side and right-hand side, perform the operation,
-/// and return the result in the same lane in a vector of equal size. For a given operator, this is equivalent to zipping
-/// the two arrays together and mapping the operator over each lane.
+/// `Simd<T, N>` supports the operators (+, *, etc.) that `T` does in "elementwise" fashion.
+/// These take the element at each index from the left-hand side and right-hand side,
+/// perform the operation, then return the result in the same index in a vector of equal size.
+/// In other words, an elementwise operation is equivalent to a zip, then map.
 ///
 /// ```rust
-/// # #![feature(array_zip, portable_simd)]
+/// # #![feature(portable_simd)]
 /// # use core::simd::{Simd};
-/// let a0: [i32; 4] = [-2, 0, 2, 4];
-/// let a1 = [10, 9, 8, 7];
-/// let zm_add = a0.zip(a1).map(|(lhs, rhs)| lhs + rhs);
-/// let zm_mul = a0.zip(a1).map(|(lhs, rhs)| lhs * rhs);
+/// # use core::array;
+/// let a: [i32; 4] = [-2, 0, 2, 4];
+/// let b = [10, 9, 8, 7];
+/// let sum = array::from_fn(|i| a[i] + b[i]);
+/// let prod = array::from_fn(|i| a[i] * b[i]);
 ///
 /// // `Simd<T, N>` implements `From<[T; N]>
-/// let (v0, v1) = (Simd::from(a0), Simd::from(a1));
+/// let (v, w) = (Simd::from(a), Simd::from(b));
 /// // Which means arrays implement `Into<Simd<T, N>>`.
-/// assert_eq!(v0 + v1, zm_add.into());
-/// assert_eq!(v0 * v1, zm_mul.into());
+/// assert_eq!(v + w, sum.into());
+/// assert_eq!(v * w, prod.into());
 /// ```
 ///
-/// `Simd` with integers has the quirk that these operations are also inherently wrapping, as if `T` was [`Wrapping<T>`].
+///
+/// `Simd` with integer elements treats operators as wrapping, as if `T` was [`Wrapping<T>`].
 /// Thus, `Simd` does not implement `wrapping_add`, because that is the default behavior.
 /// This means there is no warning on overflows, even in "debug" builds.
 /// For most applications where `Simd` is appropriate, it is "not a bug" to wrap,
 /// and even "debug builds" are unlikely to tolerate the loss of performance.
 /// You may want to consider using explicitly checked arithmetic if such is required.
-/// Division by zero still causes a panic, so you may want to consider using floating point numbers if that is unacceptable.
+/// Division by zero on integers still causes a panic, so
+/// you may want to consider using `f32` or `f64` if that is unacceptable.
 ///
 /// [`Wrapping<T>`]: core::num::Wrapping
 ///
 /// # Layout
-/// `Simd<T, N>` has a layout similar to `[T; N]` (identical "shapes"), but with a greater alignment.
+/// `Simd<T, N>` has a layout similar to `[T; N]` (identical "shapes"), with a greater alignment.
 /// `[T; N]` is aligned to `T`, but `Simd<T, N>` will have an alignment based on both `T` and `N`.
-/// It is thus sound to [`transmute`] `Simd<T, N>` to `[T; N]`, and will typically optimize to zero cost,
-/// but the reverse transmutation is more likely to require a copy the compiler cannot simply elide.
+/// Thus it is sound to [`transmute`] `Simd<T, N>` to `[T; N]` and should optimize to "zero cost",
+/// but the reverse transmutation may require a copy the compiler cannot simply elide.
 ///
 /// # ABI "Features"
-/// Due to Rust's safety guarantees, `Simd<T, N>` is currently passed to and from functions via memory, not SIMD registers,
-/// except as an optimization. `#[inline]` hints are recommended on functions that accept `Simd<T, N>` or return it.
-/// The need for this may be corrected in the future.
+/// Due to Rust's safety guarantees, `Simd<T, N>` is currently passed and returned via memory,
+/// not SIMD registers, except as an optimization. Using `#[inline]` on functions that accept
+/// `Simd<T, N>` or return it is recommended, at the cost of code generation time, as
+/// inlining SIMD-using functions can omit a large function prolog or epilog and thus
+/// improve both speed and code size. The need for this may be corrected in the future.
+///
+/// Using `#[inline(always)]` still requires additional care.
 ///
 /// # Safe SIMD with Unsafe Rust
 ///
@@ -55,18 +62,22 @@ use crate::simd::{
 /// Thus, when using `unsafe` Rust to read and write `Simd<T, N>` through [raw pointers], it is a good idea to first try with
 /// [`read_unaligned`] and [`write_unaligned`]. This is because:
 /// - [`read`] and [`write`] require full alignment (in this case, `Simd<T, N>`'s alignment)
-/// - the likely source for reading or destination for writing `Simd<T, N>` is [`[T]`](slice) and similar types, aligned to `T`
-/// - combining these actions would violate the `unsafe` contract and explode the program into a puff of **undefined behavior**
-/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned if it sees the optimization
-/// - most contemporary processors suffer no performance penalty for "unaligned" reads and writes that are aligned at runtime
+/// - `Simd<T, N>` is often read from or written to [`[T]`](slice) and other types aligned to `T`
+/// - combining these actions violates the `unsafe` contract and explodes the program into
+///   a puff of **undefined behavior**
+/// - the compiler can implicitly adjust layouts to make unaligned reads or writes fully aligned
+///   if it sees the optimization
+/// - most contemporary processors with "aligned" and "unaligned" read and write instructions
+///   exhibit no performance difference if the "unaligned" variant is aligned at runtime
 ///
-/// By imposing less obligations, unaligned functions are less likely to make the program unsound,
+/// Fewer obligations mean unaligned reads and writes are less likely to make the program unsound,
 /// and may be just as fast as stricter alternatives.
-/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for converting `[T]` to `[Simd<T, N>]`,
-/// and allows soundly operating on an aligned SIMD body, but it may cost more time when handling the scalar head and tail.
-/// If these are not sufficient, then it is most ideal to design data structures to be already aligned
-/// to the `Simd<T, N>` you wish to use before using `unsafe` Rust to read or write.
-/// More conventional ways to compensate for these facts, like materializing `Simd` to or from an array first,
+/// When trying to guarantee alignment, [`[T]::as_simd`][as_simd] is an option for
+/// converting `[T]` to `[Simd<T, N>]`, and allows soundly operating on an aligned SIMD body,
+/// but it may cost more time when handling the scalar head and tail.
+/// If these are not enough, it is most ideal to design data structures to be already aligned
+/// to `mem::align_of::<Simd<T, N>>()` before using `unsafe` Rust to read or write.
+/// Other ways to compensate for these facts, like materializing `Simd` to or from an array first,
 /// are handled by safe methods like [`Simd::from_array`] and [`Simd::from_slice`].
 ///
 /// [`transmute`]: core::mem::transmute
@@ -82,20 +93,20 @@ use crate::simd::{
 // avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also
 // causes rustc to emit illegal LLVM IR in some cases.
 #[repr(simd)]
-pub struct Simd<T, const LANES: usize>([T; LANES])
+pub struct Simd<T, const N: usize>([T; N])
 where
-    T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount;
+    LaneCount<N>: SupportedLaneCount,
+    T: SimdElement;
 
-impl<T, const LANES: usize> Simd<T, LANES>
+impl<T, const N: usize> Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
-    /// Number of lanes in this vector.
-    pub const LANES: usize = LANES;
+    /// Number of elements in this vector.
+    pub const N: usize = N;
 
-    /// Returns the number of lanes in this SIMD vector.
+    /// Returns the number of elements in this SIMD vector.
     ///
     /// # Examples
     ///
@@ -106,10 +117,10 @@ where
     /// assert_eq!(v.lanes(), 4);
     /// ```
     pub const fn lanes(&self) -> usize {
-        LANES
+        Self::N
     }
 
-    /// Constructs a new SIMD vector with all lanes set to the given value.
+    /// Constructs a new SIMD vector with all elements set to the given value.
     ///
     /// # Examples
     ///
@@ -120,11 +131,11 @@ where
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
     pub fn splat(value: T) -> Self {
-        // This is preferred over `[value; LANES]`, since it's explicitly a splat:
+        // This is preferred over `[value; N]`, since it's explicitly a splat:
         // https://github.com/rust-lang/rust/issues/97804
         struct Splat;
-        impl<const LANES: usize> Swizzle<1, LANES> for Splat {
-            const INDEX: [usize; LANES] = [0; LANES];
+        impl<const N: usize> Swizzle<1, N> for Splat {
+            const INDEX: [usize; N] = [0; N];
         }
         Splat::swizzle(Simd::<T, 1>::from([value]))
     }
@@ -139,30 +150,30 @@ where
     /// let v: u64x4 = Simd::from_array([0, 1, 2, 3]);
     /// assert_eq!(v.as_array(), &[0, 1, 2, 3]);
     /// ```
-    pub const fn as_array(&self) -> &[T; LANES] {
-        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
-        // is always valid and `Simd<T, LANES>` never has a lower alignment
-        // than `[T; LANES]`.
+    pub const fn as_array(&self) -> &[T; N] {
+        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
+        // is always valid and `Simd<T, N>` never has a lower alignment
+        // than `[T; N]`.
         //
         // NOTE: This deliberately doesn't just use `&self.0`, see the comment
         // on the struct definition for details.
-        unsafe { &*(self as *const Self as *const [T; LANES]) }
+        unsafe { &*(self as *const Self as *const [T; N]) }
     }
 
     /// Returns a mutable array reference containing the entire SIMD vector.
-    pub fn as_mut_array(&mut self) -> &mut [T; LANES] {
-        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
-        // is always valid and `Simd<T, LANES>` never has a lower alignment
-        // than `[T; LANES]`.
+    pub fn as_mut_array(&mut self) -> &mut [T; N] {
+        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
+        // is always valid and `Simd<T, N>` never has a lower alignment
+        // than `[T; N]`.
         //
         // NOTE: This deliberately doesn't just use `&mut self.0`, see the comment
         // on the struct definition for details.
-        unsafe { &mut *(self as *mut Self as *mut [T; LANES]) }
+        unsafe { &mut *(self as *mut Self as *mut [T; N]) }
     }
 
     /// Converts an array to a SIMD vector.
-    pub const fn from_array(array: [T; LANES]) -> Self {
-        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+    pub const fn from_array(array: [T; N]) -> Self {
+        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
         // is always valid. We need to use `read_unaligned` here, since
         // the array may have a lower alignment than the vector.
         //
@@ -172,12 +183,12 @@ where
         //
         // NOTE: This deliberately doesn't just use `Self(array)`, see the comment
         // on the struct definition for details.
-        unsafe { (&array as *const [T; LANES] as *const Self).read_unaligned() }
+        unsafe { (&array as *const [T; N] as *const Self).read_unaligned() }
     }
 
     /// Converts a SIMD vector to an array.
-    pub const fn to_array(self) -> [T; LANES] {
-        // SAFETY: Transmuting between `Simd<T, LANES>` and `[T; LANES]`
+    pub const fn to_array(self) -> [T; N] {
+        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
         // is always valid. No need to use `read_unaligned` here, since
         // the vector never has a lower alignment than the array.
         //
@@ -187,14 +198,14 @@ where
         //
         // NOTE: This deliberately doesn't just use `self.0`, see the comment
         // on the struct definition for details.
-        unsafe { (&self as *const Self as *const [T; LANES]).read() }
+        unsafe { (&self as *const Self as *const [T; N]).read() }
     }
 
-    /// Converts a slice to a SIMD vector containing `slice[..LANES]`.
+    /// Converts a slice to a SIMD vector containing `slice[..N]`.
     ///
     /// # Panics
     ///
-    /// Panics if the slice's length is less than the vector's `Simd::LANES`.
+    /// Panics if the slice's length is less than the vector's `Simd::N`.
     ///
     /// # Example
     ///
@@ -208,21 +219,21 @@ where
     #[must_use]
     pub const fn from_slice(slice: &[T]) -> Self {
         assert!(
-            slice.len() >= LANES,
-            "slice length must be at least the number of lanes"
+            slice.len() >= Self::N,
+            "slice length must be at least the number of elements"
         );
-        assert!(core::mem::size_of::<Self>() == LANES * core::mem::size_of::<T>());
+        assert!(core::mem::size_of::<Self>() == Self::N * core::mem::size_of::<T>());
         // Safety:
         // - We've checked the length is sufficient.
         // - `T` and `Simd<T, N>` are Copy types.
         unsafe { slice.as_ptr().cast::<Self>().read_unaligned() }
     }
 
-    /// Writes a SIMD vector to the first `LANES` elements of a slice.
+    /// Writes a SIMD vector to the first `N` elements of a slice.
     ///
     /// # Panics
     ///
-    /// Panics if the slice's length is less than the vector's `Simd::LANES`.
+    /// Panics if the slice's length is less than the vector's `Simd::N`.
     ///
     /// # Example
     ///
@@ -238,22 +249,22 @@ where
     /// ```
     pub fn copy_to_slice(self, slice: &mut [T]) {
         assert!(
-            slice.len() >= LANES,
-            "slice length must be at least the number of lanes"
+            slice.len() >= Self::N,
+            "slice length must be at least the number of elements"
         );
-        assert!(core::mem::size_of::<Self>() == LANES * core::mem::size_of::<T>());
+        assert!(core::mem::size_of::<Self>() == Self::N * core::mem::size_of::<T>());
         // Safety:
         // - We've checked the length is sufficient
         // - `T` and `Simd<T, N>` are Copy types.
         unsafe { slice.as_mut_ptr().cast::<Self>().write_unaligned(self) }
     }
 
-    /// Performs lanewise conversion of a SIMD vector's elements to another SIMD-valid type.
+    /// Performs elementwise conversion of a SIMD vector's elements to another SIMD-valid type.
     ///
-    /// This follows the semantics of Rust's `as` conversion for casting
-    /// integers to unsigned integers (interpreting as the other type, so `-1` to `MAX`),
-    /// and from floats to integers (truncating, or saturating at the limits) for each lane,
-    /// or vice versa.
+    /// This follows the semantics of Rust's `as` conversion for casting integers between
+    /// signed and unsigned (interpreting integers as two's complement, so `-1` to `U::MAX` and
+    /// `1 << (U::BITS - 1)` becoming `I::MIN`), and from floats to integers (truncating,
+    /// or saturating at the limits) for each element.
     ///
     /// # Examples
     /// ```
@@ -274,7 +285,7 @@ where
     #[must_use]
     #[inline]
     #[cfg(not(bootstrap))]
-    pub fn cast<U: SimdCast>(self) -> Simd<U, LANES>
+    pub fn cast<U: SimdCast>(self) -> Simd<U, N>
     where
         T: SimdCast,
     {
@@ -282,10 +293,10 @@ where
         unsafe { intrinsics::simd_as(self) }
     }
 
-    /// Lanewise casts pointers to another pointer type.
+    /// Casts a vector of pointers to another pointer type.
     #[must_use]
     #[inline]
-    pub fn cast_ptr<U>(self) -> Simd<U, LANES>
+    pub fn cast_ptr<U>(self) -> Simd<U, N>
     where
         T: SimdCastPtr<U>,
         U: SimdElement,
@@ -310,7 +321,7 @@ where
     /// [cast]: Simd::cast
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
-    pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, LANES>
+    pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, N>
     where
         T: core::convert::FloatToInt<I> + SimdCast,
         I: SimdCast,
@@ -320,79 +331,79 @@ where
     }
 
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
-    /// If an index is out-of-bounds, the lane is instead selected from the `or` vector.
+    /// If an index is out-of-bounds, the element is instead selected from the `or` vector.
     ///
     /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # use core::simd::Simd;
     /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);  // Note the index that is out-of-bounds
     /// let alt = Simd::from_array([-5, -4, -3, -2]);
     ///
-    /// let result = Simd::gather_or(&vec, idxs, alt); // Note the lane that is out-of-bounds.
+    /// let result = Simd::gather_or(&vec, idxs, alt);
     /// assert_eq!(result, Simd::from_array([-5, 13, 10, 15]));
     /// ```
     #[must_use]
     #[inline]
-    pub fn gather_or(slice: &[T], idxs: Simd<usize, LANES>, or: Self) -> Self {
+    pub fn gather_or(slice: &[T], idxs: Simd<usize, N>, or: Self) -> Self {
         Self::gather_select(slice, Mask::splat(true), idxs, or)
     }
 
-    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
-    /// If an index is out-of-bounds, the lane is set to the default value for the type.
+    /// Reads from indices in `slice` to construct a SIMD vector.
+    /// If an index is out-of-bounds, the element is set to the default given by `T: Default`.
     ///
     /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # use core::simd::Simd;
     /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);  // Note the index that is out-of-bounds
     ///
-    /// let result = Simd::gather_or_default(&vec, idxs); // Note the lane that is out-of-bounds.
+    /// let result = Simd::gather_or_default(&vec, idxs);
     /// assert_eq!(result, Simd::from_array([0, 13, 10, 15]));
     /// ```
     #[must_use]
     #[inline]
-    pub fn gather_or_default(slice: &[T], idxs: Simd<usize, LANES>) -> Self
+    pub fn gather_or_default(slice: &[T], idxs: Simd<usize, N>) -> Self
     where
         T: Default,
     {
         Self::gather_or(slice, idxs, Self::splat(T::default()))
     }
 
-    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If an index is disabled or is out-of-bounds, the lane is selected from the `or` vector.
+    /// Reads from indices in `slice` to construct a SIMD vector.
+    /// The mask `enable`s all `true` indices and disables all `false` indices.
+    /// If an index is disabled or is out-of-bounds, the element is selected from the `or` vector.
     ///
     /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # use core::simd::{Simd, Mask};
     /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let idxs = Simd::from_array([9, 3, 0, 5]); // Includes an out-of-bounds index
     /// let alt = Simd::from_array([-5, -4, -3, -2]);
-    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element
     ///
-    /// let result = Simd::gather_select(&vec, enable, idxs, alt); // Note the lane that is out-of-bounds.
+    /// let result = Simd::gather_select(&vec, enable, idxs, alt);
     /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2]));
     /// ```
     #[must_use]
     #[inline]
     pub fn gather_select(
         slice: &[T],
-        enable: Mask<isize, LANES>,
-        idxs: Simd<usize, LANES>,
+        enable: Mask<isize, N>,
+        idxs: Simd<usize, N>,
         or: Self,
     ) -> Self {
-        let enable: Mask<isize, LANES> = enable & idxs.simd_lt(Simd::splat(slice.len()));
-        // Safety: We have masked-off out-of-bounds lanes.
+        let enable: Mask<isize, N> = enable & idxs.simd_lt(Simd::splat(slice.len()));
+        // Safety: We have masked-off out-of-bounds indices.
         unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) }
     }
 
-    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If an index is disabled, the lane is selected from the `or` vector.
+    /// Reads from indices in `slice` to construct a SIMD vector.
+    /// The mask `enable`s all `true` indices and disables all `false` indices.
+    /// If an index is disabled, the element is selected from the `or` vector.
     ///
     /// # Safety
     ///
@@ -406,13 +417,13 @@ where
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
     /// # use simd::{Simd, SimdPartialOrd, Mask};
     /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let idxs = Simd::from_array([9, 3, 0, 5]); // Includes an out-of-bounds index
     /// let alt = Simd::from_array([-5, -4, -3, -2]);
-    /// let enable = Mask::from_array([true, true, true, false]); // Note the final mask lane.
+    /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element
     /// // If this mask was used to gather, it would be unsound. Let's fix that.
     /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len()));
     ///
-    /// // We have masked the OOB lane, so it's safe to gather now.
+    /// // The out-of-bounds index has been masked, so it's safe to gather now.
     /// let result = unsafe { Simd::gather_select_unchecked(&vec, enable, idxs, alt) };
     /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2]));
     /// ```
@@ -422,18 +433,18 @@ where
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn gather_select_unchecked(
         slice: &[T],
-        enable: Mask<isize, LANES>,
-        idxs: Simd<usize, LANES>,
+        enable: Mask<isize, N>,
+        idxs: Simd<usize, N>,
         or: Self,
     ) -> Self {
-        let base_ptr = Simd::<*const T, LANES>::splat(slice.as_ptr());
+        let base_ptr = Simd::<*const T, N>::splat(slice.as_ptr());
         // Ferris forgive me, I have done pointer arithmetic here.
         let ptrs = base_ptr.wrapping_add(idxs);
         // Safety: The caller is responsible for determining the indices are okay to read
         unsafe { Self::gather_select_ptr(ptrs, enable, or) }
     }
 
-    /// Read pointers elementwise into a SIMD vector.
+    /// Read elementwise from pointers into a SIMD vector.
     ///
     /// # Safety
     ///
@@ -454,7 +465,7 @@ where
     #[must_use]
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
-    pub unsafe fn gather_ptr(source: Simd<*const T, LANES>) -> Self
+    pub unsafe fn gather_ptr(source: Simd<*const T, N>) -> Self
     where
         T: Default,
     {
@@ -463,13 +474,14 @@ where
         unsafe { Self::gather_select_ptr(source, Mask::splat(true), Self::default()) }
     }
 
-    /// Conditionally read pointers elementwise into a SIMD vector.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If a lane is disabled, the lane is selected from the `or` vector and no read is performed.
+    /// Conditionally read elementwise from pointers into a SIMD vector.
+    /// The mask `enable`s all `true` pointers and disables all `false` pointers.
+    /// If a pointer is disabled, the element is selected from the `or` vector,
+    /// and no read is performed.
     ///
     /// # Safety
     ///
-    /// Enabled lanes must satisfy the same conditions as [`core::ptr::read`].
+    /// Enabled elements must satisfy the same conditions as [`core::ptr::read`].
     ///
     /// # Example
     /// ```
@@ -488,8 +500,8 @@ where
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn gather_select_ptr(
-        source: Simd<*const T, LANES>,
-        enable: Mask<isize, LANES>,
+        source: Simd<*const T, N>,
+        enable: Mask<isize, N>,
         or: Self,
     ) -> Self {
         // Safety: The caller is responsible for upholding all invariants
@@ -497,30 +509,31 @@ where
     }
 
     /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
-    /// If two lanes in the scattered vector would write to the same index
-    /// only the last lane is guaranteed to actually be written.
+    /// If an index is out-of-bounds, the write is suppressed without panicking.
+    /// If two elements in the scattered vector would write to the same index
+    /// only the last element is guaranteed to actually be written.
     ///
     /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # use core::simd::Simd;
     /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 0]);
+    /// let idxs = Simd::from_array([9, 3, 0, 0]); // Note the duplicate index.
     /// let vals = Simd::from_array([-27, 82, -41, 124]);
     ///
-    /// vals.scatter(&mut vec, idxs); // index 0 receives two writes.
+    /// vals.scatter(&mut vec, idxs); // two logical writes means the last wins.
     /// assert_eq!(vec, vec![124, 11, 12, 82, 14, 15, 16, 17, 18]);
     /// ```
     #[inline]
-    pub fn scatter(self, slice: &mut [T], idxs: Simd<usize, LANES>) {
+    pub fn scatter(self, slice: &mut [T], idxs: Simd<usize, N>) {
         self.scatter_select(slice, Mask::splat(true), idxs)
     }
 
-    /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If an enabled index is out-of-bounds, the lane is not written.
-    /// If two enabled lanes in the scattered vector would write to the same index,
-    /// only the last lane is guaranteed to actually be written.
+    /// Writes values from a SIMD vector to multiple potentially discontiguous indices in `slice`.
+    /// The mask `enable`s all `true` indices and disables all `false` indices.
+    /// If an enabled index is out-of-bounds, the write is suppressed without panicking.
+    /// If two enabled elements in the scattered vector would write to the same index,
+    /// only the last element is guaranteed to actually be written.
     ///
     /// # Examples
     /// ```
@@ -529,29 +542,24 @@ where
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
     /// # use simd::{Simd, Mask};
     /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
-    /// let idxs = Simd::from_array([9, 3, 0, 0]);
+    /// let idxs = Simd::from_array([9, 3, 0, 0]); // Includes an out-of-bounds index
     /// let vals = Simd::from_array([-27, 82, -41, 124]);
-    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    /// let enable = Mask::from_array([true, true, true, false]); // Includes a masked element
     ///
-    /// vals.scatter_select(&mut vec, enable, idxs); // index 0's second write is masked, thus omitted.
+    /// vals.scatter_select(&mut vec, enable, idxs); // The last write is masked, thus omitted.
     /// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]);
     /// ```
     #[inline]
-    pub fn scatter_select(
-        self,
-        slice: &mut [T],
-        enable: Mask<isize, LANES>,
-        idxs: Simd<usize, LANES>,
-    ) {
-        let enable: Mask<isize, LANES> = enable & idxs.simd_lt(Simd::splat(slice.len()));
-        // Safety: We have masked-off out-of-bounds lanes.
+    pub fn scatter_select(self, slice: &mut [T], enable: Mask<isize, N>, idxs: Simd<usize, N>) {
+        let enable: Mask<isize, N> = enable & idxs.simd_lt(Simd::splat(slice.len()));
+        // Safety: We have masked-off out-of-bounds indices.
         unsafe { self.scatter_select_unchecked(slice, enable, idxs) }
     }
 
-    /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If two enabled lanes in the scattered vector would write to the same index,
-    /// only the last lane is guaranteed to actually be written.
+    /// Writes values from a SIMD vector to multiple potentially discontiguous indices in `slice`.
+    /// The mask `enable`s all `true` indices and disables all `false` indices.
+    /// If two enabled elements in the scattered vector would write to the same index,
+    /// only the last element is guaranteed to actually be written.
     ///
     /// # Safety
     ///
@@ -567,13 +575,13 @@ where
     /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
     /// let idxs = Simd::from_array([9, 3, 0, 0]);
     /// let vals = Simd::from_array([-27, 82, -41, 124]);
-    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    /// let enable = Mask::from_array([true, true, true, false]); // Masks the final index
     /// // If this mask was used to scatter, it would be unsound. Let's fix that.
     /// let enable = enable & idxs.simd_lt(Simd::splat(vec.len()));
     ///
-    /// // We have masked the OOB lane, so it's safe to scatter now.
+    /// // We have masked the OOB index, so it's safe to scatter now.
     /// unsafe { vals.scatter_select_unchecked(&mut vec, enable, idxs); }
-    /// // index 0's second write is masked, thus was omitted.
+    /// // The second write to index 0 was masked, thus omitted.
     /// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]);
     /// ```
     /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
@@ -582,8 +590,8 @@ where
     pub unsafe fn scatter_select_unchecked(
         self,
         slice: &mut [T],
-        enable: Mask<isize, LANES>,
-        idxs: Simd<usize, LANES>,
+        enable: Mask<isize, N>,
+        idxs: Simd<usize, N>,
     ) {
         // Safety: This block works with *mut T derived from &mut 'a [T],
         // which means it is delicate in Rust's borrowing model, circa 2021:
@@ -597,7 +605,7 @@ where
         // 3. &mut [T] which will become our base ptr.
         unsafe {
             // Now Entering ☢️ *mut T Zone
-            let base_ptr = Simd::<*mut T, LANES>::splat(slice.as_mut_ptr());
+            let base_ptr = Simd::<*mut T, N>::splat(slice.as_mut_ptr());
             // Ferris forgive me, I have done pointer arithmetic here.
             let ptrs = base_ptr.wrapping_add(idxs);
             // The ptrs have been bounds-masked to prevent memory-unsafe writes insha'allah
@@ -626,18 +634,18 @@ where
     /// ```
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
-    pub unsafe fn scatter_ptr(self, dest: Simd<*mut T, LANES>) {
+    pub unsafe fn scatter_ptr(self, dest: Simd<*mut T, N>) {
         // Safety: The caller is responsible for upholding all invariants
         unsafe { self.scatter_select_ptr(dest, Mask::splat(true)) }
     }
 
     /// Conditionally write pointers elementwise into a SIMD vector.
-    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
-    /// If a lane is disabled, the write to that lane is skipped.
+    /// The mask `enable`s all `true` pointers and disables all `false` pointers.
+    /// If a pointer is disabled, the write to its pointee is skipped.
     ///
     /// # Safety
     ///
-    /// Enabled lanes must satisfy the same conditions as [`core::ptr::write`].
+    /// Enabled pointers must satisfy the same conditions as [`core::ptr::write`].
     ///
     /// # Example
     /// ```
@@ -654,32 +662,32 @@ where
     /// ```
     #[inline]
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
-    pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, LANES>, enable: Mask<isize, LANES>) {
+    pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, N>, enable: Mask<isize, N>) {
         // Safety: The caller is responsible for upholding all invariants
         unsafe { intrinsics::simd_scatter(self, dest, enable.to_int()) }
     }
 }
 
-impl<T, const LANES: usize> Copy for Simd<T, LANES>
+impl<T, const N: usize> Copy for Simd<T, N>
 where
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Clone for Simd<T, LANES>
+impl<T, const N: usize> Clone for Simd<T, N>
 where
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount,
 {
     fn clone(&self) -> Self {
         *self
     }
 }
 
-impl<T, const LANES: usize> Default for Simd<T, LANES>
+impl<T, const N: usize> Default for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + Default,
 {
     #[inline]
@@ -688,20 +696,20 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialEq for Simd<T, LANES>
+impl<T, const N: usize> PartialEq for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + PartialEq,
 {
     #[inline]
     fn eq(&self, other: &Self) -> bool {
         // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask.
         let mask = unsafe {
-            let tfvec: Simd<<T as SimdElement>::Mask, LANES> = intrinsics::simd_eq(*self, *other);
+            let tfvec: Simd<<T as SimdElement>::Mask, N> = intrinsics::simd_eq(*self, *other);
             Mask::from_int_unchecked(tfvec)
         };
 
-        // Two vectors are equal if all lanes tested true for vertical equality.
+        // Two vectors are equal if they are elementwise equal
         mask.all()
     }
 
@@ -710,18 +718,18 @@ where
     fn ne(&self, other: &Self) -> bool {
         // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask.
         let mask = unsafe {
-            let tfvec: Simd<<T as SimdElement>::Mask, LANES> = intrinsics::simd_ne(*self, *other);
+            let tfvec: Simd<<T as SimdElement>::Mask, N> = intrinsics::simd_ne(*self, *other);
             Mask::from_int_unchecked(tfvec)
         };
 
-        // Two vectors are non-equal if any lane tested true for vertical non-equality.
+        // Two vectors are non-equal if they are elementwise non-equal
         mask.any()
     }
 }
 
-impl<T, const LANES: usize> PartialOrd for Simd<T, LANES>
+impl<T, const N: usize> PartialOrd for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + PartialOrd,
 {
     #[inline]
@@ -731,16 +739,16 @@ where
     }
 }
 
-impl<T, const LANES: usize> Eq for Simd<T, LANES>
+impl<T, const N: usize> Eq for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + Eq,
 {
 }
 
-impl<T, const LANES: usize> Ord for Simd<T, LANES>
+impl<T, const N: usize> Ord for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + Ord,
 {
     #[inline]
@@ -750,9 +758,9 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::hash::Hash for Simd<T, LANES>
+impl<T, const N: usize> core::hash::Hash for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + core::hash::Hash,
 {
     #[inline]
@@ -765,32 +773,32 @@ where
 }
 
 // array references
-impl<T, const LANES: usize> AsRef<[T; LANES]> for Simd<T, LANES>
+impl<T, const N: usize> AsRef<[T; N]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     #[inline]
-    fn as_ref(&self) -> &[T; LANES] {
+    fn as_ref(&self) -> &[T; N] {
         self.as_array()
     }
 }
 
-impl<T, const LANES: usize> AsMut<[T; LANES]> for Simd<T, LANES>
+impl<T, const N: usize> AsMut<[T; N]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     #[inline]
-    fn as_mut(&mut self) -> &mut [T; LANES] {
+    fn as_mut(&mut self) -> &mut [T; N] {
         self.as_mut_array()
     }
 }
 
 // slice references
-impl<T, const LANES: usize> AsRef<[T]> for Simd<T, LANES>
+impl<T, const N: usize> AsRef<[T]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     #[inline]
@@ -799,9 +807,9 @@ where
     }
 }
 
-impl<T, const LANES: usize> AsMut<[T]> for Simd<T, LANES>
+impl<T, const N: usize> AsMut<[T]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     #[inline]
@@ -811,29 +819,29 @@ where
 }
 
 // vector/array conversion
-impl<T, const LANES: usize> From<[T; LANES]> for Simd<T, LANES>
+impl<T, const N: usize> From<[T; N]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
-    fn from(array: [T; LANES]) -> Self {
+    fn from(array: [T; N]) -> Self {
         Self(array)
     }
 }
 
-impl<T, const LANES: usize> From<Simd<T, LANES>> for [T; LANES]
+impl<T, const N: usize> From<Simd<T, N>> for [T; N]
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
-    fn from(vector: Simd<T, LANES>) -> Self {
+    fn from(vector: Simd<T, N>) -> Self {
         vector.to_array()
     }
 }
 
-impl<T, const LANES: usize> TryFrom<&[T]> for Simd<T, LANES>
+impl<T, const N: usize> TryFrom<&[T]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     type Error = core::array::TryFromSliceError;
@@ -843,9 +851,9 @@ where
     }
 }
 
-impl<T, const LANES: usize> TryFrom<&mut [T]> for Simd<T, LANES>
+impl<T, const N: usize> TryFrom<&mut [T]> for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
     type Error = core::array::TryFromSliceError;

From 92259a4a6c20b02e87e0589a286bef7b71cd95a9 Mon Sep 17 00:00:00 2001
From: Jubilee <46493976+workingjubilee@users.noreply.github.com>
Date: Mon, 10 Apr 2023 00:11:37 -0700
Subject: [PATCH 57/70] Clarify elementwise cmp reduces

Saying "elementwise (non-)equal" may suggest it returns a vector.
The comments should be clear that it instead reduces to a scalar.

Co-authored-by: Jacob Lifshay <programmerjake@gmail.com>
---
 crates/core_simd/src/vector.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 154b467752b6d..b7b5e0b002f9b 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -709,7 +709,7 @@ where
             Mask::from_int_unchecked(tfvec)
         };
 
-        // Two vectors are equal if they are elementwise equal
+        // Two vectors are equal if all elements are equal when compared elementwise
         mask.all()
     }
 
@@ -722,7 +722,7 @@ where
             Mask::from_int_unchecked(tfvec)
         };
 
-        // Two vectors are non-equal if they are elementwise non-equal
+        // Two vectors are non-equal if any elements are non-equal when compared elementwise
         mask.any()
     }
 }

From 4064678dafd3907253353a1efc01bc0ada78c1bc Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Mon, 10 Apr 2023 22:06:01 -0700
Subject: [PATCH 58/70] Explain why to use Simd early

---
 crates/core_simd/src/vector.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index b7b5e0b002f9b..ef67fcfeee605 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -8,7 +8,12 @@ use crate::simd::{
 /// `Simd<T, N>` supports the operators (+, *, etc.) that `T` does in "elementwise" fashion.
 /// These take the element at each index from the left-hand side and right-hand side,
 /// perform the operation, then return the result in the same index in a vector of equal size.
-/// In other words, an elementwise operation is equivalent to a zip, then map.
+/// However, `Simd` differs from normal iteration and normal arrays:
+/// - `Simd<T, N>` executes `N` operations in a single step with no `break`s
+/// - `Simd<T, N>` can have an alignment greater than `T`, for better mechanical sympathy
+///
+/// By always imposing these constraints on `Simd`, it is easier to compile elementwise operations
+/// into machine instructions that can themselves be executed in parallel.
 ///
 /// ```rust
 /// # #![feature(portable_simd)]

From 2b32732d0f64a27560c9c4ca15e89bc454c482da Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sat, 22 Apr 2023 18:22:04 -0700
Subject: [PATCH 59/70] Do not construct Simd

---
 crates/core_simd/src/vector.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index ef67fcfeee605..106f1965959ac 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -830,7 +830,7 @@ where
     T: SimdElement,
 {
     fn from(array: [T; N]) -> Self {
-        Self(array)
+        Self::from_array(array)
     }
 }
 

From 4f0d8225fa6e503ba785da93b0ab900d597af133 Mon Sep 17 00:00:00 2001
From: Jubilee <46493976+workingjubilee@users.noreply.github.com>
Date: Sat, 22 Apr 2023 18:27:52 -0700
Subject: [PATCH 60/70] Implement dynamic byte-swizzle prototype
 (rust-lang/portable-simd#334)

This is meant to be an example implementation against which
a Rust intrinsic can be tested; that intrinsic will replace it.
The interface is fairly direct and doesn't address
more nuanced or interesting permutations one can do,
never mind permutations on types other than bytes.

The ultimate goal is direct LLVM support for this.
---
 crates/core_simd/src/mod.rs           |   2 +
 crates/core_simd/src/swizzle_dyn.rs   | 155 ++++++++++++++++++++++++++
 crates/core_simd/tests/swizzle_dyn.rs |  74 ++++++++++++
 3 files changed, 231 insertions(+)
 create mode 100644 crates/core_simd/src/swizzle_dyn.rs
 create mode 100644 crates/core_simd/tests/swizzle_dyn.rs

diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index ece026a448b73..35c659b7a429a 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -17,6 +17,7 @@ mod masks;
 mod ops;
 mod ord;
 mod select;
+mod swizzle_dyn;
 mod vector;
 mod vendor;
 
@@ -32,5 +33,6 @@ pub mod simd {
     pub use crate::core_simd::masks::*;
     pub use crate::core_simd::ord::*;
     pub use crate::core_simd::swizzle::*;
+    pub use crate::core_simd::swizzle_dyn::*;
     pub use crate::core_simd::vector::*;
 }
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
new file mode 100644
index 0000000000000..5c3a2c1824ff4
--- /dev/null
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -0,0 +1,155 @@
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use core::mem;
+
+impl<const N: usize> Simd<u8, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Swizzle a vector of bytes according to the index vector.
+    /// Indices within range select the appropriate byte.
+    /// Indices "out of bounds" instead select 0.
+    ///
+    /// Note that the current implementation is selected during build-time
+    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
+    /// to unlock better performance, especially for larger vectors.
+    /// A planned compiler improvement will enable using `#[target_feature]` instead.
+    #[inline]
+    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
+        #![allow(unused_imports, unused_unsafe)]
+        #[cfg(target_arch = "aarch64")]
+        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
+        #[cfg(all(target_arch = "arm", target_feature = "v7"))]
+        use core::arch::arm::{uint8x8_t, vtbl1_u8};
+        #[cfg(target_arch = "wasm32")]
+        use core::arch::wasm32 as wasm;
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64 as x86;
+        // SAFETY: Intrinsics covered by cfg
+        unsafe {
+            match N {
+                #[cfg(target_feature = "neon")]
+                8 => transize(vtbl1_u8, self, idxs),
+                #[cfg(target_feature = "ssse3")]
+                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
+                #[cfg(target_feature = "simd128")]
+                16 => transize(wasm::i8x16_swizzle, self, idxs),
+                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
+                32 => transize_raw(avx2_pshufb, self, idxs),
+                #[cfg(target_feature = "avx512vl,avx512vbmi")]
+                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
+                // Notable absence: avx512bw shuffle
+                // If avx512bw is available, odds of avx512vbmi are good
+                #[cfg(target_feature = "avx512vbmi")]
+                64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                _ => {
+                    let mut array = [0; N];
+                    for (i, k) in idxs.to_array().into_iter().enumerate() {
+                        if (k as usize) < N {
+                            array[i] = self[k as usize];
+                        };
+                    }
+                    array.into()
+                }
+            }
+        }
+    }
+}
+
+/// "vpshufb like it was meant to be" on AVX2
+///
+/// # Safety
+/// This requires AVX2 to work
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "avx2")]
+#[allow(unused)]
+#[inline]
+unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
+    use crate::simd::SimdPartialOrd;
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64 as x86;
+    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
+    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
+    let mid = Simd::splat(16u8);
+    let high = mid + mid;
+    // SAFETY: Caller promised AVX2
+    unsafe {
+        // This is ordering sensitive, and LLVM will order these how you put them.
+        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
+        // But the "compose" step will lower to ops that can also use at least 1 other port.
+        // So this tries to break up permutes so composition flows through "open" ports.
+        // Comparative benches should be done on multiple AVX2 CPUs before reordering this
+
+        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
+        let hi_shuf = Simd::from(avx2_half_pshufb(
+            hihi,        // duplicate the vector's top half
+            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
+        ));
+        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
+        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
+        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
+        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
+        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
+        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
+        compose
+    }
+}
+
+/// This sets up a call to an architecture-specific function, and in doing so
+/// it persuades rustc that everything is the correct size. Which it is.
+/// This would not be needed if one could convince Rust that, by matching on N,
+/// N is that value, and thus it would be valid to substitute e.g. 16.
+///
+/// # Safety
+/// The correctness of this function hinges on the sizes agreeing in actuality.
+#[allow(dead_code)]
+#[inline(always)]
+unsafe fn transize<T, const N: usize>(
+    f: unsafe fn(T, T) -> T,
+    bytes: Simd<u8, N>,
+    idxs: Simd<u8, N>,
+) -> Simd<u8, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    let idxs = zeroing_idxs(idxs);
+    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
+    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
+}
+
+/// Make indices that yield 0 for this architecture
+#[inline(always)]
+fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    // On x86, make sure the top bit is set.
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    let idxs = {
+        use crate::simd::SimdPartialOrd;
+        idxs.simd_lt(Simd::splat(N as u8))
+            .select(idxs, Simd::splat(u8::MAX))
+    };
+    // Simply do nothing on most architectures.
+    idxs
+}
+
+/// As transize but no implicit call to `zeroing_idxs`.
+#[allow(dead_code)]
+#[inline(always)]
+unsafe fn transize_raw<T, const N: usize>(
+    f: unsafe fn(T, T) -> T,
+    bytes: Simd<u8, N>,
+    idxs: Simd<u8, N>,
+) -> Simd<u8, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
+    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
+}
diff --git a/crates/core_simd/tests/swizzle_dyn.rs b/crates/core_simd/tests/swizzle_dyn.rs
new file mode 100644
index 0000000000000..646cd5f338335
--- /dev/null
+++ b/crates/core_simd/tests/swizzle_dyn.rs
@@ -0,0 +1,74 @@
+#![feature(portable_simd)]
+use core::{fmt, ops::RangeInclusive};
+use proptest;
+use test_helpers::{self, biteq, make_runner, prop_assert_biteq};
+
+fn swizzle_dyn_scalar_ver<const N: usize>(values: [u8; N], idxs: [u8; N]) -> [u8; N] {
+    let mut array = [0; N];
+    for (i, k) in idxs.into_iter().enumerate() {
+        if (k as usize) < N {
+            array[i] = values[k as usize];
+        };
+    }
+    array
+}
+
+test_helpers::test_lanes! {
+    fn swizzle_dyn<const N: usize>() {
+        match_simd_with_fallback(
+            &core_simd::simd::Simd::<u8, N>::swizzle_dyn,
+            &swizzle_dyn_scalar_ver,
+            &|_, _| true,
+        );
+    }
+}
+
+fn match_simd_with_fallback<Scalar, ScalarResult, Vector, VectorResult, const N: usize>(
+    fv: &dyn Fn(Vector, Vector) -> VectorResult,
+    fs: &dyn Fn([Scalar; N], [Scalar; N]) -> [ScalarResult; N],
+    check: &dyn Fn([Scalar; N], [Scalar; N]) -> bool,
+) where
+    Scalar: Copy + fmt::Debug + SwizzleStrategy,
+    ScalarResult: Copy + biteq::BitEq + fmt::Debug + SwizzleStrategy,
+    Vector: Into<[Scalar; N]> + From<[Scalar; N]> + Copy,
+    VectorResult: Into<[ScalarResult; N]> + From<[ScalarResult; N]> + Copy,
+{
+    test_swizzles_2(&|x: [Scalar; N], y: [Scalar; N]| {
+        proptest::prop_assume!(check(x, y));
+        let result_v: [ScalarResult; N] = fv(x.into(), y.into()).into();
+        let result_s: [ScalarResult; N] = fs(x, y);
+        crate::prop_assert_biteq!(result_v, result_s);
+        Ok(())
+    });
+}
+
+fn test_swizzles_2<A: fmt::Debug + SwizzleStrategy, B: fmt::Debug + SwizzleStrategy>(
+    f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult,
+) {
+    let mut runner = make_runner();
+    runner
+        .run(
+            &(A::swizzled_strategy(), B::swizzled_strategy()),
+            |(a, b)| f(a, b),
+        )
+        .unwrap();
+}
+
+pub trait SwizzleStrategy {
+    type Strategy: proptest::strategy::Strategy<Value = Self>;
+    fn swizzled_strategy() -> Self::Strategy;
+}
+
+impl SwizzleStrategy for u8 {
+    type Strategy = RangeInclusive<u8>;
+    fn swizzled_strategy() -> Self::Strategy {
+        0..=64
+    }
+}
+
+impl<T: fmt::Debug + SwizzleStrategy, const N: usize> SwizzleStrategy for [T; N] {
+    type Strategy = test_helpers::array::UniformArrayStrategy<T::Strategy, Self>;
+    fn swizzled_strategy() -> Self::Strategy {
+        Self::Strategy::new(T::swizzled_strategy())
+    }
+}

From 394a8845c699b5c6b47c6a17e2926a549f8801be Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 23 Apr 2023 14:52:38 -0400
Subject: [PATCH 61/70] Fix {to,from}_array UB when repr(simd) produces padding

---
 crates/core_simd/src/lib.rs    |  2 ++
 crates/core_simd/src/vector.rs | 56 +++++++++++++++++++++++++---------
 2 files changed, 44 insertions(+), 14 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index e054d483ca5d2..31e7a3617bc59 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -2,6 +2,8 @@
 #![feature(
     const_ptr_read,
     const_refs_to_cell,
+    const_maybe_uninit_as_mut_ptr,
+    const_mut_refs,
     convert_float_to_int,
     decl_macro,
     intra_doc_pointers,
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 106f1965959ac..8c6c703608142 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -176,34 +176,62 @@ where
         unsafe { &mut *(self as *mut Self as *mut [T; N]) }
     }
 
+    /// Load a vector from an array of `T`.
+    ///
+    /// This function is necessary since `repr(simd)` has padding for non-power-of-2 vectors (at the time of writing).
+    /// With padding, `read_unaligned` will read past the end of an array of N elements.
+    ///
+    /// # Safety
+    /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`.
+    const unsafe fn load(ptr: *const [T; N]) -> Self {
+        let mut tmp = core::mem::MaybeUninit::uninit();
+        // SAFETY: `Simd<T, N>` always contains `N` elements of type `T`.  It may have padding
+        // which does not need to be initialized.  The safety of reading `ptr` is ensured by the
+        // caller.
+        unsafe {
+            core::ptr::copy_nonoverlapping(ptr, tmp.as_mut_ptr() as *mut _, 1);
+            tmp.assume_init()
+        }
+    }
+
+    /// Store a vector to an array of `T`.
+    ///
+    /// See `load` as to why this function is necessary.
+    ///
+    /// # Safety
+    /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`.
+    const unsafe fn store(self, ptr: *mut [T; N]) {
+        // SAFETY: `Simd<T, N>` always contains `N` elements of type `T`.  The safety of writing
+        // `ptr` is ensured by the caller.
+        unsafe { core::ptr::copy_nonoverlapping(self.as_array(), ptr, 1) }
+    }
+
     /// Converts an array to a SIMD vector.
     pub const fn from_array(array: [T; N]) -> Self {
-        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
-        // is always valid. We need to use `read_unaligned` here, since
-        // the array may have a lower alignment than the vector.
+        // SAFETY: `&array` is safe to read.
         //
-        // FIXME: We currently use a pointer read instead of `transmute_copy` because
-        // it results in better codegen with optimizations disabled, but we should
-        // probably just use `transmute` once that works on const generic types.
+        // FIXME: We currently use a pointer load instead of `transmute_copy` because `repr(simd)`
+        // results in padding for non-power-of-2 vectors (so vectors are larger than arrays).
         //
         // NOTE: This deliberately doesn't just use `Self(array)`, see the comment
         // on the struct definition for details.
-        unsafe { (&array as *const [T; N] as *const Self).read_unaligned() }
+        unsafe { Self::load(&array) }
     }
 
     /// Converts a SIMD vector to an array.
     pub const fn to_array(self) -> [T; N] {
-        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
-        // is always valid. No need to use `read_unaligned` here, since
-        // the vector never has a lower alignment than the array.
+        let mut tmp = core::mem::MaybeUninit::uninit();
+        // SAFETY: writing to `tmp` is safe and initializes it.
         //
-        // FIXME: We currently use a pointer read instead of `transmute_copy` because
-        // it results in better codegen with optimizations disabled, but we should
-        // probably just use `transmute` once that works on const generic types.
+        // FIXME: We currently use a pointer store instead of `transmute_copy` because `repr(simd)`
+        // results in padding for non-power-of-2 vectors (so vectors are larger than arrays).
         //
         // NOTE: This deliberately doesn't just use `self.0`, see the comment
         // on the struct definition for details.
-        unsafe { (&self as *const Self as *const [T; N]).read() }
+        unsafe {
+            self.store(tmp.as_mut_ptr());
+            tmp.assume_init()
+        }
     }
 
     /// Converts a slice to a SIMD vector containing `slice[..N]`.

From c504f01abeba606a5fa7d081ed8aec25d118a486 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Tue, 25 Apr 2023 21:37:04 -0400
Subject: [PATCH 62/70] Use cast and improve comments

---
 crates/core_simd/src/vector.rs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 8c6c703608142..92984f55e45a5 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -184,12 +184,15 @@ where
     /// # Safety
     /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`.
     const unsafe fn load(ptr: *const [T; N]) -> Self {
-        let mut tmp = core::mem::MaybeUninit::uninit();
+        // There are potentially simpler ways to write this function, but this should result in
+        // LLVM `load <N x T>`
+
+        let mut tmp = core::mem::MaybeUninit::<Self>::uninit();
         // SAFETY: `Simd<T, N>` always contains `N` elements of type `T`.  It may have padding
         // which does not need to be initialized.  The safety of reading `ptr` is ensured by the
         // caller.
         unsafe {
-            core::ptr::copy_nonoverlapping(ptr, tmp.as_mut_ptr() as *mut _, 1);
+            core::ptr::copy_nonoverlapping(ptr, tmp.as_mut_ptr().cast(), 1);
             tmp.assume_init()
         }
     }
@@ -201,9 +204,14 @@ where
     /// # Safety
     /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`.
     const unsafe fn store(self, ptr: *mut [T; N]) {
+        // There are potentially simpler ways to write this function, but this should result in
+        // LLVM `store <N x T>`
+
+        // Creating a temporary helps LLVM turn the memcpy into a store.
+        let tmp = self;
         // SAFETY: `Simd<T, N>` always contains `N` elements of type `T`.  The safety of writing
         // `ptr` is ensured by the caller.
-        unsafe { core::ptr::copy_nonoverlapping(self.as_array(), ptr, 1) }
+        unsafe { core::ptr::copy_nonoverlapping(tmp.as_array(), ptr, 1) }
     }
 
     /// Converts an array to a SIMD vector.

From 4967f25f6bf930a5f79d5c66f2ffc53159d43c4a Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling@gmail.com>
Date: Sun, 7 May 2023 00:11:56 +0000
Subject: [PATCH 63/70] Use the new `load`/`store` functions in
 `{from,to}_slice`

---
 crates/core_simd/src/vector.rs | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 92984f55e45a5..a793ae9e391bf 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -263,11 +263,9 @@ where
             slice.len() >= Self::N,
             "slice length must be at least the number of elements"
         );
-        assert!(core::mem::size_of::<Self>() == Self::N * core::mem::size_of::<T>());
-        // Safety:
-        // - We've checked the length is sufficient.
-        // - `T` and `Simd<T, N>` are Copy types.
-        unsafe { slice.as_ptr().cast::<Self>().read_unaligned() }
+        // SAFETY: We just checked that the slice contains
+        // at least `N` elements.
+        unsafe { Self::load(slice.as_ptr().cast()) }
     }
 
     /// Writes a SIMD vector to the first `N` elements of a slice.
@@ -293,11 +291,9 @@ where
             slice.len() >= Self::N,
             "slice length must be at least the number of elements"
         );
-        assert!(core::mem::size_of::<Self>() == Self::N * core::mem::size_of::<T>());
-        // Safety:
-        // - We've checked the length is sufficient
-        // - `T` and `Simd<T, N>` are Copy types.
-        unsafe { slice.as_mut_ptr().cast::<Self>().write_unaligned(self) }
+        // SAFETY: We just checked that the slice contains
+        // at least `N` elements.
+        unsafe { self.store(slice.as_mut_ptr().cast()) }
     }
 
     /// Performs elementwise conversion of a SIMD vector's elements to another SIMD-valid type.

From b246e454387ef2d80078db36975d2df5d957f9fa Mon Sep 17 00:00:00 2001
From: Markus Everling <markuseverling@gmail.com>
Date: Sun, 7 May 2023 00:15:18 +0000
Subject: [PATCH 64/70] Fix inaccurate safety comments

---
 crates/core_simd/src/vector.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 92984f55e45a5..ff761fc900fab 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -156,9 +156,9 @@ where
     /// assert_eq!(v.as_array(), &[0, 1, 2, 3]);
     /// ```
     pub const fn as_array(&self) -> &[T; N] {
-        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
-        // is always valid and `Simd<T, N>` never has a lower alignment
-        // than `[T; N]`.
+        // SAFETY: `Simd<T, N>` is just an overaligned `[T; N]` with
+        // potential padding at the end, so pointer casting to a
+        // `&[T; N]` is safe.
         //
         // NOTE: This deliberately doesn't just use `&self.0`, see the comment
         // on the struct definition for details.
@@ -167,9 +167,9 @@ where
 
     /// Returns a mutable array reference containing the entire SIMD vector.
     pub fn as_mut_array(&mut self) -> &mut [T; N] {
-        // SAFETY: Transmuting between `Simd<T, N>` and `[T; N]`
-        // is always valid and `Simd<T, N>` never has a lower alignment
-        // than `[T; N]`.
+        // SAFETY: `Simd<T, N>` is just an overaligned `[T; N]` with
+        // potential padding at the end, so pointer casting to a
+        // `&mut [T; N]` is safe.
         //
         // NOTE: This deliberately doesn't just use `&mut self.0`, see the comment
         // on the struct definition for details.

From 8f50a17c37a214632c2f5cf5b8f2833a7286883b Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Sat, 22 Apr 2023 19:27:22 -0700
Subject: [PATCH 65/70] Fixups for sync

- Fix LANES over-replace
- Bring in traits
- Use less inference-heavy types
---
 crates/core_simd/src/vector.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 0253f122c98a2..3323b92e37bd0 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -2,6 +2,7 @@ use crate::simd::{
     intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdCastPtr, SimdConstPtr, SimdMutPtr,
     SimdPartialOrd, SupportedLaneCount, Swizzle,
 };
+use core::convert::{TryFrom, TryInto};
 
 /// A SIMD vector with the shape of `[T; N]` but the operations of `T`.
 ///
@@ -109,7 +110,7 @@ where
     T: SimdElement,
 {
     /// Number of elements in this vector.
-    pub const N: usize = N;
+    pub const LANES: usize = N;
 
     /// Returns the number of elements in this SIMD vector.
     ///
@@ -122,7 +123,7 @@ where
     /// assert_eq!(v.lanes(), 4);
     /// ```
     pub const fn lanes(&self) -> usize {
-        Self::N
+        Self::LANES
     }
 
     /// Constructs a new SIMD vector with all elements set to the given value.
@@ -260,7 +261,7 @@ where
     #[must_use]
     pub const fn from_slice(slice: &[T]) -> Self {
         assert!(
-            slice.len() >= Self::N,
+            slice.len() >= Self::LANES,
             "slice length must be at least the number of elements"
         );
         // SAFETY: We just checked that the slice contains
@@ -288,7 +289,7 @@ where
     /// ```
     pub fn copy_to_slice(self, slice: &mut [T]) {
         assert!(
-            slice.len() >= Self::N,
+            slice.len() >= Self::LANES,
             "slice length must be at least the number of elements"
         );
         // SAFETY: We just checked that the slice contains
@@ -883,7 +884,7 @@ where
 {
     type Error = core::array::TryFromSliceError;
 
-    fn try_from(slice: &[T]) -> Result<Self, Self::Error> {
+    fn try_from(slice: &[T]) -> Result<Self, core::array::TryFromSliceError> {
         Ok(Self::from_array(slice.try_into()?))
     }
 }
@@ -895,7 +896,7 @@ where
 {
     type Error = core::array::TryFromSliceError;
 
-    fn try_from(slice: &mut [T]) -> Result<Self, Self::Error> {
+    fn try_from(slice: &mut [T]) -> Result<Self, core::array::TryFromSliceError> {
         Ok(Self::from_array(slice.try_into()?))
     }
 }

From d361e4335f7e37d2409820510e059744d1c96457 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 10 May 2023 05:36:16 -0700
Subject: [PATCH 66/70] Drop const_ptr_read feature gate

---
 crates/core_simd/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 31e7a3617bc59..e5307de215520 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,6 +1,5 @@
 #![no_std]
 #![feature(
-    const_ptr_read,
     const_refs_to_cell,
     const_maybe_uninit_as_mut_ptr,
     const_mut_refs,

From 852762563aa890286eda2f668b8af30f8aa84216 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 10 May 2023 05:45:24 -0700
Subject: [PATCH 67/70] Temp fix for swizzle_dyn

- disable the AVX512 variant for now (flaky)
- tell Clippy to knock it off
---
 crates/core_simd/src/swizzle_dyn.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 5c3a2c1824ff4..3eb80d5dca1ff 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -43,8 +43,9 @@ where
                 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                 // Notable absence: avx512bw shuffle
                 // If avx512bw is available, odds of avx512vbmi are good
-                #[cfg(target_feature = "avx512vbmi")]
-                64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                // FIXME: initial AVX512VBMI variant didn't actually pass muster
+                // #[cfg(target_feature = "avx512vbmi")]
+                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
                 _ => {
                     let mut array = [0; N];
                     for (i, k) in idxs.to_array().into_iter().enumerate() {
@@ -67,6 +68,7 @@ where
 #[target_feature(enable = "avx2")]
 #[allow(unused)]
 #[inline]
+#[allow(clippy::let_and_return)]
 unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
     use crate::simd::SimdPartialOrd;
     #[cfg(target_arch = "x86")]

From 4499daac77fa60a9e1ae4956caed263f2d719c0d Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 10 May 2023 18:28:26 -0700
Subject: [PATCH 68/70] Bless tests for portable-simd sync

API changes resulted in subtle MIR and impl differences
---
 .../lower_intrinsics.wrapping.LowerIntrinsics.diff        | 8 ++++----
 tests/ui/fmt/ifmt-unimpl.stderr                           | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/mir-opt/lower_intrinsics.wrapping.LowerIntrinsics.diff b/tests/mir-opt/lower_intrinsics.wrapping.LowerIntrinsics.diff
index 0bfb34acac2db..217f27efe5cc9 100644
--- a/tests/mir-opt/lower_intrinsics.wrapping.LowerIntrinsics.diff
+++ b/tests/mir-opt/lower_intrinsics.wrapping.LowerIntrinsics.diff
@@ -30,10 +30,10 @@
           _4 = _1;                         // scope 0 at $DIR/lower_intrinsics.rs:+1:45: +1:46
           StorageLive(_5);                 // scope 0 at $DIR/lower_intrinsics.rs:+1:48: +1:49
           _5 = _2;                         // scope 0 at $DIR/lower_intrinsics.rs:+1:48: +1:49
--         _3 = wrapping_add::<i32>(move _4, move _5) -> [return: bb1, unwind unreachable]; // scope 0 at $DIR/lower_intrinsics.rs:+1:14: +1:50
+-         _3 = std::intrinsics::wrapping_add::<i32>(move _4, move _5) -> [return: bb1, unwind unreachable]; // scope 0 at $DIR/lower_intrinsics.rs:+1:14: +1:50
 -                                          // mir::Constant
 -                                          // + span: $DIR/lower_intrinsics.rs:9:14: 9:44
--                                          // + literal: Const { ty: extern "rust-intrinsic" fn(i32, i32) -> i32 {wrapping_add::<i32>}, val: Value(<ZST>) }
+-                                          // + literal: Const { ty: extern "rust-intrinsic" fn(i32, i32) -> i32 {std::intrinsics::wrapping_add::<i32>}, val: Value(<ZST>) }
 +         _3 = Add(move _4, move _5);      // scope 0 at $DIR/lower_intrinsics.rs:+1:14: +1:50
 +         goto -> bb1;                     // scope 0 at $DIR/lower_intrinsics.rs:+1:14: +1:50
       }
@@ -46,10 +46,10 @@
           _7 = _1;                         // scope 1 at $DIR/lower_intrinsics.rs:+2:45: +2:46
           StorageLive(_8);                 // scope 1 at $DIR/lower_intrinsics.rs:+2:48: +2:49
           _8 = _2;                         // scope 1 at $DIR/lower_intrinsics.rs:+2:48: +2:49
--         _6 = wrapping_sub::<i32>(move _7, move _8) -> [return: bb2, unwind unreachable]; // scope 1 at $DIR/lower_intrinsics.rs:+2:14: +2:50
+-         _6 = std::intrinsics::wrapping_sub::<i32>(move _7, move _8) -> [return: bb2, unwind unreachable]; // scope 1 at $DIR/lower_intrinsics.rs:+2:14: +2:50
 -                                          // mir::Constant
 -                                          // + span: $DIR/lower_intrinsics.rs:10:14: 10:44
--                                          // + literal: Const { ty: extern "rust-intrinsic" fn(i32, i32) -> i32 {wrapping_sub::<i32>}, val: Value(<ZST>) }
+-                                          // + literal: Const { ty: extern "rust-intrinsic" fn(i32, i32) -> i32 {std::intrinsics::wrapping_sub::<i32>}, val: Value(<ZST>) }
 +         _6 = Sub(move _7, move _8);      // scope 1 at $DIR/lower_intrinsics.rs:+2:14: +2:50
 +         goto -> bb2;                     // scope 1 at $DIR/lower_intrinsics.rs:+2:14: +2:50
       }
diff --git a/tests/ui/fmt/ifmt-unimpl.stderr b/tests/ui/fmt/ifmt-unimpl.stderr
index cc316e55f5cb6..b0dddd3b1e8d0 100644
--- a/tests/ui/fmt/ifmt-unimpl.stderr
+++ b/tests/ui/fmt/ifmt-unimpl.stderr
@@ -15,7 +15,7 @@ LL |     format!("{:X}", "3");
              NonZeroI64
              NonZeroI8
              NonZeroIsize
-           and 21 others
+           and 20 others
    = note: required for `&str` to implement `UpperHex`
 note: required by a bound in `core::fmt::rt::Argument::<'a>::new_upper_hex`
   --> $SRC_DIR/core/src/fmt/rt.rs:LL:COL

From de858e7ea7b60eecb81161e607cde0002d279f4c Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 10 May 2023 21:33:38 -0700
Subject: [PATCH 69/70] miri: Move patterns for simd tests

It isn't clear to me why these error patterns do not trigger,
but I am not going to waste time analyzing bugs in compiletest.
---
 .../tests/fail/intrinsics/simd-float-to-int.rs  |  3 +--
 .../fail/intrinsics/simd-float-to-int.stderr    | 13 ++++---------
 .../miri/tests/fail/intrinsics/simd-gather.rs   |  3 +--
 .../tests/fail/intrinsics/simd-gather.stderr    | 13 ++++---------
 .../miri/tests/fail/intrinsics/simd-scatter.rs  |  3 +--
 .../tests/fail/intrinsics/simd-scatter.stderr   | 17 ++++++-----------
 6 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.rs b/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.rs
index 8905d739078a2..10939c0f1c38a 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.rs
+++ b/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.rs
@@ -1,9 +1,8 @@
-//@error-in-other-file: cannot be represented in target type `i32`
 #![feature(portable_simd)]
 use std::simd::*;
 
 fn main() {
     unsafe {
-        let _x: i32x2 = f32x2::from_array([f32::MAX, f32::MIN]).to_int_unchecked();
+        let _x: i32x2 = f32x2::from_array([f32::MAX, f32::MIN]).to_int_unchecked(); //~ERROR: cannot be represented in target type `i32`
     }
 }
diff --git a/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.stderr b/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.stderr
index 5c73c76a1613d..ea5ad62aea908 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.stderr
+++ b/src/tools/miri/tests/fail/intrinsics/simd-float-to-int.stderr
@@ -1,18 +1,13 @@
 error: Undefined Behavior: `float_to_int_unchecked` intrinsic called on 3.40282347E+38 which cannot be represented in target type `i32`
-  --> RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
+  --> $DIR/simd-float-to-int.rs:LL:CC
    |
-LL |         unsafe { intrinsics::simd_cast(self) }
-   |                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^ `float_to_int_unchecked` intrinsic called on 3.40282347E+38 which cannot be represented in target type `i32`
+LL |         let _x: i32x2 = f32x2::from_array([f32::MAX, f32::MIN]).to_int_unchecked();
+   |                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ `float_to_int_unchecked` intrinsic called on 3.40282347E+38 which cannot be represented in target type `i32`
    |
    = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
    = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
    = note: BACKTRACE:
-   = note: inside `std::simd::Simd::<f32, 2>::to_int_unchecked::<i32>` at RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
-note: inside `main`
-  --> $DIR/simd-float-to-int.rs:LL:CC
-   |
-LL |         let _x: i32x2 = f32x2::from_array([f32::MAX, f32::MIN]).to_int_unchecked();
-   |                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   = note: inside `main` at $DIR/simd-float-to-int.rs:LL:CC
 
 note: some details are omitted, run with `MIRIFLAGS=-Zmiri-backtrace=full` for a verbose backtrace
 
diff --git a/src/tools/miri/tests/fail/intrinsics/simd-gather.rs b/src/tools/miri/tests/fail/intrinsics/simd-gather.rs
index 5dd9bd8a68550..ceb7beebd8aba 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-gather.rs
+++ b/src/tools/miri/tests/fail/intrinsics/simd-gather.rs
@@ -1,4 +1,3 @@
-//@error-in-other-file: pointer to 1 byte starting at offset 9 is out-of-bounds
 #![feature(portable_simd)]
 use std::simd::*;
 
@@ -6,6 +5,6 @@ fn main() {
     unsafe {
         let vec: &[i8] = &[10, 11, 12, 13, 14, 15, 16, 17, 18];
         let idxs = Simd::from_array([9, 3, 0, 17]);
-        let _result = Simd::gather_select_unchecked(&vec, Mask::splat(true), idxs, Simd::splat(0));
+        let _result = Simd::gather_select_unchecked(&vec, Mask::splat(true), idxs, Simd::splat(0)); //~ERROR: pointer to 1 byte starting at offset 9 is out-of-bounds
     }
 }
diff --git a/src/tools/miri/tests/fail/intrinsics/simd-gather.stderr b/src/tools/miri/tests/fail/intrinsics/simd-gather.stderr
index 7512d57f6720e..f82b30a9633ee 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-gather.stderr
+++ b/src/tools/miri/tests/fail/intrinsics/simd-gather.stderr
@@ -1,18 +1,13 @@
 error: Undefined Behavior: dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
-  --> RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
+  --> $DIR/simd-gather.rs:LL:CC
    |
-LL |         unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) }
-   |                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
+LL |         let _result = Simd::gather_select_unchecked(&vec, Mask::splat(true), idxs, Simd::splat(0));
+   |                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
    |
    = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
    = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
    = note: BACKTRACE:
-   = note: inside `std::simd::Simd::<i8, 4>::gather_select_unchecked` at RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
-note: inside `main`
-  --> $DIR/simd-gather.rs:LL:CC
-   |
-LL |         let _result = Simd::gather_select_unchecked(&vec, Mask::splat(true), idxs, Simd::splat(0));
-   |                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   = note: inside `main` at $DIR/simd-gather.rs:LL:CC
 
 note: some details are omitted, run with `MIRIFLAGS=-Zmiri-backtrace=full` for a verbose backtrace
 
diff --git a/src/tools/miri/tests/fail/intrinsics/simd-scatter.rs b/src/tools/miri/tests/fail/intrinsics/simd-scatter.rs
index 0a64478dc74b2..606a6b2798a08 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-scatter.rs
+++ b/src/tools/miri/tests/fail/intrinsics/simd-scatter.rs
@@ -1,4 +1,3 @@
-//@error-in-other-file: pointer to 1 byte starting at offset 9 is out-of-bounds
 #![feature(portable_simd)]
 use std::simd::*;
 
@@ -6,7 +5,7 @@ fn main() {
     unsafe {
         let mut vec: Vec<i8> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
         let idxs = Simd::from_array([9, 3, 0, 17]);
-        Simd::from_array([-27, 82, -41, 124]).scatter_select_unchecked(
+        Simd::from_array([-27, 82, -41, 124]).scatter_select_unchecked( //~ERROR: pointer to 1 byte starting at offset 9 is out-of-bounds
             &mut vec,
             Mask::splat(true),
             idxs,
diff --git a/src/tools/miri/tests/fail/intrinsics/simd-scatter.stderr b/src/tools/miri/tests/fail/intrinsics/simd-scatter.stderr
index a9ad60a0e5be2..6d959af85fa75 100644
--- a/src/tools/miri/tests/fail/intrinsics/simd-scatter.stderr
+++ b/src/tools/miri/tests/fail/intrinsics/simd-scatter.stderr
@@ -1,14 +1,4 @@
 error: Undefined Behavior: dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
-  --> RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
-   |
-LL |             intrinsics::simd_scatter(self, ptrs, enable.to_int())
-   |             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
-   |
-   = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
-   = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
-   = note: BACKTRACE:
-   = note: inside `std::simd::Simd::<i8, 4>::scatter_select_unchecked` at RUSTLIB/core/src/../../portable-simd/crates/core_simd/src/vector.rs:LL:CC
-note: inside `main`
   --> $DIR/simd-scatter.rs:LL:CC
    |
 LL | /         Simd::from_array([-27, 82, -41, 124]).scatter_select_unchecked(
@@ -16,7 +6,12 @@ LL | |             &mut vec,
 LL | |             Mask::splat(true),
 LL | |             idxs,
 LL | |         );
-   | |_________^
+   | |_________^ dereferencing pointer failed: ALLOC has size 9, so pointer to 1 byte starting at offset 9 is out-of-bounds
+   |
+   = help: this indicates a bug in the program: it performed an invalid operation, and caused Undefined Behavior
+   = help: see https://doc.rust-lang.org/nightly/reference/behavior-considered-undefined.html for further information
+   = note: BACKTRACE:
+   = note: inside `main` at $DIR/simd-scatter.rs:LL:CC
 
 note: some details are omitted, run with `MIRIFLAGS=-Zmiri-backtrace=full` for a verbose backtrace
 

From e4cecc1ab73d0422f6a5f63b6a01eb8b1be303ad Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Thu, 11 May 2023 17:22:00 -0700
Subject: [PATCH 70/70] Correct swizzle_dyn cfg for armv7 neon

---
 library/portable-simd/crates/core_simd/src/swizzle_dyn.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs b/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs
index 3eb80d5dca1ff..6065d6459378e 100644
--- a/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs
+++ b/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs
@@ -18,7 +18,7 @@ where
         #![allow(unused_imports, unused_unsafe)]
         #[cfg(target_arch = "aarch64")]
         use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
-        #[cfg(all(target_arch = "arm", target_feature = "v7"))]
+        #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
         use core::arch::arm::{uint8x8_t, vtbl1_u8};
         #[cfg(target_arch = "wasm32")]
         use core::arch::wasm32 as wasm;