bytecodealliance · alexcrichton · Feb 27, 2023 · Feb 24, 2023
@@ -4489,6 +4489,8 @@
 (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)
 (convert Amode XmmMemAligned amode_to_xmm_mem_aligned)
 (convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
+(convert VCodeConstant SyntheticAmode const_to_synthetic_amode)
+(convert VCodeConstant XmmMem const_to_xmm_mem)
 
 (convert IntCC CC intcc_to_cc)
 (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@@ -4537,6 +4539,8 @@
       (synthetic_amode_to_reg_mem amode))
 (decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode)
 (extern constructor const_to_synthetic_amode const_to_synthetic_amode)
+(decl const_to_xmm_mem (VCodeConstant) XmmMem)
+(rule (const_to_xmm_mem c) (const_to_synthetic_amode c)) ;; the SyntheticAmode result reaches XmmMem via the declared `convert`
 
 (decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
 (rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))

@@ -1908,7 +1908,7 @@
 (rule (lower (has_type $I8X16
                        (popcnt src)))
       (let ((nibble_table_const VCodeConstant (popcount_4bit_table))
-            (low_mask Xmm (x64_xmm_load_const $I8X16 (popcount_low_mask)))
+            (low_mask XmmMem (popcount_low_mask))
             (low_nibbles Xmm (sse_and $I8X16 src low_mask))
             ;; Note that this is a 16x8 shift, but that's OK; we mask
             ;; off anything that traverses from one byte to the next
@@ -2990,9 +2990,9 @@
 ;; every value of the mantissa represents a corresponding uint32 number.
 ;; When we subtract 0x1.0p52 we are left with double(src).
 (rule 1 (lower (has_type $F64X2 (fcvt_from_uint (uwiden_low val @ (value_type $I32X4)))))
-      (let ((uint_mask Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_const)))
+      (let ((uint_mask XmmMem (fcvt_uint_mask_const)) ;; constant bound as XmmMem; no explicit x64_xmm_load_const
             (res Xmm (x64_unpcklps val uint_mask))
-            (uint_mask_high Xmm (x64_xmm_load_const $I32X4 (fcvt_uint_mask_high_const))))
+            (uint_mask_high XmmMem (fcvt_uint_mask_high_const))) ;; likewise for the value subtracted by subpd
         (x64_subpd res uint_mask_high)))
 
 ;; When AVX512VL and AVX512F are available,
@@ -3190,27 +3190,27 @@
         (has_type $I32X4 (iadd_pairwise
                            (swiden_low val @ (value_type $I16X8))
                            (swiden_high val))))
-      (let ((mul_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32))))
+      (let ((mul_const XmmMem (iadd_pairwise_mul_const_32)))
         (x64_pmaddwd val mul_const)))
 
 (rule (lower
         (has_type $I16X8 (iadd_pairwise
                            (uwiden_low val @ (value_type $I8X16))
                            (uwiden_high val))))
-      (let ((mul_const Xmm (x64_xmm_load_const $I8X16 (iadd_pairwise_mul_const_16))))
+      (let ((mul_const XmmMem (iadd_pairwise_mul_const_16)))
         (x64_pmaddubsw val mul_const)))
 
 (rule (lower
         (has_type $I32X4 (iadd_pairwise
                            (uwiden_low val @ (value_type $I16X8))
                            (uwiden_high val))))
-      (let ((xor_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_xor_const_32)))
+      (let ((xor_const XmmMem (iadd_pairwise_xor_const_32))
             (dst Xmm (x64_pxor val xor_const))
 
-            (madd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_mul_const_32)))
+            (madd_const XmmMem (iadd_pairwise_mul_const_32))
             (dst Xmm (x64_pmaddwd dst madd_const))
 
-            (addd_const Xmm (x64_xmm_load_const $I16X8 (iadd_pairwise_addd_const_32))))
+            (addd_const XmmMem (iadd_pairwise_addd_const_32)))
         (x64_paddd dst addd_const)))
 
 ;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3288,7 +3288,7 @@
             ;; CVTTPD2DQ xmm_y, xmm_y
 
             (tmp1 Xmm (x64_cmppd a a (FcmpImm.Equal)))
-            (umax_mask Xmm (x64_xmm_load_const $F64X2 (snarrow_umax_mask)))
+            (umax_mask XmmMem (snarrow_umax_mask))
 
             ;; ANDPD xmm_y, [wasm_f64x2_splat(2147483647.0)]
             (tmp1 Xmm (x64_andps tmp1 umax_mask))
@@ -3504,17 +3504,15 @@
 ;; indices (may not be completely necessary: verification could fail incorrect
 ;; mask values) and fix the indexes to all point to the `dst` vector.
 (rule 3 (lower (shuffle a a (vec_mask_from_immediate mask)))
-      (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_31_mask mask))))
+      (x64_pshufb a (shuffle_0_31_mask mask)))
 
 ;; For the case where the shuffle mask contains out-of-bounds values (values
 ;; greater than 31) we must mask off those resulting values in the result of
 ;; `vpermi2b`.
 (rule 2 (lower (has_type (and (avx512vl_enabled $true) (avx512vbmi_enabled $true))
                          (shuffle a b (vec_mask_from_immediate
                                         (perm_from_mask_with_zeros mask zeros)))))
-      (x64_andps
-        (x64_xmm_load_const $I8X16 zeros)
-        (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask))))
+      (x64_andps (x64_vpermi2b b a (x64_xmm_load_const $I8X16 mask)) zeros)) ;; `zeros` is now the second operand, passed without an explicit load
 
 ;; However, if the shuffle mask contains no out-of-bounds values, we can use
 ;; `vpermi2b` without any masking.
@@ -3527,8 +3525,8 @@
 ;; above, we build the `constructed_mask` for each case statically.
 (rule (lower (shuffle a b (vec_mask_from_immediate mask)))
       (x64_por
-        (x64_pshufb a (x64_xmm_load_const $I8X16 (shuffle_0_15_mask mask)))
-        (x64_pshufb b (x64_xmm_load_const $I8X16 (shuffle_16_31_mask mask)))))
+        (x64_pshufb a (shuffle_0_15_mask mask))
+        (x64_pshufb b (shuffle_16_31_mask mask))))
 
 ;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
@@ -3539,9 +3537,7 @@
 ;; Wasm SIMD semantics for this instruction. The instruction format maps to
 ;; variables like: %dst = swizzle %src, %mask
 (rule (lower (swizzle src mask))
-      (let ((mask Xmm (x64_paddusb
-                        mask
-                        (x64_xmm_load_const $I8X16 (swizzle_zero_mask)))))
+      (let ((mask Xmm (x64_paddusb mask (swizzle_zero_mask)))) ;; rebinds `mask`; the constant is passed directly, with no x64_xmm_load_const
         (x64_pshufb src mask)))
 
 ;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3716,9 +3712,9 @@
       (let ((src1 Xmm qx)
             (src2 Xmm qy)
 
-            (mask Xmm (x64_xmm_load_const $I16X8 (sqmul_round_sat_mask)))
+            (mask XmmMem (sqmul_round_sat_mask))
             (dst Xmm (x64_pmulhrsw src1 src2))
-            (cmp Xmm (x64_pcmpeqw mask dst)))
+            (cmp Xmm (x64_pcmpeqw dst mask)))
         (x64_pxor dst cmp)))
 
 ;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3744,7 +3740,7 @@
             (zeros Xmm (xmm_zero $F64X2))
             (dst Xmm (x64_maxpd src zeros))
 
-            (umax_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_umax_mask)))
+            (umax_mask XmmMem (uunarrow_umax_mask))
 
             ;; MINPD xmm_y, [wasm_f64x2_splat(4294967295.0)]
             (dst Xmm (x64_minpd dst umax_mask))
@@ -3753,7 +3749,7 @@
             (dst Xmm (x64_roundpd dst (RoundImm.RoundZero)))
 
             ;; ADDPD xmm_y, [wasm_f64x2_splat(0x1.0p+52)]
-            (uint_mask Xmm (x64_xmm_load_const $F64X2 (uunarrow_uint_mask)))
+            (uint_mask XmmMem (uunarrow_uint_mask))
             (dst Xmm (x64_addpd dst uint_mask)))
 
         ;; SHUFPS xmm_y, xmm_tmp, 0x88

@@ -304,10 +304,8 @@ block0(v0: i32x4):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqu  const(0), %xmm2
-;   unpcklps %xmm0, %xmm2, %xmm0
-;   movdqu  const(1), %xmm6
-;   subpd   %xmm0, %xmm6, %xmm0
+;   unpcklps %xmm0, const(0), %xmm0
+;   subpd   %xmm0, const(1), %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -317,14 +315,16 @@ block0(v0: i32x4):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqu 0x14(%rip), %xmm2
-;   unpcklps %xmm2, %xmm0
-;   movdqu 0x19(%rip), %xmm6
-;   subpd %xmm6, %xmm0
+;   unpcklps 0x15(%rip), %xmm0
+;   subpd 0x1d(%rip), %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 ;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
 ;   xorb %al, (%rbx)
 ;   addb %dh, (%rax)
 ;   addb %al, (%r8)

@@ -566,10 +566,9 @@ block0(v0: f64x2):
 ;   movq    %rsp, %rbp
 ; block0:
 ;   vcmppd  $0 %xmm0, %xmm0, %xmm2
-;   movupd  const(0), %xmm4
-;   vandps  %xmm2, %xmm4, %xmm6
-;   vminpd  %xmm0, %xmm6, %xmm8
-;   vcvttpd2dq %xmm8, %xmm0
+;   vandps  %xmm2, const(0), %xmm4
+;   vminpd  %xmm0, %xmm4, %xmm6
+;   vcvttpd2dq %xmm6, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -580,19 +579,13 @@ block0(v0: f64x2):
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
 ;   vcmpeqpd %xmm0, %xmm0, %xmm2
-;   movupd 0x1f(%rip), %xmm4
-;   vandps %xmm4, %xmm2, %xmm6
-;   vminpd %xmm6, %xmm0, %xmm8
-;   vcvttpd2dq %xmm8, %xmm0
+;   vandps 0xf(%rip), %xmm2, %xmm4
+;   vminpd %xmm4, %xmm0, %xmm6
+;   vcvttpd2dq %xmm6, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 ;   addb %al, (%rax)
 ;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, (%rax)
-;   addb %al, %al
+;   sarb $0xff, %bh
 
@@ -63,13 +63,12 @@ block0(v0: f64x2):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm4
-;   cmppd   $0, %xmm4, %xmm0, %xmm4
-;   movupd  const(0), %xmm5
-;   andps   %xmm4, %xmm5, %xmm4
-;   movdqa  %xmm0, %xmm8
-;   minpd   %xmm8, %xmm4, %xmm8
-;   cvttpd2dq %xmm8, %xmm0
+;   movdqa  %xmm0, %xmm3
+;   cmppd   $0, %xmm3, %xmm0, %xmm3
+;   andps   %xmm3, const(0), %xmm3
+;   movdqa  %xmm0, %xmm6
+;   minpd   %xmm6, %xmm3, %xmm6
+;   cvttpd2dq %xmm6, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -79,20 +78,22 @@ block0(v0: f64x2):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm4
-;   cmpeqpd %xmm0, %xmm4
-;   movupd 0x1b(%rip), %xmm5
-;   andps %xmm5, %xmm4
-;   movdqa %xmm0, %xmm8
-;   minpd %xmm4, %xmm8
-;   cvttpd2dq %xmm8, %xmm0
+;   movdqa %xmm0, %xmm3
+;   cmpeqpd %xmm0, %xmm3
+;   andps 0x1c(%rip), %xmm3
+;   movdqa %xmm0, %xmm6
+;   minpd %xmm3, %xmm6
+;   cvttpd2dq %xmm6, %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
 ;   addb %al, (%rax)
 ;   addb %al, (%rax)
 ;   addb %al, (%rax)
-;   sarb $0xff, %bh
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, %al
 
 function %f4(i16x8, i16x8) -> i8x16 {
 block0(v0: i16x8, v1: i16x8):

@@ -55,12 +55,11 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
-;   movdqa  %xmm0, %xmm7
-;   movdqu  const(1), %xmm0
-;   movdqu  const(0), %xmm6
-;   movdqa  %xmm7, %xmm9
-;   vpermi2b %xmm1, %xmm9, %xmm6, %xmm6
-;   andps   %xmm0, %xmm6, %xmm0
+;   movdqa  %xmm0, %xmm6
+;   movdqu  const(0), %xmm0
+;   movdqa  %xmm6, %xmm7
+;   vpermi2b %xmm1, %xmm7, %xmm0, %xmm0
+;   andps   %xmm0, const(1), %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -70,12 +69,11 @@ block0(v0: i8x16, v1: i8x16):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
-;   movdqa %xmm0, %xmm7
-;   movdqu 0x30(%rip), %xmm0
-;   movdqu 0x18(%rip), %xmm6
-;   movdqa %xmm7, %xmm9
-;   vpermi2b %xmm1, %xmm9, %xmm6
-;   andps %xmm6, %xmm0
+;   movdqa %xmm0, %xmm6
+;   movdqu 0x20(%rip), %xmm0
+;   movdqa %xmm6, %xmm7
+;   vpermi2b %xmm1, %xmm7, %xmm0
+;   andps 0x1f(%rip), %xmm0
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq
@@ -89,7 +87,9 @@ block0(v0: i8x16, v1: i8x16):
 ;   addb %al, (%rax)
 ;   addb %al, (%rax)
 ;   addb %al, (%rax)
-;   cmpb $0xff, %bh
+;   addb %al, (%rax)
+;   addb %al, (%rax)
+;   addb %al, -1(%rax)
 
 function %f3(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):