dotnet · jandupej · Mar 21, 2023 · Mar 20, 2023 · Mar 21, 2023 · Mar 21, 2023
@@ -1122,15 +1122,12 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_ins_g(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, (((index) << 1) | 0b1) << (type), 0b0011, (rd), (rn))
 #define arm_neon_ins_e(p, type, rd, rn, indexd, indexs) arm_neon_cpy_opcode ((p), 0b1, 0b1, (((indexd) << 1) | 0b1) << (type), (indexs) << (type), (rd), (rn))
 
-// Specific opcodes:
-#define arm_neon_dup_e_8b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_16b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_4h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_8h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_2s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_4s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn)) 
-#define arm_neon_dup_e_2d(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 4), 0b0000, (rd), (rn)) 
+#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I32) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0101, (rd), (rn))
+#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0111, (rd), (rn))
+#define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn)) 
+#define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index))
 
+// Specific opcodes:
 #define arm_neon_dup_g_8b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001, 0b0001, (rd), (rn)) 
 #define arm_neon_dup_g_16b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001, 0b0001, (rd), (rn)) 
 #define arm_neon_dup_g_4h(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010, 0b0001, (rd), (rn)) 
@@ -1141,15 +1138,13 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 
 #define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn))
 #define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn))
-#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn))
-#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn))
+#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn))
 
 #define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn))
 #define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn))
 #define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn))
 #define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0111, (rd), (rn))
 
-
 /* NEON :: 3-register same FP16 */
 // TODO
 
@@ -1572,6 +1567,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 
 /* NEON :: across lanes */
 #define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn))
+#define arm_neon_addv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b0, (type), 0b11011, (rd), (rn))
 
 #define arm_neon_umaxv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn))
 #define arm_neon_uminv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn))
@@ -1595,12 +1591,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_sminv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11010, (rd), (rn))
 #define arm_neon_sminv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11010, (rd), (rn))
 
-#define arm_neon_addv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_addv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_addv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11011, (rd), (rn))
-#define arm_neon_addv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11011, (rd), (rn))
-#define arm_neon_addv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11011, (rd), (rn))
-
 // some fp16 opcodes here: fmaxnmv, fmaxv, fminnmv, fminv
 
 #define arm_neon_uaddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b00011, (rd), (rn))
@@ -1809,6 +1799,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm))
 #define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm))
 #define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm))
+#define arm_neon_addp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b10111, (rd), (rn), (rm))
 
 // Generalized macros for float ops:
 //   width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
@@ -1822,6 +1813,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b11100, (rd), (rn), (rm))
 #define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm))
 #define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm))
+#define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm))
 
 // Generalized macros for bitwise ops:
 //	width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
@@ -1991,14 +1983,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_sqdmulh_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm))
 #define arm_neon_sqdmulh_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm))
 
-#define arm_neon_addp_8b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_16b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_4h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_8h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm))
-#define arm_neon_addp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b10111, (rd), (rn), (rm))
-
 #define arm_neon_fmaxnm_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm))
 #define arm_neon_fmaxnm_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm))
 #define arm_neon_fmaxnm_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn), (rm))
@@ -2234,10 +2218,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fmaxnmp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11000, (rd), (rn), (rm))
 #define arm_neon_fmaxnmp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11000, (rd), (rn), (rm))
 
-#define arm_neon_faddp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm))
-#define arm_neon_faddp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm))
-#define arm_neon_faddp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn), (rm))
-
 #define arm_neon_fmul_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm))
 #define arm_neon_fmul_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm))
 #define arm_neon_fmul_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11011, (rd), (rn), (rm))

@@ -477,6 +477,11 @@ main (int argc, char *argv [])
 	arm_neon_ins_e (code, TYPE_I8, ARMREG_R0, ARMREG_R1, 1, 5); // insert v1.b[5] into v0.b[1]
 	arm_neon_ins_e (code, TYPE_I32, ARMREG_R0, ARMREG_R1, 1, 2); // insert v1.s[2] into v0.s[1]
 
+	// pairwise and horizontal adds
+	arm_neon_addv (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1);
+	arm_neon_addp (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
+	arm_neon_faddp (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2);
+
 	for (i = 0; i < code - buf; ++i)
 		printf (".byte %d\n", buf [i]);
 	printf ("\n");

@@ -506,6 +506,13 @@ ones_complement: dest:x src1:x len:4
 xextract: dest:i src1:x len:12
 xbinop_forceint: dest:x src1:x src2:x len:4
 xcast: dest:x src1:x len:4 clob:1
+extract_i1: dest:i src1:x len:4
+extract_i2: dest:i src1:x len:4
+extract_i4: dest:i src1:x len:4
+extract_i8: dest:i src1:x len:4
+extract_r4: dest:f src1:x len:4
+extract_r8: dest:f src1:x len:4
+arm64_xaddv: dest:x src1:x len:8
 
 generic_class_init: src1:a len:44 clob:c
 gc_safe_point: src1:i len:12 clob:c

@@ -27,6 +27,7 @@
 #include <mono/utils/mono-memory-model.h>
 #include <mono/metadata/abi-details.h>
 #include <mono/metadata/tokentype.h>
+#include "llvm-intrinsics-types.h"
 
 #include "interp/interp.h"
 
@@ -35,6 +36,7 @@
 #define PARENTHESIZE(...) (__VA_ARGS__)
 #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__))
 #define OPFMT_WDSS _w, dreg, sreg1, sreg2
+#define OPFMT_WTDS _w, _t, dreg, sreg1
 #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2
 #define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1
 #define _UNDEF(...) g_assert_not_reached ()
@@ -3466,6 +3468,12 @@ is_type_float_macro (MonoTypeEnum type)
 	return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8); 
 }
 
+static gboolean
+is_type_unsigned_macro (MonoTypeEnum type)
+{
+	return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8);
+}
+
 static int
 get_vector_size_macro (MonoInst *ins)
 {
@@ -3736,6 +3744,56 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		case OP_XCAST:
 			break;
 
+		case OP_EXTRACT_I1:
+		case OP_EXTRACT_I2:
+		case OP_EXTRACT_I4:
+		case OP_EXTRACT_I8: {
+			const int t = get_type_size_macro (ins->inst_c1);
+			// smov is not defined for i64
+			if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
+				arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
+			} else {
+				arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
+			}
+			break;
+		}
+		case OP_EXTRACT_R4:
+		case OP_EXTRACT_R8:
+			if (ins->dreg != ins->sreg1 || ins->inst_c0 != 0) {
+				const int t = get_type_size_macro (ins->inst_c1);
+				// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
+				// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
+				// of the F/XREG is ignored in FREG mode, this operation remains valid.
+				arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
+			}
+			break;
+		case OP_ARM64_XADDV: {
+			switch (ins->inst_c0) {
+			case INTRINS_AARCH64_ADV_SIMD_FADDV:
+				if (ins->inst_c1 == MONO_TYPE_R8) {
+					arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
+				} else if (ins->inst_c1 == MONO_TYPE_R4) {
+					arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
+					arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
+				} else {
+					g_assert_not_reached ();
+				} 
+				break;
+
+			case INTRINS_AARCH64_ADV_SIMD_UADDV:
+			case INTRINS_AARCH64_ADV_SIMD_SADDV: 
+				if (get_type_size_macro (ins->inst_c1) == TYPE_I64) 
+					arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
+				else
+					g_assert_not_reached (); // remaining int types are handled through the codegen table
+				break;
+
+			default:
+				g_assert_not_reached ();
+			}
+			break;
+		}
+
 			/* BRANCH */
 		case OP_BR:
 			mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb, MONO_R_ARM64_B);

@@ -62,3 +62,6 @@ SIMD_OP  (128, OP_XBINOP,      OP_FMIN,              WTDSS,              _UNDEF,
 SIMD_OP  (128, OP_XBINOP_FORCEINT,    XBINOP_FORCEINT_AND,    WDSS,      arm_neon_and,     arm_neon_and,     arm_neon_and,    arm_neon_and,      arm_neon_and,     arm_neon_and)
 SIMD_OP  (128, OP_XBINOP_FORCEINT,    XBINOP_FORCEINT_OR,     WDSS,      arm_neon_orr,     arm_neon_orr,     arm_neon_orr,    arm_neon_orr,      arm_neon_orr,     arm_neon_orr)
 SIMD_OP  (128, OP_XBINOP_FORCEINT,    XBINOP_FORCEINT_XOR,    WDSS,      arm_neon_eor,     arm_neon_eor,     arm_neon_eor,    arm_neon_eor,      arm_neon_eor,     arm_neon_eor)
+SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_UADDV, WTDS,     arm_neon_addv,    arm_neon_addv,    arm_neon_addv,   _SKIP,             _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_SADDV, WTDS,     arm_neon_addv,    arm_neon_addv,    arm_neon_addv,   _SKIP,             _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS,     _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            _SKIP,            _SKIP)
@@ -620,15 +620,23 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t
 		return ins;
 	}
 
-	MonoInst *ins = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1);
-
+	MonoInst *sum = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1);
 	if (type_enum_is_float (element_type)) {
-		ins->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV;
+		sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV;
+		sum->inst_c1 = element_type;
 	} else {
-		ins->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV;
+		sum->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV;
+		sum->inst_c1 = element_type;
 	}
 
-	return ins;
+	if (COMPILE_LLVM (cfg)) {
+		return sum;
+	} else {
+		MonoInst *ins = emit_simd_ins (cfg, vector_class, type_to_extract_op (element_type), sum->dreg, -1);
+		ins->inst_c0 = 0;
+		ins->inst_c1 = element_type;
+		return ins;
+	}
 }
 #endif
 #ifdef TARGET_WASM
@@ -1250,6 +1258,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		case SN_AsUInt64:
 		case SN_Max:
 		case SN_Min:
+		case SN_Sum:
+		case SN_ToScalar:
 			break;
 		default: 
 			return NULL;