Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mono][jit] Add vector horizontal sums and ToScalar on arm64. #83675

Merged
merged 5 commits into from
Mar 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 9 additions & 29 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1122,15 +1122,12 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_ins_g(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, (((index) << 1) | 0b1) << (type), 0b0011, (rd), (rn))
#define arm_neon_ins_e(p, type, rd, rn, indexd, indexs) arm_neon_cpy_opcode ((p), 0b1, 0b1, (((indexd) << 1) | 0b1) << (type), (indexs) << (type), (rd), (rn))

// Specific opcodes:
#define arm_neon_dup_e_8b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn))
#define arm_neon_dup_e_16b(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001 | ((index) << 1), 0b0000, (rd), (rn))
#define arm_neon_dup_e_4h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn))
#define arm_neon_dup_e_8h(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00010 | ((index) << 2), 0b0000, (rd), (rn))
#define arm_neon_dup_e_2s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn))
#define arm_neon_dup_e_4s(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 3), 0b0000, (rd), (rn))
#define arm_neon_dup_e_2d(p, rd, rn, index) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00100 | ((index) << 4), 0b0000, (rd), (rn))
#define arm_neon_smov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I32) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0101, (rd), (rn))
#define arm_neon_umov(p, type, rd, rn, index) arm_neon_cpy_opcode ((p), (type == TYPE_I64) ? 0b1 : 0b0, 0b0, (0b00001 << (type)) | ((index) << ((type) + 1)), 0b0111, (rd), (rn))
#define arm_neon_dup_e(p, width, type, rd, rn, index) arm_neon_cpy_opcode ((p), (width), 0b0, (0b00001 << (type)) | ((index) << ((type)+1)), 0b0000, (rd), (rn))
#define arm_neon_fdup_e(p, width, type, rd, rn, index) arm_neon_dup_e ((p), (width), (type) + TYPE_I32, (rd), (rn), (index))

// Specific opcodes:
#define arm_neon_dup_g_8b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00001, 0b0001, (rd), (rn))
#define arm_neon_dup_g_16b(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_FULL, 0b0, 0b00001, 0b0001, (rd), (rn))
#define arm_neon_dup_g_4h(p, rd, rn) arm_neon_cpy_opcode ((p), VREG_LOW, 0b0, 0b00010, 0b0001, (rd), (rn))
Expand All @@ -1141,15 +1138,13 @@ arm_encode_arith_imm (int imm, guint32 *shift)

#define arm_neon_smov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0101, (rd), (rn))
#define arm_neon_smov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0101, (rd), (rn))
#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn))
#define arm_neon_smov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0101, (rd), (rn))
#define arm_neon_smov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b00100 | ((index) << 3), 0b0101, (rd), (rn))

#define arm_neon_umov_b(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00001 | ((index) << 1), 0b0111, (rd), (rn))
#define arm_neon_umov_h(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00010 | ((index) << 2), 0b0111, (rd), (rn))
#define arm_neon_umov_s(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b0, 0b0, 0b00100 | ((index) << 3), 0b0111, (rd), (rn))
#define arm_neon_umov_d(p, rd, rn, index) arm_neon_cpy_opcode ((p), 0b1, 0b0, 0b01000 | ((index) << 4), 0b0111, (rd), (rn))


/* NEON :: 3-register same FP16 */
// TODO

Expand Down Expand Up @@ -1572,6 +1567,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)

/* NEON :: across lanes */
#define arm_neon_xln_opcode(p, q, u, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110001100000000100000000000 | (u) << 29 | (size) << 22 | (opcode) << 12, (rd), (rn))
#define arm_neon_addv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b0, (type), 0b11011, (rd), (rn))

#define arm_neon_umaxv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn))
#define arm_neon_uminv(p, width, type, rd, rn) arm_neon_xln_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn))
Expand All @@ -1595,12 +1591,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_sminv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11010, (rd), (rn))
#define arm_neon_sminv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11010, (rd), (rn))

#define arm_neon_addv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11011, (rd), (rn))
#define arm_neon_addv_16b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11011, (rd), (rn))
#define arm_neon_addv_4h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11011, (rd), (rn))
#define arm_neon_addv_8h(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11011, (rd), (rn))
#define arm_neon_addv_4s(p, rd, rn) arm_neon_xln_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b11011, (rd), (rn))

// some fp16 opcodes here: fmaxnmv, fmaxv, fminnmv, fminv

#define arm_neon_uaddlv_8b(p, rd, rn) arm_neon_xln_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b00011, (rd), (rn))
Expand Down Expand Up @@ -1809,6 +1799,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_cmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b10001, (rd), (rn), (rm))
#define arm_neon_cmhi(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00110, (rd), (rn), (rm))
#define arm_neon_cmhs(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b00111, (rd), (rn), (rm))
#define arm_neon_addp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b10111, (rd), (rn), (rm))

// Generalized macros for float ops:
// width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
Expand All @@ -1822,6 +1813,7 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_fcmeq(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b0, (type), 0b11100, (rd), (rn), (rm))
#define arm_neon_fcmge(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11100, (rd), (rn), (rm))
#define arm_neon_fcmgt(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11100, (rd), (rn), (rm))
#define arm_neon_faddp(p, width, type, rd, rn, rm) arm_neon_3svec_opcode ((p), (width), 0b1, (type), 0b11010, (rd), (rn), (rm))

// Generalized macros for bitwise ops:
// width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
Expand Down Expand Up @@ -1991,14 +1983,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_sqdmulh_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm))
#define arm_neon_sqdmulh_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10110, (rd), (rn), (rm))

#define arm_neon_addp_8b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_16b(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_4h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_8h(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10111, (rd), (rn), (rm))
#define arm_neon_addp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b10111, (rd), (rn), (rm))

#define arm_neon_fmaxnm_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm))
#define arm_neon_fmaxnm_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn), (rm))
#define arm_neon_fmaxnm_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn), (rm))
Expand Down Expand Up @@ -2234,10 +2218,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define arm_neon_fmaxnmp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11000, (rd), (rn), (rm))
#define arm_neon_fmaxnmp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11000, (rd), (rn), (rm))

#define arm_neon_faddp_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm))
#define arm_neon_faddp_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn), (rm))
#define arm_neon_faddp_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn), (rm))

#define arm_neon_fmul_2s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm))
#define arm_neon_fmul_4s(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11011, (rd), (rn), (rm))
#define arm_neon_fmul_2d(p, rd, rn, rm) arm_neon_3svec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11011, (rd), (rn), (rm))
Expand Down
5 changes: 5 additions & 0 deletions src/mono/mono/arch/arm64/codegen-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,11 @@ main (int argc, char *argv [])
arm_neon_ins_e (code, TYPE_I8, ARMREG_R0, ARMREG_R1, 1, 5); // insert v1.b[5] into v0.b[1]
arm_neon_ins_e (code, TYPE_I32, ARMREG_R0, ARMREG_R1, 1, 2); // insert v1.s[2] into v0.s[1]

// pairwise and horizontal adds
arm_neon_addv (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1);
arm_neon_addp (code, VREG_FULL, TYPE_I8, ARMREG_R0, ARMREG_R1, ARMREG_R2);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ARMREG_R0, ARMREG_R1, ARMREG_R2);

for (i = 0; i < code - buf; ++i)
printf (".byte %d\n", buf [i]);
printf ("\n");
Expand Down
7 changes: 7 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,13 @@ ones_complement: dest:x src1:x len:4
xextract: dest:i src1:x len:12
xbinop_forceint: dest:x src1:x src2:x len:4
xcast: dest:x src1:x len:4 clob:1
extract_i1: dest:i src1:x len:4
extract_i2: dest:i src1:x len:4
extract_i4: dest:i src1:x len:4
extract_i8: dest:i src1:x len:4
extract_r4: dest:f src1:x len:4
extract_r8: dest:f src1:x len:4
arm64_xaddv: dest:x src1:x len:8

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
58 changes: 58 additions & 0 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <mono/utils/mono-memory-model.h>
#include <mono/metadata/abi-details.h>
#include <mono/metadata/tokentype.h>
#include "llvm-intrinsics-types.h"

#include "interp/interp.h"

Expand All @@ -35,6 +36,7 @@
#define PARENTHESIZE(...) (__VA_ARGS__)
#define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__))
#define OPFMT_WDSS _w, dreg, sreg1, sreg2
#define OPFMT_WTDS _w, _t, dreg, sreg1
#define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2
#define OPFMT_WTDSS_REV _w, _t, dreg, sreg2, sreg1
#define _UNDEF(...) g_assert_not_reached ()
Expand Down Expand Up @@ -3466,6 +3468,12 @@ is_type_float_macro (MonoTypeEnum type)
return (type == MONO_TYPE_R4 || type == MONO_TYPE_R8);
}

static gboolean
is_type_unsigned_macro (MonoTypeEnum type)
{
return (type == MONO_TYPE_U1 || type == MONO_TYPE_U2 || type == MONO_TYPE_U4 || type == MONO_TYPE_U8);
}

static int
get_vector_size_macro (MonoInst *ins)
{
Expand Down Expand Up @@ -3736,6 +3744,56 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_XCAST:
break;

case OP_EXTRACT_I1:
case OP_EXTRACT_I2:
case OP_EXTRACT_I4:
case OP_EXTRACT_I8: {
const int t = get_type_size_macro (ins->inst_c1);
// smov is not defined for i64
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
} else {
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
}
break;
}
case OP_EXTRACT_R4:
case OP_EXTRACT_R8:
if (ins->dreg != ins->sreg1 || ins->inst_c0 != 0) {
const int t = get_type_size_macro (ins->inst_c1);
// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
// of the F/XREG is ignored in FREG mode, this operation remains valid.
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
}
break;
case OP_ARM64_XADDV: {
switch (ins->inst_c0) {
case INTRINS_AARCH64_ADV_SIMD_FADDV:
if (ins->inst_c1 == MONO_TYPE_R8) {
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
} else if (ins->inst_c1 == MONO_TYPE_R4) {
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
} else {
g_assert_not_reached ();
}
break;

case INTRINS_AARCH64_ADV_SIMD_UADDV:
case INTRINS_AARCH64_ADV_SIMD_SADDV:
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
else
g_assert_not_reached (); // remaining int types are handled through the codegen table
break;

default:
g_assert_not_reached ();
}
break;
}

/* BRANCH */
case OP_BR:
mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_target_bb, MONO_R_ARM64_B);
Expand Down
3 changes: 3 additions & 0 deletions src/mono/mono/mini/simd-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,6 @@ SIMD_OP (128, OP_XBINOP, OP_FMIN, WTDSS, _UNDEF,
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_AND, WDSS, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and, arm_neon_and)
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_OR, WDSS, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr, arm_neon_orr)
SIMD_OP (128, OP_XBINOP_FORCEINT, XBINOP_FORCEINT_XOR, WDSS, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor, arm_neon_eor)
SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_UADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF)
SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_SADDV, WTDS, arm_neon_addv, arm_neon_addv, arm_neon_addv, _SKIP, _UNDEF, _UNDEF)
SIMD_OP (128, OP_ARM64_XADDV, INTRINS_AARCH64_ADV_SIMD_FADDV, WTDS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, _SKIP, _SKIP)
20 changes: 15 additions & 5 deletions src/mono/mono/mini/simd-intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -620,15 +620,23 @@ emit_sum_vector (MonoCompile *cfg, MonoType *vector_type, MonoTypeEnum element_t
return ins;
}

MonoInst *ins = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1);

MonoInst *sum = emit_simd_ins (cfg, vector_class, OP_ARM64_XADDV, arg->dreg, -1);
if (type_enum_is_float (element_type)) {
ins->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV;
sum->inst_c0 = INTRINS_AARCH64_ADV_SIMD_FADDV;
sum->inst_c1 = element_type;
} else {
ins->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV;
sum->inst_c0 = type_enum_is_unsigned (element_type) ? INTRINS_AARCH64_ADV_SIMD_UADDV : INTRINS_AARCH64_ADV_SIMD_SADDV;
sum->inst_c1 = element_type;
}

return ins;
if (COMPILE_LLVM (cfg)) {
return sum;
} else {
MonoInst *ins = emit_simd_ins (cfg, vector_class, type_to_extract_op (element_type), sum->dreg, -1);
ins->inst_c0 = 0;
ins->inst_c1 = element_type;
return ins;
}
}
#endif
#ifdef TARGET_WASM
Expand Down Expand Up @@ -1250,6 +1258,8 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
case SN_AsUInt64:
case SN_Max:
case SN_Min:
case SN_Sum:
case SN_ToScalar:
break;
default:
return NULL;
Expand Down