Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Add initial arm64 hardware intrinsics support for mini JIT #82420

Merged
merged 15 commits into from
Mar 6, 2023
Merged
18 changes: 15 additions & 3 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,20 @@ arm_encode_imm7 (int imm, int size)
#define arm_format_ldrfp_imm(p, size, opc, rt, rn, pimm, scale) arm_emit ((p), ((size) << 30) | (0xf << 26) | (0x1 << 24) | ((opc) << 22) | (arm_encode_pimm12 ((pimm), (scale)) << 10) | ((rn) << 5) | ((rt) << 0))

/* Load double */
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, dt, xn, simm, 8)
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, (dt), (xn), (simm), 8)
/* Load single */
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, dt, xn, simm, 4)
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, (dt), (xn), (simm), 4)
/* Load 128 bit */
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0, 0x3, qt, xn, simm, 16)
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0x0, 0x3, (qt), (xn), (simm), 16)

/*
 * LDR (literal, SIMD&FP), PC-relative.
 * The opc field is placed at bit 30 (0b00 single, 0b01 double, 0b10 128 bit)
 * and the displacement computed by arm_get_disp19 () at bit 5.
 * The opc constants are unsigned so that shifting 0b10 by 30 does not
 * overflow a signed int.
 */
/* Load single */
#define arm_neon_ldrs_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b00u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load double */
#define arm_neon_ldrd_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b01u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load 128 bit */
#define arm_neon_ldrq_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b10u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/*
 * Patch the instruction word at P in place: bits 5..23 are cleared and
 * replaced with the displacement from P to TARGET, all other bits are kept.
 * P is evaluated more than once, so it must not have side effects.
 */
#define arm_neon_ldrq_lit_fixup(p, target) (*((guint32*)(p)) = (*((guint32*)(p)) & 0xff00001f) | (arm_get_disp19 ((p), (target)) << 5))

/* Arithmetic (immediate) */
static G_GNUC_UNUSED inline guint32
Expand Down Expand Up @@ -1000,6 +1009,9 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define TYPE_F32 0
#define TYPE_F64 1

/*
 * NEON :: move SIMD register
 * Implemented as an ORR of RN with itself into RD, using VREG_FULL.
 */
#define arm_neon_mov(p, rd, rn) arm_neon_orr ((p), VREG_FULL, (rd), (rn), (rn))

/* NEON :: AES */
#define arm_neon_aes_opcode(p, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), VREG_FULL, 0b00001110001010000000100000000000 | (size) << 22 | (opcode) << 12, (rd), (rn))
#define arm_neon_aese(p, rd, rn) arm_neon_aes_opcode ((p), 0b00, 0b00100, (rd), (rn))
Expand Down
6 changes: 6 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ r8const: dest:f len:20
label: len:0
store_membase_imm: dest:b len:20
store_membase_reg: dest:b src1:i len:20
storex_membase: dest:b src1:x len:12
storei1_membase_imm: dest:b len:20
storei1_membase_reg: dest:b src1:i len:12
storei2_membase_imm: dest:b len:20
Expand All @@ -135,6 +136,7 @@ storei1_memindex: dest:b src1:i src2:i len:4
storei2_memindex: dest:b src1:i src2:i len:4
storei4_memindex: dest:b src1:i src2:i len:4
load_membase: dest:i src1:b len:20
loadx_membase: dest:x src1:b len:12
loadi1_membase: dest:i src1:b len:32
loadu1_membase: dest:i src1:b len:32
loadi2_membase: dest:i src1:b len:32
Expand Down Expand Up @@ -493,6 +495,10 @@ atomic_store_i8: dest:b src1:i len:20
atomic_store_u8: dest:b src1:i len:20
atomic_store_r4: dest:b src1:f len:28
atomic_store_r8: dest:b src1:f len:24
xbinop: dest:x src1:x src2:x len:4
xzero: dest:x len:4
xmove: dest:x src1:x len:4
xconst: dest:x len:10

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
153 changes: 151 additions & 2 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,20 @@ emit_strfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_strfpq:
 *
 *   Emit an arm_strfpq store of register RT to the address RN + IMM.
 * If IMM satisfies arm_is_pimm12_scaled (imm, 16) it is used directly as the
 * instruction offset. Otherwise the offset is loaded into IP0 with emit_imm (),
 * RN is added to it, and the store goes through IP0 with a zero offset.
 * Returns CODE as left by the emit helpers.
 */
static WARN_UNUSED_RESULT guint8*
emit_strfpq (guint8 *code, int rt, int rn, int imm)
{
	int base = rn;
	int disp = imm;

	if (!arm_is_pimm12_scaled (imm, 16)) {
		/* IP0 holds the computed address, so it cannot also be the base register */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		base = ARMREG_IP0;
		disp = 0;
	}

	arm_strfpq (code, rt, base, disp);
	return code;
}

static WARN_UNUSED_RESULT guint8*
emit_strx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -717,6 +731,20 @@ emit_ldrfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_ldrfpq:
 *
 *   Emit an arm_ldrfpq (128 bit) load into register RT from the address
 * RN + IMM. If IMM satisfies arm_is_pimm12_scaled (imm, 16) it is used
 * directly as the instruction offset. Otherwise the offset is loaded into IP0
 * with emit_imm (), RN is added to it, and the load goes through IP0 with a
 * zero offset.
 * Returns CODE as left by the emit helpers.
 */
static WARN_UNUSED_RESULT guint8*
emit_ldrfpq (guint8 *code, int rt, int rn, int imm)
{
	int base = rn;
	int disp = imm;

	if (!arm_is_pimm12_scaled (imm, 16)) {
		/* IP0 holds the computed address, so it cannot also be the base register */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		base = ARMREG_IP0;
		disp = 0;
	}

	arm_ldrfpq (code, rt, base, disp);
	return code;
}

guint8*
mono_arm_emit_ldrx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -2209,8 +2237,15 @@ mono_arch_allocate_vars (MonoCompile *cfg)
cfg->ret->dreg = cinfo->ret.reg;
break;
case ArgVtypeInIRegs:
case ArgHFA:
case ArgHFA: {
/* Allocate a local to hold the result, the epilog will copy it to the correct place */
MonoType *ret_type = mini_get_underlying_type (sig->ret);
MonoClass *klass = mono_class_from_mono_type_internal (ret_type);
if (MONO_CLASS_IS_SIMD (cfg, klass)) {
int align_simd = mono_type_size (m_class_get_byval_arg (klass), NULL);
offset = ALIGN_TO (offset, align_simd);
}

cfg->ret->opcode = OP_REGOFFSET;
cfg->ret->inst_basereg = cfg->frame_reg;
cfg->ret->inst_offset = offset;
Expand All @@ -2220,6 +2255,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
else
offset += 16;
break;
}
case ArgVtypeByRef:
/* This variable will be initialized in the prolog from R8 */
cfg->vret_addr->opcode = OP_REGOFFSET;
Expand Down Expand Up @@ -2377,7 +2413,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
ins->opcode = OP_REGOFFSET;
ins->inst_basereg = cfg->frame_reg;
ins->inst_offset = offset + offsets [i];
//printf ("allocated local %d to ", i); mono_print_tree_nl (ins);
//printf ("allocated local %d to ", i); mono_print_ins (ins);
}
}
offset += locals_stack_size;
Expand Down Expand Up @@ -3235,6 +3271,52 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
return code;
}

/*
 * get_vector_size_macro:
 *
 *   Return the VREG_* constant matching the value size of INS->klass:
 * VREG_FULL for 16 bytes, VREG_LOW for 8 bytes. Any other size asserts.
 */
static int
get_vector_size_macro (MonoInst *ins)
{
	const int nbytes = mono_class_value_size (ins->klass, NULL);

	if (nbytes == 16)
		return VREG_FULL;
	if (nbytes == 8)
		return VREG_LOW;

	g_assert_not_reached ();
}

/*
 * get_type_size_macro:
 *
 *   Map an element type to the corresponding TYPE_* constant.
 * Signed and unsigned integers of the same width share a constant; the
 * native-sized integers map to TYPE_I64 or TYPE_I32 depending on
 * TARGET_SIZEOF_VOID_P. Any other type asserts.
 */
static int
get_type_size_macro (MonoTypeEnum type)
{
#if TARGET_SIZEOF_VOID_P == 8
	const int native_int_type = TYPE_I64;
#else
	const int native_int_type = TYPE_I32;
#endif

	switch (type) {
	case MONO_TYPE_I1: case MONO_TYPE_U1:
		return TYPE_I8;
	case MONO_TYPE_I2: case MONO_TYPE_U2:
		return TYPE_I16;
	case MONO_TYPE_I4: case MONO_TYPE_U4:
		return TYPE_I32;
	case MONO_TYPE_I8: case MONO_TYPE_U8:
		return TYPE_I64;
	case MONO_TYPE_I: case MONO_TYPE_U:
		return native_int_type;
	case MONO_TYPE_R4:
		return TYPE_F32;
	case MONO_TYPE_R8:
		return TYPE_F64;
	default:
		g_assert_not_reached ();
	}
}

void
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
{
Expand Down Expand Up @@ -3412,6 +3494,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
case OP_LOADX_MEMBASE:
code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XMOVE:
arm_neon_mov (code, dreg, sreg1);
break;
case OP_XCONST: {
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
}
break;
}

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3484,6 +3589,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_true_bb, MONO_R_ARM64_CBZ);
arm_cbnzx (code, sreg1, 0);
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IADD:
arm_neon_add (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_FADD:
arm_neon_fadd (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down Expand Up @@ -5265,6 +5382,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
size += 32;
exc_throw_found [i] = TRUE;
}
} else if (ji->type == MONO_PATCH_INFO_X128) {
size += 16 + 15; /* sizeof (Vector128<T>) + alignment */
}
}

Expand Down Expand Up @@ -5306,6 +5425,36 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
set_code_cursor (cfg, code);
}

/* Handle relocations with PC relative addressing */
for (ji = cfg->patch_info; ji; ji = ji->next) {
gboolean remove = FALSE;

if (ji->type == MONO_PATCH_INFO_X128) {
guint8 *pos;

code = (guint8*)ALIGN_TO (code, 16);
pos = cfg->native_code + ji->ip.i;
arm_neon_ldrq_lit_fixup (pos, code);
memcpy (code, ji->data.target, 16);
code += 16;

remove = TRUE;
}

if (remove) {
if (ji == cfg->patch_info)
cfg->patch_info = ji->next;
else {
MonoJumpInfo *tmp;

for (tmp = cfg->patch_info; tmp->next != ji; tmp = tmp->next)
;
tmp->next = ji->next;
}
}
set_code_cursor (cfg, code);
}

set_code_cursor (cfg, code);
}

Expand Down
4 changes: 3 additions & 1 deletion src/mono/mono/mini/mini-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#if !defined(DISABLE_SIMD)
#define MONO_ARCH_SIMD_INTRINSICS 1
#define MONO_ARCH_NEED_SIMD_BANK 1
#define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1
#endif
vargaz marked this conversation as resolved.
Show resolved Hide resolved

#define MONO_CONTEXT_SET_LLVM_EXC_REG(ctx, exc) do { (ctx)->regs [0] = (gsize)exc; } while (0)
Expand Down Expand Up @@ -52,7 +54,7 @@
/* v8..v15 */
#define MONO_ARCH_CALLEE_SAVED_FREGS 0xff00

#define MONO_ARCH_CALLEE_SAVED_XREGS 0
#define MONO_ARCH_CALLEE_SAVED_XREGS MONO_ARCH_CALLEE_SAVED_FREGS

#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS

Expand Down
9 changes: 3 additions & 6 deletions src/mono/mono/mini/mini.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,18 +1502,15 @@ mono_allocate_stack_slots (MonoCompile *cfg, gboolean backward, guint32 *stack_s
* Align the size too so the code generated for passing vtypes in
* registers doesn't overwrite random locals.
*/
size = (size + (align - 1)) & ~(align -1);
size = ALIGN_TO (size, align);
}

if (backward) {
offset += size;
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset + size, align);
slot = offset;
}
else {
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset, align);
slot = offset;
fanyang-mono marked this conversation as resolved.
Show resolved Hide resolved
offset += size;
}
Expand Down
8 changes: 0 additions & 8 deletions src/mono/mono/mini/mini.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,15 +303,7 @@ enum {
#define MONO_IS_REAL_MOVE(ins) (((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_FMOVE) || ((ins)->opcode == OP_XMOVE) || ((ins)->opcode == OP_RMOVE))
#define MONO_IS_ZERO(ins) (((ins)->opcode == OP_VZERO) || ((ins)->opcode == OP_XZERO))

#ifdef TARGET_ARM64
/*
* SIMD is only supported on arm64 when using the LLVM backend. When not using
* the LLVM backend, treat SIMD datatypes as regular value types.
*/
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && COMPILE_LLVM (cfg) && m_class_is_simd_type (klass))
#else
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && m_class_is_simd_type (klass) && (COMPILE_LLVM (cfg) || mono_type_size (m_class_get_byval_arg (klass), NULL) == 16))
#endif

#else

Expand Down
Loading