Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Add initial arm64 hardware intrinsics support for mini JIT #82420

Merged
merged 15 commits into from
Mar 6, 2023
Merged
14 changes: 11 additions & 3 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,19 @@ arm_encode_imm7 (int imm, int size)
#define arm_format_ldrfp_imm(p, size, opc, rt, rn, pimm, scale) arm_emit ((p), ((size) << 30) | (0xf << 26) | (0x1 << 24) | ((opc) << 22) | (arm_encode_pimm12 ((pimm), (scale)) << 10) | ((rn) << 5) | ((rt) << 0))

/* Load double */
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, dt, xn, simm, 8)
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, (dt), (xn), (simm), 8)
/* Load single */
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, dt, xn, simm, 4)
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, (dt), (xn), (simm), 4)
/* Load 128 bit */
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0, 0x3, qt, xn, simm, 16)
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0x0, 0x3, (qt), (xn), (simm), 16)

/*
 * LDR (literal, SIMD&FP): PC-relative load into SIMD&FP register RD.
 *
 * Instruction word layout used below:
 *   opc (bits 31:30) | 0x1c000000 (fixed bits) | disp19 (bits 23:5) | RD (bits 4:0)
 * where opc selects the access size (0 = 32 bit, 1 = 64 bit, 2 = 128 bit) and
 * disp19 is whatever arm_get_disp19 () computes for P and TARGET.
 *
 * The constants are unsigned hex on purpose: shifting a signed int so that a
 * bit lands in bit 31 (opc == 2) is undefined behaviour in C, and binary
 * literals are not standard C before C23.
 */
/* Load single */
#define arm_neon_ldrs_lit(p, rd, target) arm_emit ((p), 0x1c000000u | (0x0u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load double */
#define arm_neon_ldrd_lit(p, rd, target) arm_emit ((p), 0x1c000000u | (0x1u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load 128 bit */
#define arm_neon_ldrq_lit(p, rd, target) arm_emit ((p), 0x1c000000u | (0x2u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))

/* Arithmetic (immediate) */
static G_GNUC_UNUSED inline guint32
Expand Down
5 changes: 5 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ r8const: dest:f len:20
label: len:0
store_membase_imm: dest:b len:20
store_membase_reg: dest:b src1:i len:20
storex_membase: dest:b src1:x len:12
storei1_membase_imm: dest:b len:20
storei1_membase_reg: dest:b src1:i len:12
storei2_membase_imm: dest:b len:20
Expand All @@ -135,6 +136,7 @@ storei1_memindex: dest:b src1:i src2:i len:4
storei2_memindex: dest:b src1:i src2:i len:4
storei4_memindex: dest:b src1:i src2:i len:4
load_membase: dest:i src1:b len:20
loadx_membase: dest:x src1:b len:12
loadi1_membase: dest:i src1:b len:32
loadu1_membase: dest:i src1:b len:32
loadi2_membase: dest:i src1:b len:32
Expand Down Expand Up @@ -493,6 +495,9 @@ atomic_store_i8: dest:b src1:i len:20
atomic_store_u8: dest:b src1:i len:20
atomic_store_r4: dest:b src1:f len:28
atomic_store_r8: dest:b src1:f len:24
xbinop: dest:x src1:x src2:x len:4
xzero: dest:x len:4
xmove: dest:x src1:x len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
157 changes: 155 additions & 2 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,20 @@ emit_strfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_strfpq:
 *
 *   Emit code storing the SIMD&FP register RT to the address RN + IMM using
 * arm_strfpq () (presumably the 128 bit store form -- confirm against
 * arm64-codegen.h). Returns the updated code pointer, which the caller must
 * use from then on.
 */
static WARN_UNUSED_RESULT guint8*
emit_strfpq (guint8 *code, int rt, int rn, int imm)
{
	if (!arm_is_pimm12_scaled (imm, 16)) {
		/*
		 * IMM cannot be used directly as a 16-scaled immediate offset, so
		 * compute the address into IP0 first. RN must not be IP0 itself
		 * because IP0 is written before RN is read by the add.
		 */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		arm_strfpq (code, rt, ARMREG_IP0, 0);
		return code;
	}

	/* Common case: the offset is encoded in the store itself */
	arm_strfpq (code, rt, rn, imm);
	return code;
}

static WARN_UNUSED_RESULT guint8*
emit_strx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -717,6 +731,20 @@ emit_ldrfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_ldrfpq:
 *
 *   Emit code loading 128 bits from the address RN + IMM into the SIMD&FP
 * register RT. Returns the updated code pointer, which the caller must use
 * from then on.
 */
static WARN_UNUSED_RESULT guint8*
emit_ldrfpq (guint8 *code, int rt, int rn, int imm)
{
	if (!arm_is_pimm12_scaled (imm, 16)) {
		/*
		 * IMM cannot be used directly as a 16-scaled immediate offset, so
		 * compute the address into IP0 first. RN must not be IP0 itself
		 * because IP0 is written before RN is read by the add.
		 */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		arm_ldrfpq (code, rt, ARMREG_IP0, 0);
		return code;
	}

	/* Common case: the offset is encoded in the load itself */
	arm_ldrfpq (code, rt, rn, imm);
	return code;
}

guint8*
mono_arm_emit_ldrx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -2209,8 +2237,15 @@ mono_arch_allocate_vars (MonoCompile *cfg)
cfg->ret->dreg = cinfo->ret.reg;
break;
case ArgVtypeInIRegs:
case ArgHFA:
case ArgHFA: {
/* Allocate a local to hold the result, the epilog will copy it to the correct place */
MonoType *ret_type = mini_get_underlying_type (sig->ret);
MonoClass *klass = mono_class_from_mono_type_internal (ret_type);
if (MONO_CLASS_IS_SIMD (cfg, klass)) {
int align_simd = mono_type_size (m_class_get_byval_arg (klass), NULL);
offset = ALIGN_TO (offset, align_simd);
}

cfg->ret->opcode = OP_REGOFFSET;
cfg->ret->inst_basereg = cfg->frame_reg;
cfg->ret->inst_offset = offset;
Expand All @@ -2220,6 +2255,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
else
offset += 16;
break;
}
case ArgVtypeByRef:
/* This variable will be initialized in the prolog from R8 */
cfg->vret_addr->opcode = OP_REGOFFSET;
Expand Down Expand Up @@ -2377,7 +2413,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
ins->opcode = OP_REGOFFSET;
ins->inst_basereg = cfg->frame_reg;
ins->inst_offset = offset + offsets [i];
//printf ("allocated local %d to ", i); mono_print_tree_nl (ins);
//printf ("allocated local %d to ", i); mono_print_ins (ins);
}
}
offset += locals_stack_size;
Expand Down Expand Up @@ -3235,6 +3271,52 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
return code;
}

/*
 * get_vector_size_macro:
 *
 *   Map the value size of INS->klass to a vector register width constant:
 * 16 bytes -> VREG_FULL, 8 bytes -> VREG_LOW. Any other size asserts.
 */
static int
get_vector_size_macro (MonoInst *ins)
{
	int nbytes = mono_class_value_size (ins->klass, NULL);

	if (nbytes == 16)
		return VREG_FULL;
	if (nbytes == 8)
		return VREG_LOW;

	/* Only 64 and 128 bit vectors are handled here */
	g_assert_not_reached ();
}

/*
 * get_type_size_macro:
 *
 *   Map an element type to the TYPE_ constant describing its width/kind.
 * Signed and unsigned integers of the same width share a constant; native
 * sized integers follow TARGET_SIZEOF_VOID_P. Any other type asserts.
 */
static int
get_type_size_macro (MonoTypeEnum type)
{
	switch (type) {
	/* Floating point elements */
	case MONO_TYPE_R8:
		return TYPE_F64;
	case MONO_TYPE_R4:
		return TYPE_F32;

	/* Native sized integers */
	case MONO_TYPE_U:
	case MONO_TYPE_I:
#if TARGET_SIZEOF_VOID_P == 8
		return TYPE_I64;
#else
		return TYPE_I32;
#endif

	/* Fixed width integers, widest first */
	case MONO_TYPE_U8:
	case MONO_TYPE_I8:
		return TYPE_I64;
	case MONO_TYPE_U4:
	case MONO_TYPE_I4:
		return TYPE_I32;
	case MONO_TYPE_U2:
	case MONO_TYPE_I2:
		return TYPE_I16;
	case MONO_TYPE_U1:
	case MONO_TYPE_I1:
		return TYPE_I8;

	default:
		g_assert_not_reached ();
	}
}

void
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
{
Expand Down Expand Up @@ -3412,6 +3494,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
case OP_LOADX_MEMBASE:
code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XMOVE:
arm_movw (code, dreg, sreg1);
break;
case OP_XCONST: {
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
}
break;
}

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3484,6 +3589,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_true_bb, MONO_R_ARM64_CBZ);
arm_cbnzx (code, sreg1, 0);
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IADD:
arm_neon_add (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_FADD:
arm_neon_fadd (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down Expand Up @@ -5265,6 +5382,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
size += 32;
exc_throw_found [i] = TRUE;
}
} else if (ji->type == MONO_PATCH_INFO_X128) {
size += 16 + 15; /* sizeof (Vector128<T>) + alignment */
}
}

Expand Down Expand Up @@ -5306,6 +5425,40 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
set_code_cursor (cfg, code);
}

/* Handle relocations with PC relative addressing */
for (ji = cfg->patch_info; ji; ji = ji->next) {
gboolean remove = FALSE;

if (ji->type == MONO_PATCH_INFO_X128) {
guint8 *pos, *patch_pos;
guint32 target_pos;

code = (guint8*)ALIGN_TO (code, 16);
pos = cfg->native_code + ji->ip.i;
patch_pos = pos + 3;
target_pos = GPTRDIFF_TO_UINT32 (code - pos - 4);
memcpy (code, ji->data.target, 16);
code += 16;

*(guint32*)(patch_pos) = target_pos;
fanyang-mono marked this conversation as resolved.
Show resolved Hide resolved

remove = TRUE;
}

if (remove) {
if (ji == cfg->patch_info)
cfg->patch_info = ji->next;
else {
MonoJumpInfo *tmp;

for (tmp = cfg->patch_info; tmp->next != ji; tmp = tmp->next)
;
tmp->next = ji->next;
}
}
set_code_cursor (cfg, code);
}

set_code_cursor (cfg, code);
}

Expand Down
9 changes: 3 additions & 6 deletions src/mono/mono/mini/mini.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,18 +1502,15 @@ mono_allocate_stack_slots (MonoCompile *cfg, gboolean backward, guint32 *stack_s
* Align the size too so the code generated for passing vtypes in
* registers doesn't overwrite random locals.
*/
size = (size + (align - 1)) & ~(align -1);
size = ALIGN_TO (size, align);
}

if (backward) {
offset += size;
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset + size, align);
slot = offset;
}
else {
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset, align);
slot = offset;
fanyang-mono marked this conversation as resolved.
Show resolved Hide resolved
offset += size;
}
Expand Down
8 changes: 0 additions & 8 deletions src/mono/mono/mini/mini.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,15 +303,7 @@ enum {
#define MONO_IS_REAL_MOVE(ins) (((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_FMOVE) || ((ins)->opcode == OP_XMOVE) || ((ins)->opcode == OP_RMOVE))
#define MONO_IS_ZERO(ins) (((ins)->opcode == OP_VZERO) || ((ins)->opcode == OP_XZERO))

#ifdef TARGET_ARM64
/*
* SIMD is only supported on arm64 when using the LLVM backend. When not using
* the LLVM backend, treat SIMD datatypes as regular value types.
*/
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && COMPILE_LLVM (cfg) && m_class_is_simd_type (klass))
#else
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && m_class_is_simd_type (klass) && (COMPILE_LLVM (cfg) || mono_type_size (m_class_get_byval_arg (klass), NULL) == 16))
#endif

#else

Expand Down
Loading