Skip to content

Commit

Permalink
[Mono] Add initial arm64 hardware intrinsics support for mini JIT (#8…
Browse files Browse the repository at this point in the history
…2420)

* Initial change to make intrinsics work with mini JIT on arm64

* Fix alignment issue and add size getter functions

* Review feedback and refactor

* Fix align

* Fix issue with big offset

* Fix build warning

* Add intrinsics for get_One

* Add xconst to mdesc

* Fix MONO_PATCH_INFO_X128

* Stop the simd and fp registers sharing

* Stop v64 from emitting simd intrinsics

* Move between SIMD registers

* Adjust filter logic

* Keep the filter logic under non-llvm

* Uncomment
  • Loading branch information
fanyang-mono authored Mar 6, 2023
1 parent ef71bb7 commit d74dde7
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 23 deletions.
18 changes: 15 additions & 3 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,20 @@ arm_encode_imm7 (int imm, int size)
#define arm_format_ldrfp_imm(p, size, opc, rt, rn, pimm, scale) arm_emit ((p), ((size) << 30) | (0xf << 26) | (0x1 << 24) | ((opc) << 22) | (arm_encode_pimm12 ((pimm), (scale)) << 10) | ((rn) << 5) | ((rt) << 0))

/* Load double */
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, dt, xn, simm, 8)
#define arm_ldrfpx(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_X, 0x1, (dt), (xn), (simm), 8)
/* Load single */
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, dt, xn, simm, 4)
#define arm_ldrfpw(p, dt, xn, simm) arm_format_ldrfp_imm ((p), ARMSIZE_W, 0x1, (dt), (xn), (simm), 4)
/* Load 128 bit */
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0, 0x3, qt, xn, simm, 16)
#define arm_ldrfpq(p, qt, xn, simm) arm_format_ldrfp_imm ((p), 0x0, 0x3, (qt), (xn), (simm), 16)

/*
 * LDR (literal, SIMD&FP), PC-relative.
 *
 * Each macro emits one instruction word through arm_emit () at P. The word is
 * built from the base opcode, a size selector shifted to bit 30, the value
 * returned by arm_get_disp19 (P, TARGET) shifted to bit 5, and RD in the low bits.
 * The size selectors are unsigned so that shifting them to bit 30 never shifts
 * into the sign bit of an int.
 */
/* Load single */
#define arm_neon_ldrs_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b00u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load double */
#define arm_neon_ldrd_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b01u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/* Load 128 bit */
#define arm_neon_ldrq_lit(p, rd, target) arm_emit ((p), 0b00011100000000000000000000000000 | (0b10u << 30) | (arm_get_disp19 ((p), (target)) << 5) | (rd))
/*
 * Re-point the instruction word stored at P to TARGET. The mask keeps bits 0..4
 * and 24..31 of the existing word and replaces bits 5..23 with the new
 * displacement. P is parenthesized so that any pointer expression can be passed.
 */
#define arm_neon_ldrq_lit_fixup(p, target) (*((guint32*)(p)) = (*((guint32*)(p)) & 0xff00001f) | (arm_get_disp19 ((p), (target)) << 5))

/* Arithmetic (immediate) */
static G_GNUC_UNUSED inline guint32
Expand Down Expand Up @@ -1000,6 +1009,9 @@ arm_encode_arith_imm (int imm, guint32 *shift)
#define TYPE_F32 0
#define TYPE_F64 1

/*
 * NEON :: move SIMD register.
 * Copies RN to RD by emitting an ORR of RN with itself over the full register
 * width (VREG_FULL).
 */
#define arm_neon_mov(p, rd, rn) arm_neon_orr ((p), VREG_FULL, (rd), (rn), (rn))

/* NEON :: AES */
#define arm_neon_aes_opcode(p, size, opcode, rd, rn) arm_neon_opcode_2reg ((p), VREG_FULL, 0b00001110001010000000100000000000 | (size) << 22 | (opcode) << 12, (rd), (rn))
#define arm_neon_aese(p, rd, rn) arm_neon_aes_opcode ((p), 0b00, 0b00100, (rd), (rn))
Expand Down
6 changes: 6 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ r8const: dest:f len:20
label: len:0
store_membase_imm: dest:b len:20
store_membase_reg: dest:b src1:i len:20
# len must cover emit_strfpq's large-offset path: emit_imm + add + str
# (16 bytes, assuming emit_imm needs at most two instructions).
storex_membase: dest:b src1:x len:16
storei1_membase_imm: dest:b len:20
storei1_membase_reg: dest:b src1:i len:12
storei2_membase_imm: dest:b len:20
Expand All @@ -135,6 +136,7 @@ storei1_memindex: dest:b src1:i src2:i len:4
storei2_memindex: dest:b src1:i src2:i len:4
storei4_memindex: dest:b src1:i src2:i len:4
load_membase: dest:i src1:b len:20
# len must cover emit_ldrfpq's large-offset path: emit_imm + add + ldr
# (16 bytes, assuming emit_imm needs at most two instructions).
loadx_membase: dest:x src1:b len:16
loadi1_membase: dest:i src1:b len:32
loadu1_membase: dest:i src1:b len:32
loadi2_membase: dest:i src1:b len:32
Expand Down Expand Up @@ -493,6 +495,10 @@ atomic_store_i8: dest:b src1:i len:20
atomic_store_u8: dest:b src1:i len:20
atomic_store_r4: dest:b src1:f len:28
atomic_store_r8: dest:b src1:f len:24
xbinop: dest:x src1:x src2:x len:4
xzero: dest:x len:4
xmove: dest:x src1:x len:4
xconst: dest:x len:10

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
153 changes: 151 additions & 2 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,20 @@ emit_strfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_strfpq:
 *
 *   Emit a 128 bit SIMD/FP store of register RT to the address RN + IMM.
 * When arm_is_pimm12_scaled (imm, 16) accepts the offset, a single store with an
 * immediate offset is emitted. Otherwise the address is first formed in IP0
 * (emit_imm into IP0, then IP0 = RN + IP0) and the store uses IP0 with offset 0,
 * which is why RN must not be IP0 in that case.
 * Returns the updated code pointer.
 */
static WARN_UNUSED_RESULT guint8*
emit_strfpq (guint8 *code, int rt, int rn, int imm)
{
	int base_reg = rn;
	int disp = imm;

	if (!arm_is_pimm12_scaled (imm, 16)) {
		/* Offset not encodable: compute the effective address into the scratch reg */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		base_reg = ARMREG_IP0;
		disp = 0;
	}

	arm_strfpq (code, rt, base_reg, disp);
	return code;
}

static WARN_UNUSED_RESULT guint8*
emit_strx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -717,6 +731,20 @@ emit_ldrfpx (guint8 *code, int rt, int rn, int imm)
return code;
}

/*
 * emit_ldrfpq:
 *
 *   Emit a 128 bit SIMD/FP load into register RT from the address RN + IMM.
 * When arm_is_pimm12_scaled (imm, 16) accepts the offset, a single load with an
 * immediate offset is emitted. Otherwise the address is first formed in IP0
 * (emit_imm into IP0, then IP0 = RN + IP0) and the load uses IP0 with offset 0,
 * which is why RN must not be IP0 in that case.
 * Returns the updated code pointer.
 */
static WARN_UNUSED_RESULT guint8*
emit_ldrfpq (guint8 *code, int rt, int rn, int imm)
{
	int base_reg = rn;
	int disp = imm;

	if (!arm_is_pimm12_scaled (imm, 16)) {
		/* Offset not encodable: compute the effective address into the scratch reg */
		g_assert (rn != ARMREG_IP0);
		code = emit_imm (code, ARMREG_IP0, imm);
		arm_addx (code, ARMREG_IP0, rn, ARMREG_IP0);
		base_reg = ARMREG_IP0;
		disp = 0;
	}

	arm_ldrfpq (code, rt, base_reg, disp);
	return code;
}

guint8*
mono_arm_emit_ldrx (guint8 *code, int rt, int rn, int imm)
{
Expand Down Expand Up @@ -2209,8 +2237,15 @@ mono_arch_allocate_vars (MonoCompile *cfg)
cfg->ret->dreg = cinfo->ret.reg;
break;
case ArgVtypeInIRegs:
case ArgHFA:
case ArgHFA: {
/* Allocate a local to hold the result, the epilog will copy it to the correct place */
MonoType *ret_type = mini_get_underlying_type (sig->ret);
MonoClass *klass = mono_class_from_mono_type_internal (ret_type);
if (MONO_CLASS_IS_SIMD (cfg, klass)) {
int align_simd = mono_type_size (m_class_get_byval_arg (klass), NULL);
offset = ALIGN_TO (offset, align_simd);
}

cfg->ret->opcode = OP_REGOFFSET;
cfg->ret->inst_basereg = cfg->frame_reg;
cfg->ret->inst_offset = offset;
Expand All @@ -2220,6 +2255,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
else
offset += 16;
break;
}
case ArgVtypeByRef:
/* This variable will be initialized in the prolog from R8 */
cfg->vret_addr->opcode = OP_REGOFFSET;
Expand Down Expand Up @@ -2377,7 +2413,7 @@ mono_arch_allocate_vars (MonoCompile *cfg)
ins->opcode = OP_REGOFFSET;
ins->inst_basereg = cfg->frame_reg;
ins->inst_offset = offset + offsets [i];
//printf ("allocated local %d to ", i); mono_print_tree_nl (ins);
//printf ("allocated local %d to ", i); mono_print_ins (ins);
}
}
offset += locals_stack_size;
Expand Down Expand Up @@ -3235,6 +3271,52 @@ emit_branch_island (MonoCompile *cfg, guint8 *code, int start_offset)
return code;
}

/*
 * get_vector_size_macro:
 *
 *   Map the value size of INS->klass to the register width selector used by the
 * NEON emit macros: VREG_FULL for 16 byte classes, VREG_LOW for 8 byte classes.
 * Any other size trips g_assert_not_reached ().
 */
static int
get_vector_size_macro (MonoInst *ins)
{
	const int vector_bytes = mono_class_value_size (ins->klass, NULL);

	if (vector_bytes == 16)
		return VREG_FULL;
	if (vector_bytes == 8)
		return VREG_LOW;

	g_assert_not_reached ();
}

/*
 * get_type_size_macro:
 *
 *   Map an element type to the TYPE_ selector used by the NEON emit macros.
 * Signed and unsigned integers of the same width share a selector, the native
 * integer types follow TARGET_SIZEOF_VOID_P, and R4/R8 map to TYPE_F32/TYPE_F64.
 * Any other type trips g_assert_not_reached ().
 */
static int
get_type_size_macro (MonoTypeEnum type)
{
	/* Selector for MONO_TYPE_I/MONO_TYPE_U, chosen by the target pointer size */
#if TARGET_SIZEOF_VOID_P == 8
	const int native_int_selector = TYPE_I64;
#else
	const int native_int_selector = TYPE_I32;
#endif

	switch (type) {
	case MONO_TYPE_R4:
		return TYPE_F32;
	case MONO_TYPE_R8:
		return TYPE_F64;
	case MONO_TYPE_I:
	case MONO_TYPE_U:
		return native_int_selector;
	case MONO_TYPE_I1:
	case MONO_TYPE_U1:
		return TYPE_I8;
	case MONO_TYPE_I2:
	case MONO_TYPE_U2:
		return TYPE_I16;
	case MONO_TYPE_I4:
	case MONO_TYPE_U4:
		return TYPE_I32;
	case MONO_TYPE_I8:
	case MONO_TYPE_U8:
		return TYPE_I64;
	default:
		g_assert_not_reached ();
	}
}

void
mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
{
Expand Down Expand Up @@ -3412,6 +3494,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
case OP_LOADX_MEMBASE:
code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XMOVE:
arm_neon_mov (code, dreg, sreg1);
break;
case OP_XCONST: {
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
}
break;
}

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3484,6 +3589,18 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
mono_add_patch_info_rel (cfg, offset, MONO_PATCH_INFO_BB, ins->inst_true_bb, MONO_R_ARM64_CBZ);
arm_cbnzx (code, sreg1, 0);
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IADD:
arm_neon_add (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_FADD:
arm_neon_fadd (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down Expand Up @@ -5265,6 +5382,8 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
size += 32;
exc_throw_found [i] = TRUE;
}
} else if (ji->type == MONO_PATCH_INFO_X128) {
size += 16 + 15; /* sizeof (Vector128<T>) + alignment */
}
}

Expand Down Expand Up @@ -5306,6 +5425,36 @@ mono_arch_emit_exceptions (MonoCompile *cfg)
set_code_cursor (cfg, code);
}

/* Handle relocations with PC relative addressing */
for (ji = cfg->patch_info; ji; ji = ji->next) {
gboolean remove = FALSE;

if (ji->type == MONO_PATCH_INFO_X128) {
guint8 *pos;

code = (guint8*)ALIGN_TO (code, 16);
pos = cfg->native_code + ji->ip.i;
arm_neon_ldrq_lit_fixup (pos, code);
memcpy (code, ji->data.target, 16);
code += 16;

remove = TRUE;
}

if (remove) {
if (ji == cfg->patch_info)
cfg->patch_info = ji->next;
else {
MonoJumpInfo *tmp;

for (tmp = cfg->patch_info; tmp->next != ji; tmp = tmp->next)
;
tmp->next = ji->next;
}
}
set_code_cursor (cfg, code);
}

set_code_cursor (cfg, code);
}

Expand Down
4 changes: 3 additions & 1 deletion src/mono/mono/mini/mini-arm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@

#if !defined(DISABLE_SIMD)
#define MONO_ARCH_SIMD_INTRINSICS 1
#define MONO_ARCH_NEED_SIMD_BANK 1
#define MONO_ARCH_USE_SHARED_FP_SIMD_BANK 1
#endif

#define MONO_CONTEXT_SET_LLVM_EXC_REG(ctx, exc) do { (ctx)->regs [0] = (gsize)exc; } while (0)
Expand Down Expand Up @@ -52,7 +54,7 @@
/* v8..v15 */
#define MONO_ARCH_CALLEE_SAVED_FREGS 0xff00

#define MONO_ARCH_CALLEE_SAVED_XREGS 0
#define MONO_ARCH_CALLEE_SAVED_XREGS MONO_ARCH_CALLEE_SAVED_FREGS

#define MONO_ARCH_CALLEE_XREGS MONO_ARCH_CALLEE_FREGS

Expand Down
9 changes: 3 additions & 6 deletions src/mono/mono/mini/mini.c
Original file line number Diff line number Diff line change
Expand Up @@ -1502,18 +1502,15 @@ mono_allocate_stack_slots (MonoCompile *cfg, gboolean backward, guint32 *stack_s
* Align the size too so the code generated for passing vtypes in
* registers doesn't overwrite random locals.
*/
size = (size + (align - 1)) & ~(align -1);
size = ALIGN_TO (size, align);
}

if (backward) {
offset += size;
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset + size, align);
slot = offset;
}
else {
offset += align - 1;
offset &= ~(align - 1);
offset = ALIGN_TO (offset, align);
slot = offset;
offset += size;
}
Expand Down
8 changes: 0 additions & 8 deletions src/mono/mono/mini/mini.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,15 +303,7 @@ enum {
#define MONO_IS_REAL_MOVE(ins) (((ins)->opcode == OP_MOVE) || ((ins)->opcode == OP_FMOVE) || ((ins)->opcode == OP_XMOVE) || ((ins)->opcode == OP_RMOVE))
#define MONO_IS_ZERO(ins) (((ins)->opcode == OP_VZERO) || ((ins)->opcode == OP_XZERO))

#ifdef TARGET_ARM64
/*
* SIMD is only supported on arm64 when using the LLVM backend. When not using
* the LLVM backend, treat SIMD datatypes as regular value types.
*/
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && COMPILE_LLVM (cfg) && m_class_is_simd_type (klass))
#else
#define MONO_CLASS_IS_SIMD(cfg, klass) (((cfg)->opt & MONO_OPT_SIMD) && m_class_is_simd_type (klass) && (COMPILE_LLVM (cfg) || mono_type_size (m_class_get_byval_arg (klass), NULL) == 16))
#endif

#else

Expand Down
Loading

0 comments on commit d74dde7

Please sign in to comment.