From e31f305aa4da478682506e8a4c578dd1c9dbd969 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:00:06 -0700 Subject: [PATCH 01/24] Update ld1 in instrsarm64.h --- src/coreclr/src/jit/instrsarm64.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 2eadae317324f..49fcd6e49dd9a 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -85,6 +85,15 @@ INST6(sub, "sub", 0, 0, IF_EN6A, 0x4B000000, 0x4B000000, 0x4B200000, // sub Vd,Vn,Vm DV_3A 0Q101110XX1mmmmm 100001nnnnnddddd 2E20 8400 Vd,Vn,Vm (vector) // sub Vd,Vn,Vm DV_3E 01111110111mmmmm 100001nnnnnddddd 7EE0 8400 Vd,Vn,Vm (scalar) +// enum name FP LD/ST LS_2D LS_3F LS_2E LS_2F LS_3G LS_2G +INST6(ld1, "ld1", 0, LD, IF_EN6B, 0x0C407000, 0x0CC07000, 0x0CDF7000, 0x0D400000, 0x0DC00000, 0x0DDF0000) + // ld1 {Vt},[Xn] LS_2D 0Q00110001000000 0111ssnnnnnttttt 0C40 7000 base register + // ld1 {Vt},[Xn],Xm LS_3F 0Q001100110mmmmm 0111ssnnnnnttttt 0CC0 7000 post-indexed by a register + // ld1 {Vt},[Xn],#imm LS_2E 0Q00110011011111 0111ssnnnnnttttt 0CDF 7000 post-indexed by an immediate + // ld1 {Vt}[],[Xn] LS_2F 0Q00110101000000 xx0Sssnnnnnttttt 0D40 0000 base register + // ld1 {Vt}[],[Xn],Xm LS_3G 0Q001101110mmmmm xx0Sssnnnnnttttt 0DC0 0000 post-indexed by a register + // ld1 {Vt}[],[Xn],#imm LS_2G 0Q00110111011111 xx0Sssnnnnnttttt 0DDF 0000 post-indexed by an immediate + // enum name FP LD/ST LS_2A LS_2B LS_2C LS_3A LS_1A INST5(ldr, "ldr", 0,LD, IF_EN5A, 0xB9400000, 0xB9400000, 0xB8400000, 0xB8600800, 0x18000000) // ldr Rt,[Xn] LS_2A 1X11100101000000 000000nnnnnttttt B940 0000 @@ -263,13 +272,6 @@ INST4(fcmgt, "fcmgt", 0, 0, IF_EN4I, 0x7EA0E400, 0x2EA0E400, 0x5EA0C800, // fcmgt Vd,Vn DV_2G 010111101X100000 110010nnnnnddddd 5EA0 E800 Vd Vn (scalar) // fcmgt Vd,Vn DV_2A 0Q0011101X100000 110010nnnnnddddd 0EA0 C800 Vd Vn (vector) -// enum name FP LD/ST LS_2D LS_3F LS_2E 
LS_3G -INST4(ld1, "ld1", 0, LD,IF_EN4J, 0x0C402000, 0x0CC02000, 0x0D400000, 0x0DC00000) - // ld1 Vd,Rn LS_2D 0Q00110001000000 xx1xssnnnnnttttt 0C40 2000 Vd,Rn (vector - multiple structures) - // ld1 Vd,Rn,Rm LS_3F 0Q001100110mmmmm xx1xssnnnnnttttt 0CC0 2000 Vd,Rn,Rm (vector - multiple structures) - // ld1 Vd[],Rn LS_2E 0Q00110101000000 xx0Sssnnnnnttttt 0D40 0000 Vd[],Rn (vector - single structure) - // ld1 Vd[],Rn,Rm LS_3G 0Q001101110mmmmm xx0Sssnnnnnttttt 0DC0 0000 Vd[],Rn,Rm (vector - single structure) - // enum name FP LD/ST DR_3A DR_3B DI_2C INST3(ands, "ands", 0, 0, IF_EN3A, 0x6A000000, 0x6A000000, 0x72000000) // ands Rd,Rn,Rm DR_3A X1101010000mmmmm 000000nnnnnddddd 6A00 0000 From cddcb09276d2aa8e8335fdf5db7fa6bfdf16e63d Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:04:29 -0700 Subject: [PATCH 02/24] Add ld2, ld3, ld4, st1, st2, st3, st4 in instrsarm64.h --- src/coreclr/src/jit/instrsarm64.h | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 49fcd6e49dd9a..41ae8d243d923 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -94,6 +94,62 @@ INST6(ld1, "ld1", 0, LD, IF_EN6B, 0x0C407000, 0x0CC07000, 0x0CDF7000, // ld1 {Vt}[],[Xn],Xm LS_3G 0Q001101110mmmmm xx0Sssnnnnnttttt 0DC0 0000 post-indexed by a register // ld1 {Vt}[],[Xn],#imm LS_2G 0Q00110111011111 xx0Sssnnnnnttttt 0DDF 0000 post-indexed by an immediate +INST6(ld2, "ld2", 0, LD, IF_EN6B, 0x0C408000, 0x0CC08000, 0x0CDF8000, 0x0D600000, 0x0DE00000, 0x0DFF0000) + // ld2 {Vt,Vt2},[Xn] LS_2D 0Q00110001000000 1000ssnnnnnttttt 0C40 8000 base register + // ld2 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100110mmmmm 1000ssnnnnnttttt 0CC0 8000 post-indexed by a register + // ld2 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110011011111 1000ssnnnnnttttt 0CDF 8000 post-indexed by an immediate + // ld2 {Vt,Vt2}[],[Xn] LS_2F 0Q00110101100000 xx0Sssnnnnnttttt 0D60 0000 base register + // 
ld2 {Vt,Vt2}[],[Xn],Xm LS_3G 0Q001101111mmmmm xx0Sssnnnnnttttt 0DE0 0000 post-indexed by a register + // ld2 {Vt,Vt2}[],[Xn],#imm LS_2G 0Q00110111111111 xx0Sssnnnnnttttt 0DFF 0000 post-indexed by an immediate + +INST6(ld3, "ld3", 0, LD, IF_EN6B, 0x0C404000, 0x0CC04000, 0x0CDF4000, 0x0D402000, 0x0DC02000, 0x0DDF2000) + // ld3 {Vt-Vt3},[Xn] LS_2D 0Q00110001000000 0100ssnnnnnttttt 0C40 4000 base register + // ld3 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100110mmmmm 0100ssnnnnnttttt 0CC0 4000 post-indexed by a register + // ld3 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110011011111 0100ssnnnnnttttt 0CDF 4000 post-indexed by an immediate + // ld3 {Vt-Vt3}[],[Xn] LS_2F 0Q00110101000000 xx1Sssnnnnnttttt 0D40 2000 base register + // ld3 {Vt-Vt3}[],[Xn],Xm LS_3G 0Q001101110mmmmm xx1Sssnnnnnttttt 0DC0 2000 post-indexed by a register + // ld3 {Vt-Vt3}[],[Xn],#imm LS_2G 0Q00110111011111 xx1Sssnnnnnttttt 0DDF 2000 post-indexed by an immediate + +INST6(ld4, "ld4", 0, LD, IF_EN6B, 0x0C400000, 0x0CC00000, 0x0CDF0000, 0x0D602000, 0x0DE02000, 0x0DFF2000) + // ld4 {Vt-Vt4},[Xn] LS_2D 0Q00110001000000 0000ssnnnnnttttt 0C40 0000 base register + // ld4 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100110mmmmm 0000ssnnnnnttttt 0CC0 0000 post-indexed by a register + // ld4 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110011011111 0000ssnnnnnttttt 0CDF 0000 post-indexed by an immediate + // ld4 {Vt-Vt4}[],[Xn] LS_2F 0Q00110101100000 xx1Sssnnnnnttttt 0D60 2000 base register + // ld4 {Vt-Vt4}[],[Xn],Xm LS_3G 0Q001101111mmmmm xx1Sssnnnnnttttt 0DE0 2000 post-indexed by a register + // ld4 {Vt-Vt4}[],[Xn],#imm LS_2G 0Q00110111111111 xx1Sssnnnnnttttt 0DFF 2000 post-indexed by an immediate + +INST6(st1, "st1", 0, ST, IF_EN6B, 0x0C007000, 0x0C807000, 0x0C9F7000, 0x0D000000, 0x0D800000, 0x0D9F0000) + // st1 {Vt},[Xn] LS_2D 0Q00110000000000 0111ssnnnnnttttt 0C00 7000 base register + // st1 {Vt},[Xn],Xm LS_3F 0Q001100100mmmmm 0111ssnnnnnttttt 0C80 7000 post-indexed by a register + // st1 {Vt},[Xn],#imm LS_2E 0Q00110010011111 0111ssnnnnnttttt 0C9F 7000 
post-indexed by an immediate + // st1 {Vt}[],[Xn] LS_2F 0Q00110100000000 xx0Sssnnnnnttttt 0D00 0000 base register + // st1 {Vt}[],[Xn],Xm LS_3G 0Q001101100mmmmm xx0Sssnnnnnttttt 0D80 0000 post-indexed by a register + // st1 {Vt}[],[Xn],#imm LS_2G 0Q00110110011111 xx0Sssnnnnnttttt 0D9F 0000 post-indexed by an immediate + +INST6(st2, "st2", 0, ST, IF_EN6B, 0x0C008000, 0x0C808000, 0x0C9F8000, 0x0D200000, 0x0DA00000, 0x0DBF0000) + // st2 {Vt,Vt2},[Xn] LS_2D 0Q00110000000000 1000ssnnnnnttttt 0C00 8000 base register + // st2 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100100mmmmm 1000ssnnnnnttttt 0C80 8000 post-indexed by a register + // st2 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110010011111 1000ssnnnnnttttt 0C9F 8000 post-indexed by an immediate + // st2 {Vt,Vt2}[],[Xn] LS_2F 0Q00110100100000 xx0Sssnnnnnttttt 0D20 0000 base register + // st2 {Vt,Vt2}[],[Xn],Xm LS_3G 0Q001101101mmmmm xx0Sssnnnnnttttt 0DA0 0000 post-indexed by a register + // st2 {Vt,Vt2}[],[Xn],#imm LS_2G 0Q00110110111111 xx0Sssnnnnnttttt 0DBF 0000 post-indexed by an immediate + +INST6(st3, "st3", 0, ST, IF_EN6B, 0x0C004000, 0x0C804000, 0x0C9F4000, 0x0D002000, 0x0D802000, 0x0D9F2000) + // st3 {Vt-Vt3},[Xn] LS_2D 0Q00110000000000 0100ssnnnnnttttt 0C00 4000 base register + // st3 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100100mmmmm 0100ssnnnnnttttt 0C80 4000 post-indexed by a register + // st3 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110010011111 0100ssnnnnnttttt 0C9F 4000 post-indexed by an immediate + // st3 {Vt-Vt3}[],[Xn] LS_2F 0Q00110100000000 xx1Sssnnnnnttttt 0D00 2000 base register + // st3 {Vt-Vt3}[],[Xn],Xm LS_3G 0Q001101100mmmmm xx1Sssnnnnnttttt 0D80 2000 post-indexed by a register + // st3 {Vt-Vt3}[],[Xn],#imm LS_2G 0Q00110110011111 xx1Sssnnnnnttttt 0D9F 2000 post-indexed by an immediate + +INST6(st4, "st4", 0, ST, IF_EN6B, 0x0C000000, 0x0C800000, 0x0C9F0000, 0x0D202000, 0x0DA02000, 0x0DBF2000) + // st4 {Vt-Vt4},[Xn] LS_2D 0Q00110000000000 0000ssnnnnnttttt 0C00 0000 base register + // st4 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100100mmmmm 0000ssnnnnnttttt 
0C80 0000 post-indexed by a register + // st4 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110010011111 0000ssnnnnnttttt 0C9F 0000 post-indexed by an immediate + // st4 {Vt-Vt4}[],[Xn] LS_2F 0Q00110100100000 xx1Sssnnnnnttttt 0D20 2000 base register + // st4 {Vt-Vt4}[],[Xn],Xm LS_3G 0Q001101101mmmmm xx1Sssnnnnnttttt 0DA0 2000 post-indexed by a register + // st4 {Vt-Vt4}[],[Xn],#imm LS_2G 0Q00110110111111 xx1Sssnnnnnttttt 0DBF 2000 post-indexed by an immediate + // enum name FP LD/ST LS_2A LS_2B LS_2C LS_3A LS_1A INST5(ldr, "ldr", 0,LD, IF_EN5A, 0xB9400000, 0xB9400000, 0xB8400000, 0xB8600800, 0x18000000) // ldr Rt,[Xn] LS_2A 1X11100101000000 000000nnnnnttttt B940 0000 From d11a9568b881a015535df8530079fc91fe95ef20 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:05:53 -0700 Subject: [PATCH 03/24] Add ld1, st1 operating on multiple registers in instrsarm64.h --- src/coreclr/src/jit/instrsarm64.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 41ae8d243d923..335ead82927e4 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -436,6 +436,36 @@ INST3(mvn, "mvn", 0, 0, IF_EN3I, 0x2A2003E0, 0x2A2003E0, 0x2E205800) // mvn Rd,(Rm,shk,imm) DR_2F X0101010sh1mmmmm iiiiii11111ddddd 2A20 03E0 Rm {LSL,LSR,ASR} imm(0-63) // mvn Vd,Vn DV_2M 0Q10111000100000 010110nnnnnddddd 2E20 5800 Vd,Vn (vector) +// enum name FP LD/ST LS_2D LS_3F LS_2E +INST3(ld1_2regs,"ld1", 0,LD, IF_EN3J, 0x0C40A000, 0x0CC0A000, 0x0CDFA000) + // ld1 {Vt,Vt2},[Xn] LS_2D 0Q00110001000000 1010ssnnnnnttttt 0C40 A000 base register + // ld1 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100110mmmmm 1010ssnnnnnttttt 0CC0 A000 post-indexed by a register + // ld1 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110011011111 1010ssnnnnnttttt 0CDF A000 post-indexed by an immediate + +INST3(ld1_3regs,"ld1", 0,LD, IF_EN3J, 0x0C406000, 0x0CC06000, 0x0CDF6000) + // ld1 {Vt-Vt3},[Xn] LS_2D 0Q00110001000000 
0110ssnnnnnttttt 0C40 6000 base register + // ld1 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100110mmmmm 0110ssnnnnnttttt 0CC0 6000 post-indexed by a register + // ld1 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110011011111 0110ssnnnnnttttt 0CDF 6000 post-indexed by an immediate + +INST3(ld1_4regs,"ld1", 0,LD, IF_EN3J, 0x0C402000, 0x0CC02000, 0x0CDF2000) + // ld1 {Vt-Vt4},[Xn] LS_2D 0Q00110001000000 0010ssnnnnnttttt 0C40 2000 base register + // ld1 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100110mmmmm 0010ssnnnnnttttt 0CC0 2000 post-indexed by a register + // ld1 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110011011111 0010ssnnnnnttttt 0CDF 2000 post-indexed by an immediate + +INST3(st1_2regs,"st1", 0,ST, IF_EN3J, 0x0C00A000, 0x0C80A000, 0x0C9FA000) + // st1 {Vt,Vt2},[Xn] LS_2D 0Q00110000000000 1010ssnnnnnttttt 0C00 A000 base register + // st1 {Vt,Vt2},[Xn],Xm LS_3F 0Q001100100mmmmm 1010ssnnnnnttttt 0C80 A000 post-indexed by a register + // st1 {Vt,Vt2},[Xn],#imm LS_2E 0Q00110010011111 1010ssnnnnnttttt 0C9F A000 post-indexed by an immediate + +INST3(st1_3regs,"st1", 0,ST, IF_EN3J, 0x0C006000, 0x0C806000, 0x0C9F6000) + // st1 {Vt-Vt3},[Xn] LS_2D 0Q00110000000000 0110ssnnnnnttttt 0C00 6000 base register + // st1 {Vt-Vt3},[Xn],Xm LS_3F 0Q001100100mmmmm 0110XXnnnnnttttt 0C80 6000 post-indexed by a register + // st1 {Vt-Vt3},[Xn],#imm LS_2E 0Q00110010011111 0110XXnnnnnttttt 0C9F 6000 post-indexed by an immediate + +INST3(st1_4regs,"st1", 0,ST, IF_EN3J, 0x0C002000, 0x0C802000, 0x0C9F2000) + // st1 {Vt-Vt4},[Xn] LS_2D 0Q00110000000000 0010XXnnnnnttttt 0C00 2000 base register + // st1 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100100mmmmm 0010XXnnnnnttttt 0C80 2000 post-indexed by a register + // st1 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110010011111 0010XXnnnnnttttt 0C9F 2000 post-indexed by an immediate // enum name FP LD/ST DR_2E DR_2F INST2(negs, "negs", 0, 0, IF_EN2A, 0x6B0003E0, 0x6B0003E0) From 66cb52e3e8a89bd6268be7551c56ab414c17623e Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:06:30 -0700 Subject: [PATCH 04/24] Add 
ld1r, ld2r, ld3r, ld4r in instrsarm64.h --- src/coreclr/src/jit/instrsarm64.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 335ead82927e4..6958660ac68a3 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -467,6 +467,26 @@ INST3(st1_4regs,"st1", 0,ST, IF_EN3J, 0x0C002000, 0x0C802000, 0x0C9F2000) // st1 {Vt-Vt4},[Xn],Xm LS_3F 0Q001100100mmmmm 0010XXnnnnnttttt 0C80 2000 post-indexed by a register // st1 {Vt-Vt4},[Xn],#imm LS_2E 0Q00110010011111 0010XXnnnnnttttt 0C9F 2000 post-indexed by an immediate +INST3(ld1r, "ld1r", 0,LD, IF_EN3J, 0x0D40C000, 0x0DC0C000, 0x0DDFC000) + // ld1r {Vt},[Xn] LS_2D 0Q00110101000000 1100ssnnnnnttttt 0D40 C000 base register + // ld1r {Vt},[Xn],Xm LS_3F 0Q001101110mmmmm 1100ssnnnnnttttt 0DC0 C000 post-indexed by a register + // ld1r {Vt},[Xn],#imm LS_2E 0Q00110111011111 1100ssnnnnnttttt 0DDF C000 post-indexed by an immediate + +INST3(ld2r, "ld2r", 0,LD, IF_EN3J, 0x0D60C000, 0x0DE0C000, 0x0DFFC000) + // ld2r {Vt,Vt2},[Xn] LS_2D 0Q00110101100000 1100ssnnnnnttttt 0D60 C000 base register + // ld2r {Vt,Vt2},[Xn],Xm LS_3F 0Q001101111mmmmm 1100ssnnnnnttttt 0DE0 C000 post-indexed by a register + // ld2r {Vt,Vt2},[Xn],#imm LS_2E 0Q00110111111111 1100ssnnnnnttttt 0DFF C000 post-indexed by an immediate + +INST3(ld3r, "ld3r", 0,LD, IF_EN3J, 0x0D40E000, 0x0DC0E000, 0x0DDFE000) + // ld3r {Vt-Vt3},[Xn] LS_2D 0Q00110101000000 1110ssnnnnnttttt 0D40 E000 base register + // ld3r {Vt-Vt3},[Xn],Xm LS_3F 0Q001101110mmmmm 1110ssnnnnnttttt 0DC0 E000 post-indexed by a register + // ld3r {Vt-Vt3},[Xn],#imm LS_2E 0Q00110111011111 1110ssnnnnnttttt 0DDF E000 post-indexed by an immediate + +INST3(ld4r, "ld4r", 0,LD, IF_EN3J, 0x0D60E000, 0x0DE0E000, 0x0DFFE000) + // ld4r {Vt-Vt4},[Xn] LS_2D 0Q00110101100000 1110ssnnnnnttttt 0D60 E000 base register + // ld4r {Vt-Vt4},[Xn],Xm LS_3F 0Q001101111mmmmm 1110ssnnnnnttttt 0DE0 E000 post-indexed by 
a register + // ld4r {Vt-Vt4},[Xn],#imm LS_2E 0Q00110111111111 1110ssnnnnnttttt 0DFF E000 post-indexed by an immediate + // enum name FP LD/ST DR_2E DR_2F INST2(negs, "negs", 0, 0, IF_EN2A, 0x6B0003E0, 0x6B0003E0) // negs Rd,Rm DR_2E X1101011000mmmmm 00000011111ddddd 6B00 03E0 From d591c2b3b7a471f5cdb9b23c592e92739dea4c0c Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:12:49 -0700 Subject: [PATCH 05/24] Remove EN4J, add EN6B and EN3J in emitarm64.cpp emitfmtsarm64.h --- src/coreclr/src/jit/emitarm64.cpp | 36 +++++++++++++++++++---------- src/coreclr/src/jit/emitfmtsarm64.h | 3 ++- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 49b7b46b7cde9..8d11f8c0bd879 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -1533,6 +1533,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode9[9] = {IF_DR_2E, IF_DR_2G, IF_DI_1B, IF_DI_1D, IF_DV_3C, IF_DV_2B, IF_DV_2C, IF_DV_2E, IF_DV_2F}; const static insFormat formatEncode6A[6] = {IF_DR_3A, IF_DR_3B, IF_DR_3C, IF_DI_2A, IF_DV_3A, IF_DV_3E}; + const static insFormat formatEncode6B[6] = {IF_LS_2D, IF_LS_3F, IF_LS_2E, IF_LS_2F, IF_LS_3G, IF_LS_2G}; const static insFormat formatEncode5A[5] = {IF_LS_2A, IF_LS_2B, IF_LS_2C, IF_LS_3A, IF_LS_1A}; const static insFormat formatEncode5B[5] = {IF_DV_2G, IF_DV_2H, IF_DV_2I, IF_DV_1A, IF_DV_1B}; const static insFormat formatEncode5C[5] = {IF_DR_3A, IF_DR_3B, IF_DI_2C, IF_DV_3C, IF_DV_1B}; @@ -1545,7 +1546,6 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode4G[4] = {IF_DR_2E, IF_DR_2F, IF_DV_2M, IF_DV_2L}; const static insFormat formatEncode4H[4] = {IF_DV_3E, IF_DV_3A, IF_DV_2L, IF_DV_2M}; const static insFormat formatEncode4I[4] = {IF_DV_3D, IF_DV_3B, IF_DV_2G, IF_DV_2A}; - const static insFormat formatEncode4J[4] = {IF_LS_2D, IF_LS_3F, 
IF_LS_2E, IF_LS_3G}; const static insFormat formatEncode3A[3] = {IF_DR_3A, IF_DR_3B, IF_DI_2C}; const static insFormat formatEncode3B[3] = {IF_DR_2A, IF_DR_2B, IF_DI_1C}; const static insFormat formatEncode3C[3] = {IF_DR_3A, IF_DR_3B, IF_DV_3C}; @@ -1555,6 +1555,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) const static insFormat formatEncode3G[3] = {IF_DV_2A, IF_DV_2G, IF_DV_2I}; const static insFormat formatEncode3H[3] = {IF_DR_3A, IF_DV_3A, IF_DV_3AI}; const static insFormat formatEncode3I[3] = {IF_DR_2E, IF_DR_2F, IF_DV_2M}; + const static insFormat formatEncode3J[3] = {IF_LS_2D, IF_LS_3F, IF_LS_2E}; const static insFormat formatEncode2A[2] = {IF_DR_2E, IF_DR_2F}; const static insFormat formatEncode2B[2] = {IF_DR_3A, IF_DR_3B}; const static insFormat formatEncode2C[2] = {IF_DR_3A, IF_DI_2D}; @@ -1602,6 +1603,17 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; + case IF_EN6B: + for (index = 0; index < 6; index++) + { + if (fmt == formatEncode6B[index]) + { + encoding_found = true; + break; + } + } + break; + case IF_EN5A: for (index = 0; index < 5; index++) { @@ -1734,17 +1746,6 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; - case IF_EN4J: - for (index = 0; index < 4; index++) - { - if (fmt == formatEncode4J[index]) - { - encoding_found = true; - break; - } - } - break; - case IF_EN3A: for (index = 0; index < 3; index++) { @@ -1844,6 +1845,17 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } break; + case IF_EN3J: + for (index = 0; index < 3; index++) + { + if (fmt == formatEncode3J[index]) + { + encoding_found = true; + break; + } + } + break; + case IF_EN2A: for (index = 0; index < 2; index++) { diff --git a/src/coreclr/src/jit/emitfmtsarm64.h b/src/coreclr/src/jit/emitfmtsarm64.h index f8c52fea3a0dc..9802c76f2bdba 100644 --- a/src/coreclr/src/jit/emitfmtsarm64.h +++ b/src/coreclr/src/jit/emitfmtsarm64.h @@ -48,6 +48,7 @@ IF_DEF(LARGELDC, 
IS_NONE, JMP) // large constant pseudo-op (adrp + ldr) IF_DEF(EN9, IS_NONE, NONE) // Instruction has 9 possible encoding types IF_DEF(EN6A, IS_NONE, NONE) // Instruction has 6 possible encoding types, type A +IF_DEF(EN6B, IS_NONE, NONE) // Instruction has 6 possible encoding types, type B IF_DEF(EN5A, IS_NONE, NONE) // Instruction has 5 possible encoding types, type A IF_DEF(EN5B, IS_NONE, NONE) // Instruction has 5 possible encoding types, type B IF_DEF(EN5C, IS_NONE, NONE) // Instruction has 5 possible encoding types, type C @@ -60,7 +61,6 @@ IF_DEF(EN4F, IS_NONE, NONE) // Instruction has 4 possible encoding types, type F IF_DEF(EN4G, IS_NONE, NONE) // Instruction has 4 possible encoding types, type G IF_DEF(EN4H, IS_NONE, NONE) // Instruction has 4 possible encoding types, type H IF_DEF(EN4I, IS_NONE, NONE) // Instruction has 4 possible encoding types, type I -IF_DEF(EN4J, IS_NONE, NONE) // Instruction has 4 possible encoding types, type J IF_DEF(EN3A, IS_NONE, NONE) // Instruction has 3 possible encoding types, type A IF_DEF(EN3B, IS_NONE, NONE) // Instruction has 3 possible encoding types, type B IF_DEF(EN3C, IS_NONE, NONE) // Instruction has 3 possible encoding types, type C @@ -70,6 +70,7 @@ IF_DEF(EN3F, IS_NONE, NONE) // Instruction has 3 possible encoding types, type F IF_DEF(EN3G, IS_NONE, NONE) // Instruction has 3 possible encoding types, type G IF_DEF(EN3H, IS_NONE, NONE) // Instruction has 3 possible encoding types, type H IF_DEF(EN3I, IS_NONE, NONE) // Instruction has 3 possible encoding types, type I +IF_DEF(EN3J, IS_NONE, NONE) // Instruction has 3 possible encoding types, type J IF_DEF(EN2A, IS_NONE, NONE) // Instruction has 2 possible encoding types, type A IF_DEF(EN2B, IS_NONE, NONE) // Instruction has 2 possible encoding types, type B IF_DEF(EN2C, IS_NONE, NONE) // Instruction has 2 possible encoding types, type C From b5594035a740825f0e0fb5dbafe99faa1b45239d Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 9 Mar 2020 20:23:34 -0700 
Subject: [PATCH 06/24] Update LS_2D, LS_2E, LS_3F, LS_3G and add LS_2F, LS_2G in emitfmtsarm64.h --- src/coreclr/src/jit/emitfmtsarm64.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/coreclr/src/jit/emitfmtsarm64.h b/src/coreclr/src/jit/emitfmtsarm64.h index 9802c76f2bdba..c0fd69edc05e5 100644 --- a/src/coreclr/src/jit/emitfmtsarm64.h +++ b/src/coreclr/src/jit/emitfmtsarm64.h @@ -134,15 +134,20 @@ IF_DEF(LS_1A, IS_NONE, JMP) // LS_1A XX...V..iiiiiiii iiiiiiiiiiittttt R IF_DEF(LS_2A, IS_NONE, NONE) // LS_2A .X.......X...... ......nnnnnttttt Rt Rn IF_DEF(LS_2B, IS_NONE, NONE) // LS_2B .X.......Xiiiiii iiiiiinnnnnttttt Rt Rn imm(0-4095) IF_DEF(LS_2C, IS_NONE, NONE) // LS_2C .X.......X.iiiii iiiiP.nnnnnttttt Rt Rn imm(-256..+255) pre/post inc -IF_DEF(LS_2D, IS_NONE, NONE) // LS_2D .Q.............. xx.xssnnnnnttttt Rn Vt -IF_DEF(LS_2E, IS_NONE, NONE) // LS_2E .Q.............. xx.Sssnnnnnttttt Rn Vt[] +IF_DEF(LS_2D, IS_NONE, NONE) // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn Load/Store multiple structures base register + // Load single structure and replicate base register +IF_DEF(LS_2E, IS_NONE, NONE) // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn Load/Store multiple structures post-indexed by an immediate + // Load single structure and replicate post-indexed by an immediate +IF_DEF(LS_2F, IS_NONE, NONE) // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn Load/Store single structure base register +IF_DEF(LS_2G, IS_NONE, NONE) // LS_2G .Q.............. ...Sssnnnnnttttt Vt[] Rn Load/Store single structure post-indexed by an immediate IF_DEF(LS_3A, IS_NONE, NONE) // LS_3A .X.......X.mmmmm xxxS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} IF_DEF(LS_3B, IS_NONE, NONE) // LS_3B X............... 
.aaaaannnnnddddd Rd Ra Rn IF_DEF(LS_3C, IS_NONE, NONE) // LS_3C X.........iiiiii iaaaaannnnnddddd Rd Ra Rn imm(im7,sh) IF_DEF(LS_3D, IS_NONE, NONE) // LS_3D .X.......X.mmmmm ......nnnnnttttt Wm Rt Rn IF_DEF(LS_3E, IS_NONE, NONE) // LS_3E .X.........mmmmm ......nnnnnttttt Rm Rt Rn ARMv8.1 LSE Atomics -IF_DEF(LS_3F, IS_NONE, NONE) // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Rm Rn Vt -IF_DEF(LS_3G, IS_NONE, NONE) // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Rm Rn Vt[] +IF_DEF(LS_3F, IS_NONE, NONE) // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register +IF_DEF(LS_3G, IS_NONE, NONE) // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm Load/Store single structure post-indexed by a register IF_DEF(DI_1A, IS_NONE, NONE) // DI_1A X.......shiiiiii iiiiiinnnnn..... Rn imm(i12,sh) IF_DEF(DI_1B, IS_NONE, NONE) // DI_1B X........hwiiiii iiiiiiiiiiiddddd Rd imm(i16,hw) From e8524fe2e0b3e9d7ff11d8a06cfe6532ca94a8e3 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Mon, 2 Mar 2020 18:50:49 -0800 Subject: [PATCH 07/24] Add Arm64 emitter unit tests for "Load/Store Vector" instructions in codegenarm64.cpp --- src/coreclr/src/jit/codegenarm64.cpp | 720 +++++++++++++++++++++++++++ 1 file changed, 720 insertions(+) diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index da26c13433e4a..f227fc364107a 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -5219,6 +5219,726 @@ void CodeGen::genArm64EmitterUnitTests() #endif // ALL_ARM64_EMITTER_UNIT_TESTS +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + 
theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_2regs, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_3regs, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + 
theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1_4regs, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld2, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld2, EA_16BYTE, REG_V18, REG_R20, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld3, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld3, EA_16BYTE, REG_V24, REG_R27, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + 
theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld4, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld4, EA_16BYTE, REG_V30, REG_R2, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_2regs, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V12, REG_R15, 
INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_3regs, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_st1_4regs, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st2, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st2, EA_16BYTE, REG_V18, REG_R20, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st3, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st3, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st3, 
EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st3, EA_16BYTE, REG_V24, REG_R27, INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_st4, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_st4, EA_16BYTE, REG_V30, REG_R2, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V2, REG_R3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V4, REG_R5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V6, REG_R7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V8, REG_R9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V10, REG_R11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld1r, EA_8BYTE, REG_V12, REG_R13, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld1r, EA_16BYTE, REG_V14, REG_R15, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V3, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V6, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V9, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V12, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V15, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld2r, EA_8BYTE, REG_V18, REG_R20, INS_OPTS_1D); + 
theEmitter->emitIns_R_R(INS_ld2r, EA_16BYTE, REG_V21, REG_R23, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V4, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V8, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V12, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V16, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V20, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld3r, EA_8BYTE, REG_V24, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld3r, EA_16BYTE, REG_V28, REG_SP, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP] + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V5, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V10, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V15, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V20, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V25, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_ld4r, EA_8BYTE, REG_V30, REG_R2, INS_OPTS_1D); + theEmitter->emitIns_R_R(INS_ld4r, EA_16BYTE, REG_V3, REG_R7, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + 
theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_2regs, EA_16BYTE, REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_3regs, EA_16BYTE, REG_V2, REG_R5, REG_R6, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + 
theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1_4regs, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld2, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld2, EA_16BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld3, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld3, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld3, 
EA_16BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld4, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld4, EA_16BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V20, 
REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_2regs, EA_16BYTE, REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_3regs, EA_16BYTE, REG_V2, REG_R5, REG_R6, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_st1_4regs, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V4, REG_R6, REG_R7, 
INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st2, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st2, EA_16BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st3, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st3, EA_16BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_st4, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_st4, EA_16BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, REG_R2, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V3, REG_R4, REG_R5, INS_OPTS_16B); + 
theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V6, REG_R7, REG_R8, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V9, REG_R10, REG_R11, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V12, REG_R13, REG_R14, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V15, REG_R16, REG_R17, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_8BYTE, REG_V18, REG_R19, REG_R20, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld1r, EA_16BYTE, REG_V21, REG_R22, REG_R23, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, REG_R3, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V4, REG_R6, REG_R7, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V8, REG_R10, REG_R11, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V12, REG_R14, REG_R15, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V16, REG_R18, REG_R19, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V20, REG_R22, REG_R23, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_8BYTE, REG_V24, REG_R26, REG_R27, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld2r, EA_16BYTE, REG_V28, REG_SP, REG_R30, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, REG_R4, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V5, REG_R8, REG_R9, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V10, REG_R13, REG_R14, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V15, REG_R18, REG_R19, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V20, REG_R23, REG_R24, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V25, REG_R28, REG_R29, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_8BYTE, REG_V30, REG_R0, REG_R1, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld3r, EA_16BYTE, REG_V2, REG_R5, 
REG_R6, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP], Xm + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, REG_R5, INS_OPTS_8B); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V6, REG_R10, REG_R11, INS_OPTS_16B); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V12, REG_R16, REG_R17, INS_OPTS_4H); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V18, REG_R22, REG_R23, INS_OPTS_8H); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V24, REG_R28, REG_R29, INS_OPTS_2S); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V30, REG_R2, REG_R3, INS_OPTS_4S); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_8BYTE, REG_V4, REG_R8, REG_R9, INS_OPTS_1D); + theEmitter->emitIns_R_R_R(INS_ld4r, EA_16BYTE, REG_V10, REG_R14, REG_R15, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V0, REG_R1, 8, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V2, REG_R3, 16, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V4, REG_R5, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V6, REG_R7, 16, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V8, REG_R9, 8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V10, REG_R11, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1, EA_16BYTE, REG_V14, REG_R15, 16, INS_OPTS_2D); + + // ld1 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + 
theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_2regs, EA_16BYTE, REG_V21, REG_R23, 32, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_3regs, EA_16BYTE, REG_V28, REG_SP, 48, INS_OPTS_2D); + + // ld1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1_4regs, EA_16BYTE, REG_V3, REG_R7, 64, INS_OPTS_2D); + + // ld2 {Vt, Vt2}, [Xn|SP], #imm + 
theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld2, EA_16BYTE, REG_V18, REG_R20, 32, INS_OPTS_2D); + + // ld3 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld3, EA_16BYTE, REG_V24, REG_R27, 48, INS_OPTS_2D); + + // ld4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld4, EA_16BYTE, REG_V30, REG_R2, 64, INS_OPTS_2D); + + // st1 {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V0, REG_R1, 8, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V2, 
REG_R3, 16, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V4, REG_R5, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V6, REG_R7, 16, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V8, REG_R9, 8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V10, REG_R11, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1, EA_16BYTE, REG_V14, REG_R15, 16, INS_OPTS_2D); + + // st1 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_2regs, EA_16BYTE, REG_V21, REG_R23, 32, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_3regs, EA_16BYTE, REG_V28, REG_SP, 
48, INS_OPTS_2D); + + // st1 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_st1_4regs, EA_16BYTE, REG_V3, REG_R7, 64, INS_OPTS_2D); + + // st2 {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V0, REG_R2, 16, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V3, REG_R5, 32, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V6, REG_R8, 16, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V9, REG_R11, 32, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V12, REG_R14, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V15, REG_R17, 32, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st2, EA_16BYTE, REG_V18, REG_R20, 32, INS_OPTS_2D); + + // st3 {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V0, REG_R3, 24, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V4, REG_R7, 48, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V8, REG_R11, 24, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V12, REG_R15, 48, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V16, REG_R19, 24, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V20, REG_R23, 48, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st3, EA_16BYTE, REG_V24, REG_R27, 48, 
INS_OPTS_2D); + + // st4 {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V0, REG_R4, 32, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V5, REG_R9, 64, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V10, REG_R14, 32, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V15, REG_R19, 64, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V20, REG_R24, 32, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V25, REG_R29, 64, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_st4, EA_16BYTE, REG_V30, REG_R2, 64, INS_OPTS_2D); + + // ld1r {Vt}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V0, REG_R1, 1, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V2, REG_R3, 1, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V4, REG_R5, 2, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V6, REG_R7, 2, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V8, REG_R9, 4, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V10, REG_R11, 4, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_8BYTE, REG_V12, REG_R13, 8, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld1r, EA_16BYTE, REG_V14, REG_R15, 8, INS_OPTS_2D); + + // ld2r {Vt, Vt2}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V0, REG_R2, 2, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V3, REG_R5, 2, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V6, REG_R8, 4, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V9, REG_R11, 4, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V12, REG_R14, 8, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, REG_V15, REG_R17, 8, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_8BYTE, REG_V18, REG_R20, 16, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld2r, EA_16BYTE, 
REG_V21, REG_R23, 16, INS_OPTS_2D); + + // ld3r {Vt, Vt2, Vt3}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V0, REG_R3, 3, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V4, REG_R7, 3, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V8, REG_R11, 6, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V12, REG_R15, 6, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V16, REG_R19, 12, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V20, REG_R23, 12, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_8BYTE, REG_V24, REG_R27, 24, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld3r, EA_16BYTE, REG_V28, REG_SP, 24, INS_OPTS_2D); + + // ld4r {Vt, Vt2, Vt3, Vt4}, [Xn|SP], #imm + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V0, REG_R4, 4, INS_OPTS_8B); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V5, REG_R9, 4, INS_OPTS_16B); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V10, REG_R14, 8, INS_OPTS_4H); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V15, REG_R19, 8, INS_OPTS_8H); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V20, REG_R24, 16, INS_OPTS_2S); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V25, REG_R29, 16, INS_OPTS_4S); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_8BYTE, REG_V30, REG_R2, 32, INS_OPTS_1D); + theEmitter->emitIns_R_R_I(INS_ld4r, EA_16BYTE, REG_V3, REG_R7, 32, INS_OPTS_2D); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld1, EA_1BYTE, REG_V0, REG_R1, 3); + theEmitter->emitIns_R_R_I(INS_ld1, EA_2BYTE, REG_V2, REG_R3, 2); + theEmitter->emitIns_R_R_I(INS_ld1, EA_4BYTE, REG_V4, REG_R5, 1); + theEmitter->emitIns_R_R_I(INS_ld1, EA_8BYTE, REG_V6, REG_R7, 0); + + // ld2 {Vt, Vt2}[#index], 
[Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, 4); + theEmitter->emitIns_R_R_I(INS_ld2, EA_2BYTE, REG_V3, REG_R5, 3); + theEmitter->emitIns_R_R_I(INS_ld2, EA_4BYTE, REG_V6, REG_R8, 2); + theEmitter->emitIns_R_R_I(INS_ld2, EA_8BYTE, REG_V9, REG_R11, 1); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, 5); + theEmitter->emitIns_R_R_I(INS_ld3, EA_2BYTE, REG_V4, REG_R7, 4); + theEmitter->emitIns_R_R_I(INS_ld3, EA_4BYTE, REG_V8, REG_R11, 3); + theEmitter->emitIns_R_R_I(INS_ld3, EA_8BYTE, REG_V12, REG_R15, 0); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, 6); + theEmitter->emitIns_R_R_I(INS_ld4, EA_2BYTE, REG_V5, REG_R9, 5); + theEmitter->emitIns_R_R_I(INS_ld4, EA_4BYTE, REG_V10, REG_R14, 0); + theEmitter->emitIns_R_R_I(INS_ld4, EA_8BYTE, REG_V15, REG_R19, 1); + + // st1 {Vt}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st1, EA_1BYTE, REG_V0, REG_R1, 7); + theEmitter->emitIns_R_R_I(INS_st1, EA_2BYTE, REG_V2, REG_R3, 6); + theEmitter->emitIns_R_R_I(INS_st1, EA_4BYTE, REG_V4, REG_R5, 1); + theEmitter->emitIns_R_R_I(INS_st1, EA_8BYTE, REG_V6, REG_R7, 0); + + // st2 {Vt, Vt2}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, 8); + theEmitter->emitIns_R_R_I(INS_st2, EA_2BYTE, REG_V3, REG_R5, 7); + theEmitter->emitIns_R_R_I(INS_st2, EA_4BYTE, REG_V6, REG_R8, 2); + theEmitter->emitIns_R_R_I(INS_st2, EA_8BYTE, REG_V9, REG_R11, 1); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, 9); + theEmitter->emitIns_R_R_I(INS_st3, EA_2BYTE, REG_V4, REG_R7, 0); + theEmitter->emitIns_R_R_I(INS_st3, EA_4BYTE, REG_V8, REG_R11, 3); + theEmitter->emitIns_R_R_I(INS_st3, EA_8BYTE, REG_V12, REG_R15, 0); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP] + theEmitter->emitIns_R_R_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, 10); + theEmitter->emitIns_R_R_I(INS_st4, EA_2BYTE, 
REG_V5, REG_R9, 1); + theEmitter->emitIns_R_R_I(INS_st4, EA_4BYTE, REG_V10, REG_R14, 0); + theEmitter->emitIns_R_R_I(INS_st4, EA_8BYTE, REG_V15, REG_R19, 1); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_1BYTE, REG_V0, REG_R1, REG_R2, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_2BYTE, REG_V3, REG_R4, REG_R5, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_4BYTE, REG_V6, REG_R7, REG_R8, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld1, EA_8BYTE, REG_V9, REG_R10, REG_R11, 0, INS_OPTS_POST_INDEX); + + // ld2 {Vt, Vt2}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, REG_R3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_2BYTE, REG_V4, REG_R6, REG_R7, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_4BYTE, REG_V8, REG_R10, REG_R11, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld2, EA_8BYTE, REG_V12, REG_R14, REG_R15, 1, INS_OPTS_POST_INDEX); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, REG_R4, 5, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_2BYTE, REG_V5, REG_R8, REG_R9, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_4BYTE, REG_V10, REG_R13, REG_R14, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld3, EA_8BYTE, REG_V15, REG_R18, REG_R19, 0, INS_OPTS_POST_INDEX); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, REG_R5, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_2BYTE, REG_V6, REG_R10, REG_R11, 5, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_4BYTE, REG_V12, REG_R16, 
REG_R17, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_ld4, EA_8BYTE, REG_V18, REG_R22, REG_R23, 1, INS_OPTS_POST_INDEX); + + // st1 {Vt}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st1, EA_1BYTE, REG_V0, REG_R1, REG_R2, 7, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_2BYTE, REG_V3, REG_R4, REG_R5, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_4BYTE, REG_V6, REG_R7, REG_R8, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st1, EA_8BYTE, REG_V9, REG_R10, REG_R11, 0, INS_OPTS_POST_INDEX); + + // st2 {Vt, Vt2}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, REG_R3, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_2BYTE, REG_V4, REG_R6, REG_R7, 7, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_4BYTE, REG_V8, REG_R10, REG_R11, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st2, EA_8BYTE, REG_V12, REG_R14, REG_R15, 1, INS_OPTS_POST_INDEX); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, REG_R4, 9, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_2BYTE, REG_V5, REG_R8, REG_R9, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_4BYTE, REG_V10, REG_R13, REG_R14, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st3, EA_8BYTE, REG_V15, REG_R18, REG_R19, 0, INS_OPTS_POST_INDEX); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], Xm + theEmitter->emitIns_R_R_R_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, REG_R5, 10, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_2BYTE, REG_V6, REG_R10, REG_R11, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_4BYTE, REG_V12, REG_R16, REG_R17, 0, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_R_I(INS_st4, EA_8BYTE, REG_V18, REG_R22, REG_R23, 1, INS_OPTS_POST_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS + // + // Loads 
to /Stores from one, two, three, or four SIMD&FP registers + // + + genDefineTempLabel(genCreateTempLabel()); + + // ld1 {Vt}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_1BYTE, REG_V0, REG_R1, 3, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_2BYTE, REG_V2, REG_R3, 2, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_4BYTE, REG_V4, REG_R5, 1, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld1, EA_8BYTE, REG_V6, REG_R7, 0, 8, INS_OPTS_POST_INDEX); + + // ld2 {Vt, Vt2}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_1BYTE, REG_V0, REG_R2, 4, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_2BYTE, REG_V3, REG_R5, 3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_4BYTE, REG_V6, REG_R8, 2, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld2, EA_8BYTE, REG_V9, REG_R11, 1, 16, INS_OPTS_POST_INDEX); + + // ld3 {Vt, Vt2, Vt3}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_1BYTE, REG_V0, REG_R3, 5, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_2BYTE, REG_V4, REG_R7, 4, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_4BYTE, REG_V8, REG_R11, 3, 12, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld3, EA_8BYTE, REG_V12, REG_R15, 0, 24, INS_OPTS_POST_INDEX); + + // ld4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_1BYTE, REG_V0, REG_R4, 6, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_2BYTE, REG_V5, REG_R9, 5, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_4BYTE, REG_V10, REG_R14, 0, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_ld4, EA_8BYTE, REG_V15, REG_R19, 1, 32, INS_OPTS_POST_INDEX); + + // st1 {Vt}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st1, EA_1BYTE, REG_V0, REG_R1, 3, 1, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_2BYTE, REG_V2, REG_R3, 2, 2, 
INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_4BYTE, REG_V4, REG_R5, 1, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st1, EA_8BYTE, REG_V6, REG_R7, 0, 8, INS_OPTS_POST_INDEX); + + // st2 {Vt, Vt2}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st2, EA_1BYTE, REG_V0, REG_R2, 4, 2, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_2BYTE, REG_V3, REG_R5, 3, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_4BYTE, REG_V6, REG_R8, 2, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st2, EA_8BYTE, REG_V9, REG_R11, 1, 16, INS_OPTS_POST_INDEX); + + // st3 {Vt, Vt2, Vt3}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st3, EA_1BYTE, REG_V0, REG_R3, 5, 3, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_2BYTE, REG_V4, REG_R7, 4, 6, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_4BYTE, REG_V8, REG_R11, 3, 12, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st3, EA_8BYTE, REG_V12, REG_R15, 0, 24, INS_OPTS_POST_INDEX); + + // st4 {Vt, Vt2, Vt3, Vt4}[#index], [Xn|SP], #imm + theEmitter->emitIns_R_R_I_I(INS_st4, EA_1BYTE, REG_V0, REG_R4, 6, 4, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_2BYTE, REG_V5, REG_R9, 5, 8, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_4BYTE, REG_V10, REG_R14, 0, 16, INS_OPTS_POST_INDEX); + theEmitter->emitIns_R_R_I_I(INS_st4, EA_8BYTE, REG_V15, REG_R19, 1, 32, INS_OPTS_POST_INDEX); + +#endif // ALL_ARM64_EMITTER_UNIT_TESTS + #ifdef ALL_ARM64_EMITTER_UNIT_TESTS // // Compares From b4aa743d608f413b2c459176a8c011335effb516 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 12:14:05 -0700 Subject: [PATCH 08/24] Add emitter::emitDispElemsize in emitarm64.cpp emitarm64.h --- src/coreclr/src/jit/emitarm64.cpp | 51 +++++++++++++++++++------------ src/coreclr/src/jit/emitarm64.h | 1 + 2 files changed, 32 insertions(+), 20 deletions(-) diff --git 
a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 8d11f8c0bd879..c150f2acdc607 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -10810,26 +10810,7 @@ void emitter::emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t i { assert(isVectorRegister(reg)); printf(emitVectorRegName(reg)); - - switch (elemsize) - { - case EA_1BYTE: - printf(".b"); - break; - case EA_2BYTE: - printf(".h"); - break; - case EA_4BYTE: - printf(".s"); - break; - case EA_8BYTE: - printf(".d"); - break; - default: - assert(!"invalid elemsize"); - break; - } - + emitDispElemsize(elemsize); printf("[%d]", index); if (addComma) @@ -10878,6 +10859,36 @@ void emitter::emitDispArrangement(insOpts opt) printf(str); } +//------------------------------------------------------------------------ +// emitDispElemsize: Display a SIMD vector element suffix +// +void emitter::emitDispElemsize(emitAttr elemsize) +{ + const char* str = "???"; + + switch (elemsize) + { + case EA_1BYTE: + str = ".b"; + break; + case EA_2BYTE: + str = ".h"; + break; + case EA_4BYTE: + str = ".s"; + break; + case EA_8BYTE: + str = ".d"; + break; + + default: + assert(!"invalid elemsize"); + break; + } + + printf(str); +} + /***************************************************************************** * * Display a register with an optional shift operation diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index 4bdd715b4b23b..38a47cb87094b 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -40,6 +40,7 @@ void emitDispReg(regNumber reg, emitAttr attr, bool addComma); void emitDispVectorReg(regNumber reg, insOpts opt, bool addComma); void emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t index, bool addComma); void emitDispArrangement(insOpts opt); +void emitDispElemsize(emitAttr elemsize); void emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr); void 
emitDispExtendReg(regNumber reg, insOpts opt, ssize_t imm); void emitDispAddrRI(regNumber reg, insOpts opt, ssize_t imm); From 4e1ce142d8e2d1a9678d55a9667a008b7054085a Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 12:14:58 -0700 Subject: [PATCH 09/24] Update functions' headers in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 35 +++++++++++++------------------ 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index c150f2acdc607..1c9b1e4e70279 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -10775,10 +10775,9 @@ void emitter::emitDispLSExtendOpts(insOpts opt) assert(!"Bad value"); } -/***************************************************************************** - * - * Display a register - */ +//------------------------------------------------------------------------ +// emitDispReg: Display a general-purpose register name or SIMD and floating-point scalar register name +// void emitter::emitDispReg(regNumber reg, emitAttr attr, bool addComma) { emitAttr size = EA_SIZE(attr); @@ -10788,10 +10787,9 @@ void emitter::emitDispReg(regNumber reg, emitAttr attr, bool addComma) printf(", "); } -/***************************************************************************** - * - * Display a vector register with an arrangement suffix - */ +//------------------------------------------------------------------------ +// emitDispVectorReg: Display a SIMD vector register name with an arrangement suffix +// void emitter::emitDispVectorReg(regNumber reg, insOpts opt, bool addComma) { assert(isVectorRegister(reg)); @@ -10802,10 +10800,9 @@ void emitter::emitDispVectorReg(regNumber reg, insOpts opt, bool addComma) printf(", "); } -/***************************************************************************** - * - * Display an vector register index suffix - */ 
+//------------------------------------------------------------------------ +// emitDispVectorRegIndex: Display a SIMD vector register name with element index +// void emitter::emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t index, bool addComma) { assert(isVectorRegister(reg)); @@ -10817,10 +10814,9 @@ void emitter::emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t i printf(", "); } -/***************************************************************************** - * - * Display an arrangement suffix - */ +//------------------------------------------------------------------------ +// emitDispArrangement: Display a SIMD vector arrangement suffix +// void emitter::emitDispArrangement(insOpts opt) { const char* str = "???"; @@ -10889,10 +10885,9 @@ void emitter::emitDispElemsize(emitAttr elemsize) printf(str); } -/***************************************************************************** - * - * Display a register with an optional shift operation - */ +//------------------------------------------------------------------------ +// emitDispShiftedReg: Display a register with an optional shift operation +// void emitter::emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr) { emitAttr size = EA_SIZE(attr); From 9cca19b594acb6945c5b7796dd69c10be888b3b4 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 12:26:22 -0700 Subject: [PATCH 10/24] Add emitDispVectorRegList and emitDispVectorElemList in emitarm64.cpp emitarm64.h --- src/coreclr/src/jit/emitarm64.cpp | 55 +++++++++++++++++++++++++++++++ src/coreclr/src/jit/emitarm64.h | 2 ++ 2 files changed, 57 insertions(+) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 1c9b1e4e70279..f3cba1d3af1de 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -10814,6 +10814,61 @@ void emitter::emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t i printf(", "); } 
+//------------------------------------------------------------------------ +// emitDispVectorRegList: Display a SIMD vector register list +// +void emitter::emitDispVectorRegList(regNumber firstReg, unsigned listSize, insOpts opt, bool addComma) +{ + assert(isVectorRegister(firstReg)); + + regNumber currReg = firstReg; + + printf("{"); + for (unsigned i = 0; i < listSize; i++) + { + const bool notLastRegister = (i != listSize - 1); + emitDispVectorReg(currReg, opt, notLastRegister); + currReg = (currReg == REG_V31) ? REG_V0 : REG_NEXT(currReg); + } + printf("}"); + + if (addComma) + { + printf(", "); + } +} + +//------------------------------------------------------------------------ +// emitDispVectorElemList: Display a SIMD vector element list +// +void emitter::emitDispVectorElemList( + regNumber firstReg, unsigned listSize, emitAttr elemsize, unsigned index, bool addComma) +{ + assert(isVectorRegister(firstReg)); + + regNumber currReg = firstReg; + + printf("{"); + for (unsigned i = 0; i < listSize; i++) + { + printf(emitVectorRegName(currReg)); + emitDispElemsize(elemsize); + const bool notLastRegister = (i != listSize - 1); + if (notLastRegister) + { + printf(", "); + } + currReg = (currReg == REG_V31) ? 
REG_V0 : REG_NEXT(currReg); + } + printf("}"); + printf("[%d]", index); + + if (addComma) + { + printf(", "); + } +} + //------------------------------------------------------------------------ // emitDispArrangement: Display a SIMD vector arrangement suffix // diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index 38a47cb87094b..683aee4b6f344 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -39,6 +39,8 @@ void emitDispLSExtendOpts(insOpts opt); void emitDispReg(regNumber reg, emitAttr attr, bool addComma); void emitDispVectorReg(regNumber reg, insOpts opt, bool addComma); void emitDispVectorRegIndex(regNumber reg, emitAttr elemsize, ssize_t index, bool addComma); +void emitDispVectorRegList(regNumber firstReg, unsigned listSize, insOpts opt, bool addComma); +void emitDispVectorElemList(regNumber firstReg, unsigned listSize, emitAttr elemsize, unsigned index, bool addComma); void emitDispArrangement(insOpts opt); void emitDispElemsize(emitAttr elemsize); void emitDispShiftedReg(regNumber reg, insOpts opt, ssize_t imm, emitAttr attr); From cd88213865d5981fc19b34be9a4163ba6a8e281f Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 12:41:38 -0700 Subject: [PATCH 11/24] Add insGetLoadStoreVectorSelem in emitarm64.cpp emitarm64.h --- src/coreclr/src/jit/emitarm64.cpp | 64 ++++++++++++++++++++++++++----- src/coreclr/src/jit/emitarm64.h | 3 ++ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index f3cba1d3af1de..28fa9404f7c91 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -1261,11 +1261,9 @@ static const char * const bRegNames[] = }; // clang-format on -/***************************************************************************** - * - * Return a string that represents the given register. 
- */ - +//------------------------------------------------------------------------ +// emitRegName: Returns a string that represents a general-purpose register name or SIMD and floating-point scalar register name +// const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) { assert(reg < REG_COUNT); @@ -1301,11 +1299,9 @@ const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) return rn; } -/***************************************************************************** - * - * Return a string that represents the given register. - */ - +//------------------------------------------------------------------------ +// emitVectorRegName: Returns a string that represents a SIMD vector register name +// const char* emitter::emitVectorRegName(regNumber reg) { assert((reg >= REG_V0) && (reg <= REG_V31)); @@ -1314,6 +1310,7 @@ const char* emitter::emitVectorRegName(regNumber reg) return vRegNames[index]; } + #endif // DEBUG /***************************************************************************** @@ -3230,6 +3227,53 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) return false; } +//------------------------------------------------------------------------ +// insGetLoadStoreVectorSelem: Returns a number of structure elements for a Load/Store Vector instruction +// +/*static*/ unsigned emitter::insGetLoadStoreVectorSelem(instruction ins) +{ + unsigned selem = 0; + + switch (ins) + { + case INS_ld1: + case INS_ld1r: + case INS_st1: + selem = 1; + break; + + case INS_ld1_2regs: + case INS_ld2: + case INS_ld2r: + case INS_st1_2regs: + case INS_st2: + selem = 2; + break; + + case INS_ld1_3regs: + case INS_ld3: + case INS_ld3r: + case INS_st1_3regs: + case INS_st3: + selem = 3; + break; + + case INS_ld1_4regs: + case INS_ld4: + case INS_ld4r: + case INS_st1_4regs: + case INS_st4: + selem = 4; + break; + + default: + assert(!"Unexpected instruction"); + break; + } + + return selem; +} + // For the given 'arrangement' returns 
the 'datasize' specified by the vector register arrangement // asserts and returns EA_UNKNOWN if an invalid 'arrangement' value is passed // diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index 683aee4b6f344..fbe31d7b29bd0 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -448,6 +448,9 @@ static emitAttr optGetSrcsize(insOpts conversion); // for an element of size 'elemsize' in a vector register of size 'datasize' static bool isValidVectorIndex(emitAttr datasize, emitAttr elemsize, ssize_t index); +// For a given Load/Store Vector instruction 'ins' returns a number of structure elements +static unsigned insGetLoadStoreVectorSelem(instruction ins); + /************************************************************************/ /* Public inline informational methods */ /************************************************************************/ From 8fe0715c2831351a196a1495ffbc517193fcd6c9 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 12:48:07 -0700 Subject: [PATCH 12/24] Update emitIns_R_R in emitarm64.cpp * Load/Store multiple structures base register * Load single structure and replicate base register --- src/coreclr/src/jit/emitarm64.cpp | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 28fa9404f7c91..8c446f3fb53de 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -4532,15 +4532,37 @@ void emitter::emitIns_R_R( fmt = IF_DV_2P; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + __fallthrough; + case INS_ld1: - { + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + case INS_ld1r: + case INS_ld2r: + 
case INS_ld3r: + case INS_ld4r: assert(isVectorRegister(reg1)); - assert(isIntegerRegister(reg2)); + assert(isGeneralRegisterOrSP(reg2)); assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - fmt = IF_LS_2D; + + // Load/Store multiple structures base register + // Load single structure and replicate base register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_2D; break; - } default: unreached(); From e7ce778b69adc17ac646e94071a3571166f4ef0e Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 18:38:06 -0700 Subject: [PATCH 13/24] Update emitIns_R_R_I in emitarm64.cpp * Load/Store multiple structures post-indexed by an immediate * Load/Store single structure base register --- src/coreclr/src/jit/emitarm64.cpp | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 8c446f3fb53de..9bfbe6ebd663f 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -4705,6 +4705,7 @@ void emitter::emitIns_R_R_I( { bool canEncode; bitMaskImm bmi; + unsigned selem; case INS_mov: // Check for the 'mov' aliases for the vector registers @@ -5101,6 +5102,53 @@ void emitter::emitIns_R_R_I( isLdSt = true; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + __fallthrough; + + case INS_ld1: + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + reg2 = encodingSPtoZR(reg2); + + if (insOptsAnyArrangement(opt)) + { + selem = insGetLoadStoreVectorSelem(ins); + assert(isValidVectorDatasize(size)); + assert(isValidArrangement(size, opt)); + assert((size * selem) == imm); + + // Load/Store multiple structures post-indexed by an immediate 
+ fmt = IF_LS_2E; + } + else + { + assert(insOptsNone(opt)); + assert((ins != INS_ld1_2regs) && (ins != INS_ld1_3regs) && (ins != INS_ld1_4regs) && + (ins != INS_st1_2regs) && (ins != INS_st1_3regs) && (ins != INS_st1_4regs)); + + elemsize = size; + assert(isValidVectorElemsize(elemsize)); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + + // Load/Store single structure base register + fmt = IF_LS_2F; + } + break; + default: unreached(); break; From b49384201b26bbb41fcc937ab5eaad1bee7fd9b1 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 18:39:20 -0700 Subject: [PATCH 14/24] Update emitIns_R_R_I in emitarm64.cpp * Load single structure and replicate post-indexed by an immediate --- src/coreclr/src/jit/emitarm64.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 9bfbe6ebd663f..fafd162d57b51 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -5149,6 +5149,25 @@ void emitter::emitIns_R_R_I( } break; + case INS_ld1r: + case INS_ld2r: + case INS_ld3r: + case INS_ld4r: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + assert(isValidVectorDatasize(size)); + assert(isValidArrangement(size, opt)); + + elemsize = optGetElemsize(opt); + selem = insGetLoadStoreVectorSelem(ins); + assert((elemsize * selem) == imm); + + // Load single structure and replicate post-indexed by an immediate + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_2E; + break; + default: unreached(); break; From ee4415bc932fbd70818025cd11849af9c935e847 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 14:48:13 -0700 Subject: [PATCH 15/24] Update emitIns_R_R_R in emitarm64.cpp * Load/Store multiple structures post-indexed by a register * Load single structure and replicate post-indexed by a register --- src/coreclr/src/jit/emitarm64.cpp | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) 
diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index fafd162d57b51..cebf39dbbb3e7 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -5786,6 +5786,38 @@ void emitter::emitIns_R_R_R( fmt = IF_DV_3F; break; + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st2: + case INS_st3: + case INS_st4: + assert(opt != INS_OPTS_1D); // .1D format only permitted with LD1 & ST1 + __fallthrough; + + case INS_ld1: + case INS_ld1_2regs: + case INS_ld1_3regs: + case INS_ld1_4regs: + case INS_st1: + case INS_st1_2regs: + case INS_st1_3regs: + case INS_st1_4regs: + case INS_ld1r: + case INS_ld2r: + case INS_ld3r: + case INS_ld4r: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + assert(isGeneralRegister(reg3)); + assert(isValidArrangement(size, opt)); + + // Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_3F; + break; + default: unreached(); break; From bdc9df67cd55103bb3eceed468dd3905fef16336 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 14:54:11 -0700 Subject: [PATCH 16/24] Update emitIns_R_R_R_I in emitarm64.cpp * Load/Store single structure post-indexed by a register --- src/coreclr/src/jit/emitarm64.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index cebf39dbbb3e7..09c650bba1239 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -5989,6 +5989,29 @@ void emitter::emitIns_R_R_R_I(instruction ins, isLdSt = true; break; + case INS_ld1: + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st1: + case INS_st2: + case INS_st3: + case INS_st4: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + assert(isGeneralRegister(reg3)); + + assert(insOptsPostIndex(opt)); + + 
elemsize = size; + assert(isValidVectorElemsize(elemsize)); + assert(isValidVectorIndex(EA_16BYTE, elemsize, imm)); + + // Load/Store single structure post-indexed by a register + reg2 = encodingSPtoZR(reg2); + fmt = IF_LS_3G; + break; + default: unreached(); break; From 66103d5f015f96956e3d762c819ecd8fd25b86e2 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 14:57:06 -0700 Subject: [PATCH 17/24] Update emitIns_R_R_I_I in emitarm64.cpp emitarm64.h * Load/Store single structure post-indexed by an immediate --- src/coreclr/src/jit/emitarm64.cpp | 34 ++++++++++++++++++++++++++++++- src/coreclr/src/jit/emitarm64.h | 3 ++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 09c650bba1239..3183ead9c8354 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -6282,7 +6282,8 @@ void emitter::emitIns_R_R_R_Ext(instruction ins, * Add an instruction referencing two registers and two constants. 
*/ -void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2) +void emitter::emitIns_R_R_I_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2, insOpts opt) { emitAttr size = EA_SIZE(attr); emitAttr elemsize = EA_UNKNOWN; @@ -6295,6 +6296,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re int lsb; int width; bitMaskImm bmi; + unsigned selem; case INS_bfm: case INS_sbfm: @@ -6303,6 +6305,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re assert(isGeneralRegister(reg2)); assert(isValidImmShift(imm1, size)); assert(isValidImmShift(imm2, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = imm1; @@ -6320,6 +6323,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re width = imm2 - 1; assert(isValidImmShift(lsb, size)); assert(isValidImmShift(width, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = lsb; @@ -6337,6 +6341,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re width = imm2 + imm1 - 1; assert(isValidImmShift(lsb, size)); assert(isValidImmShift(width, size)); + assert(insOptsNone(opt)); bmi.immNRS = 0; bmi.immN = (size == EA_8BYTE); bmi.immR = imm1; @@ -6353,10 +6358,36 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re assert(isValidVectorElemsize(elemsize)); assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1)); assert(isValidVectorIndex(EA_16BYTE, elemsize, imm2)); + assert(insOptsNone(opt)); immOut = (imm1 << 4) + imm2; fmt = IF_DV_2F; break; + case INS_ld1: + case INS_ld2: + case INS_ld3: + case INS_ld4: + case INS_st1: + case INS_st2: + case INS_st3: + case INS_st4: + assert(isVectorRegister(reg1)); + assert(isGeneralRegisterOrSP(reg2)); + + elemsize = size; + assert(isValidVectorElemsize(elemsize)); + 
assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1)); + + selem = insGetLoadStoreVectorSelem(ins); + assert((elemsize * selem) == (unsigned)imm2); + assert(insOptsPostIndex(opt)); + + // Load/Store single structure post-indexed by an immediate + reg2 = encodingSPtoZR(reg2); + immOut = imm1; + fmt = IF_LS_2G; + break; + default: unreached(); break; @@ -6367,6 +6398,7 @@ void emitter::emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, re id->idIns(ins); id->idInsFmt(fmt); + id->idInsOpt(opt); id->idReg1(reg1); id->idReg2(reg2); diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index fbe31d7b29bd0..f26eb010d3d08 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -743,7 +743,8 @@ void emitIns_R_R_R_Ext(instruction ins, insOpts opt = INS_OPTS_NONE, int shiftAmount = -1); -void emitIns_R_R_I_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2); +void emitIns_R_R_I_I( + instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int imm1, int imm2, insOpts opt = INS_OPTS_NONE); void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4); From a0bfd420538ca0c41c15c7e59e5f5bb78d3c6ec5 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 16:48:45 -0700 Subject: [PATCH 18/24] Update emitDispIns in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 76 ++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 3183ead9c8354..31c3d3be1b38e 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -11429,6 +11429,7 @@ void emitter::emitDispIns( emitAttr dstsize; ssize_t index; ssize_t index2; + unsigned selem; case IF_BI_0A: // BI_0A ......iiiiiiiiii iiiiiiiiiiiiiiii simm26:00 case IF_BI_0B: // BI_0B ......iiiiiiiiii iiiiiiiiiii..... 
simm19:00 @@ -11603,17 +11604,41 @@ void emitter::emitDispIns( emitDispAddrRI(id->idReg2(), id->idInsOpt(), imm); break; - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg2(), id->idInsOpt(), 0); + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + selem = insGetLoadStoreVectorSelem(id->idIns()); + emitDispVectorRegList(id->idReg1(), selem, id->idInsOpt(), true); + + if (fmt == IF_LS_2D) + { + // Load/Store multiple structures base register + // Load single structure and replicate base register + emitDispAddrRI(id->idReg2(), INS_OPTS_NONE, 0); + } + else + { + // Load/Store multiple structures post-indexed by an immediate + // Load single structure and replicate post-indexed by an immediate + emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, id->idSmallCns()); + } break; - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg2(), id->idInsOpt(), 0); + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn + selem = insGetLoadStoreVectorSelem(id->idIns()); + elemsize = id->idOpSize(); + emitDispVectorElemList(id->idReg1(), selem, elemsize, id->idSmallCns(), true); + + if (fmt == IF_LS_2F) + { + // Load/Store single structure base register + emitDispAddrRI(id->idReg2(), INS_OPTS_NONE, 0); + } + else + { + // Load/Store single structure post-indexed by an immediate + emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, (selem * elemsize)); + } break; case IF_LS_3A: // LS_3A .X.......X.mmmmm oooS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} @@ -11662,20 +11687,27 @@ void emitter::emitDispIns( emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispReg(id->idReg2(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); - break; + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm + selem = insGetLoadStoreVectorSelem(id->idIns()); - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - assert(insOptsNone(id->idInsOpt())); - assert(emitGetInsSC(id) == 0); - emitDispReg(id->idReg1(), emitInsTargetRegSize(id), true); - emitDispReg(id->idReg2(), emitInsTargetRegSize(id), true); - emitDispAddrRI(id->idReg3(), id->idInsOpt(), 0); + if (fmt == IF_LS_3F) + { + // Load/Store multiple structures post-indexed by a register + // Load single structure and replicate post-indexed by a register + emitDispVectorRegList(id->idReg1(), selem, id->idInsOpt(), true); + } + else + { + // Load/Store single structure post-indexed by a register + elemsize = id->idOpSize(); + emitDispVectorElemList(id->idReg1(), selem, elemsize, id->idSmallCns(), true); + } + + printf("["); + emitDispReg(encodingZRtoSP(id->idReg2()), EA_8BYTE, false); + 
printf("], "); + emitDispReg(id->idReg3(), EA_8BYTE, false); break; case IF_DI_1A: // DI_1A X.......shiiiiii iiiiiinnnnn..... Rn imm(i12,sh) From e77156b8ded5ffa197443af54b34e5d911cda35b Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 17:17:07 -0700 Subject: [PATCH 19/24] Update emitOutputInstr in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 48 +++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 31c3d3be1b38e..52f3173e27a15 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -9727,27 +9727,28 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn elemsize = optGetElemsize(id->idInsOpt()); code = emitInsCode(ins, fmt); code |= insEncodeVectorsize(id->idOpSize()); // Q - code |= 0x5000; // xxx - We only support the one register variant right now - code |= insEncodeVLSElemsize(elemsize); // ss - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSElemsize(elemsize); // ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - elemsize = optGetElemsize(id->idInsOpt()); - imm = emitGetInsSC(id); + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn + elemsize = id->idOpSize(); + index = id->idSmallCns(); code = emitInsCode(ins, fmt); - code |= insEncodeVLSIndex(elemsize, imm); // Q xx S ss - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSIndex(elemsize, index); // Q xx S ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; @@ -9849,29 +9850,28 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) dst += emitOutput_Instr(dst, code); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm elemsize = optGetElemsize(id->idInsOpt()); code = emitInsCode(ins, fmt); - code |= insEncodeReg_Vt(id->idReg1()); // ttttt - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeVLSElemsize(elemsize); // ss - code |= 0x5000; // xx.x - We only support the one register variant right now - code |= insEncodeReg_Rm(id->idReg3()); // mmmmm code |= insEncodeVectorsize(id->idOpSize()); // Q + code |= insEncodeReg_Rm(id->idReg3()); // mmmmm + code |= insEncodeVLSElemsize(elemsize); // ss + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - elemsize = optGetElemsize(id->idInsOpt()); - imm = emitGetInsSC(id); + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm + elemsize = id->idOpSize(); + index = id->idSmallCns(); code = emitInsCode(ins, fmt); - code |= insEncodeVLSIndex(elemsize, imm); // Q xx S ss - code |= insEncodeReg_Rm(id->idReg3()); // mmmmm - code |= insEncodeReg_Rn(id->idReg2()); // nnnnn - code |= insEncodeReg_Vt(id->idReg1()); // ttttt + code |= insEncodeVLSIndex(elemsize, index); // Q xx S ss + code |= insEncodeReg_Rm(id->idReg3()); // 
mmmmm + code |= insEncodeReg_Rn(id->idReg2()); // nnnnn + code |= insEncodeReg_Vt(id->idReg1()); // ttttt dst += emitOutput_Instr(dst, code); break; From 43aed1d5060391342600bbe64dc9c384022963e2 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 17:38:55 -0700 Subject: [PATCH 20/24] Update emitInsSanityCheck in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 65 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 52f3173e27a15..42225e8cc8510 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -227,22 +227,24 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(insOptsNone(id->idInsOpt()) || insOptsIndexed(id->idInsOpt())); break; - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); - assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(emitGetInsSC(id) == 0); - assert(!id->idIsLclVar()); - break; - - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - elemsize = optGetElemsize(id->idInsOpt()); - assert(isValidVectorIndex(id->idOpSize(), elemsize, emitGetInsSC(id))); + assert(isIntegerRegister(id->idReg2())); // SP + if (insOptsAnyArrangement(id->idInsOpt())) + { + datasize = id->idOpSize(); + assert(isValidVectorDatasize(datasize)); + assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + } + else + { + elemsize = id->idOpSize(); + assert(isValidVectorElemsize(elemsize)); + assert(insOptsNone(id->idInsOpt()) || insOptsPostIndex(id->idInsOpt())); + } assert(!id->idIsLclVar()); break; @@ -304,24 +306,23 @@ void emitter::emitInsSanityCheck(instrDesc* id) assert(insOptsNone(id->idInsOpt())); break; - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); - assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(isIntegerRegister(id->idReg3())); - assert(emitGetInsSC(id) == 0); - assert(!id->idIsLclVar()); - break; - - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm - assert(isValidVectorDatasize(id->idOpSize())); - assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm assert(isVectorRegister(id->idReg1())); - assert(isIntegerRegister(id->idReg2())); - assert(isIntegerRegister(id->idReg3())); - elemsize = optGetElemsize(id->idInsOpt()); - assert(isValidVectorIndex(id->idOpSize(), elemsize, emitGetInsSC(id))); + assert(isIntegerRegister(id->idReg2())); // SP + assert(isGeneralRegister(id->idReg3())); + if (insOptsAnyArrangement(id->idInsOpt())) + { + datasize = id->idOpSize(); + assert(isValidVectorDatasize(datasize)); + assert(isValidArrangement(id->idOpSize(), id->idInsOpt())); + } 
+ else + { + elemsize = id->idOpSize(); + assert(isValidVectorElemsize(elemsize)); + assert(insOptsNone(id->idInsOpt()) || insOptsPostIndex(id->idInsOpt())); + } assert(!id->idIsLclVar()); break; From 683a20d8524255ed5bb482c0d9cd4e86480e064c Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 17:44:16 -0700 Subject: [PATCH 21/24] Update emitInsMayWriteToGCReg in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 42225e8cc8510..e08862d9fd209 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -957,14 +957,16 @@ bool emitter::emitInsMayWriteToGCReg(instrDesc* id) case IF_LS_2A: // LS_2A .X.......X...... ......nnnnnttttt Rt Rn case IF_LS_2B: // LS_2B .X.......Xiiiiii iiiiiinnnnnttttt Rt Rn imm(0-4095) case IF_LS_2C: // LS_2C .X.......X.iiiii iiiiP.nnnnnttttt Rt Rn imm(-256..+255) pre/post inc - case IF_LS_2D: // LS_2D .Q.............. xx.xssnnnnnttttt Vt Rn - case IF_LS_2E: // LS_2E .Q.............. xx.Sssnnnnnttttt Vt[] Rn + case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn + case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn + case IF_LS_2G: // LS_2G .Q.............. ...Sssnnnnnttttt Vt[] Rn case IF_LS_3A: // LS_3A .X.......X.mmmmm xxxS..nnnnnttttt Rt Rn Rm ext(Rm) LSL {} case IF_LS_3B: // LS_3B X............... 
.aaaaannnnnttttt Rt Ra Rn case IF_LS_3C: // LS_3C X.........iiiiii iaaaaannnnnttttt Rt Ra Rn imm(im7,sh) case IF_LS_3D: // LS_3D .X.......X.mmmmm ......nnnnnttttt Wm Rt Rn - case IF_LS_3F: // LS_3F .Q.........mmmmm xx.xssnnnnnttttt Vt Rn Rm - case IF_LS_3G: // LS_3G .Q.........mmmmm xx.Sssnnnnnttttt Vt[] Rn Rm + case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm + case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm // For the Store instructions the "target" register is actually a "source" value From 6a53804d62e26aae08e34ee40791e97d534a75c3 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 18:41:23 -0700 Subject: [PATCH 22/24] Remove ld1 in emitInsTargetRegSize in emitarm64.cpp --- src/coreclr/src/jit/emitarm64.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index e08862d9fd209..c494d451be54d 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -1130,7 +1130,6 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id) case INS_str: case INS_ldur: case INS_stur: - case INS_ld1: result = id->idOpSize(); break; From 1e482c5772347f51d70bbb6ee885a9d8248ee1ed Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Tue, 10 Mar 2020 19:41:11 -0700 Subject: [PATCH 23/24] Update getMemoryOperation and getInsExecutionCharacteristics in emit.h emitarm64.cpp --- src/coreclr/src/jit/emit.h | 2 + src/coreclr/src/jit/emitarm64.cpp | 381 +++++++++++++++++++++++++++++- 2 files changed, 370 insertions(+), 13 deletions(-) diff --git a/src/coreclr/src/jit/emit.h b/src/coreclr/src/jit/emit.h index f40b01d0259c7..dda33e19a36ba 100644 --- a/src/coreclr/src/jit/emit.h +++ b/src/coreclr/src/jit/emit.h @@ -1233,6 +1233,8 @@ class emitter #define PERFSCORE_THROUGHPUT_4C 4.0f // slower - 4 cycles #define PERFSCORE_THROUGHPUT_5C 5.0f // slower - 5 cycles #define PERFSCORE_THROUGHPUT_6C 6.0f // slower - 6 cycles +#define 
PERFSCORE_THROUGHPUT_7C 7.0f // slower - 7 cycles +#define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles #define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles #define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index c494d451be54d..015d46a4f014e 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -12632,6 +12632,8 @@ void emitter::getMemoryOperation(instrDesc* id, unsigned* pMemAccessKind, bool* case IF_LS_2C: case IF_LS_2D: case IF_LS_2E: + case IF_LS_2F: + case IF_LS_2G: case IF_LS_3A: case IF_LS_3F: case IF_LS_3G: @@ -13033,22 +13035,375 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins } break; - case IF_LS_2D: // ld1 (vector - multiple structures) - case IF_LS_2E: // ld1 (vector - single structure) - case IF_LS_3F: // ld1 (vector - multiple structures) - case IF_LS_3G: // ld1 (vector - single structure) - if (id->idOpSize() == EA_8BYTE) + case IF_LS_2D: + case IF_LS_2E: + case IF_LS_3F: + // Load/Store multiple structures + // Load single structure and replicate + switch (ins) { - // D-form - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_3C; + case INS_ld1: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld1_2regs: + case INS_ld2: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = 
PERFSCORE_LATENCY_6C; + } + break; + + case INS_ld1_3regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_8C; + } + break; + + case INS_ld1_4regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_8C; + result.insLatency = PERFSCORE_LATENCY_10C; + } + break; + + case INS_ld3: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + if (optGetElemsize(id->idInsOpt()) == EA_4BYTE) + { + // S + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + if ((optGetElemsize(id->idInsOpt()) == EA_4BYTE) || + (optGetElemsize(id->idInsOpt()) == EA_8BYTE)) + { + // S/D + result.insThroughput = PERFSCORE_THROUGHPUT_6C; + result.insLatency = PERFSCORE_LATENCY_8C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_7C; + result.insLatency = PERFSCORE_LATENCY_9C; + } + } + break; + + case INS_ld4: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + if (optGetElemsize(id->idInsOpt()) == EA_4BYTE) + { + // S + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_5C; + result.insLatency = PERFSCORE_LATENCY_7C; + } + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + if ((optGetElemsize(id->idInsOpt()) == EA_4BYTE) || + (optGetElemsize(id->idInsOpt()) == EA_8BYTE)) + { + // S/D 
+ result.insThroughput = PERFSCORE_THROUGHPUT_8C; + result.insLatency = PERFSCORE_LATENCY_10C; + } + else + { + // B/H + result.insThroughput = PERFSCORE_THROUGHPUT_9C; + result.insLatency = PERFSCORE_LATENCY_11C; + } + } + break; + + case INS_ld1r: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_ld2r: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_ld3r: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld4r: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_st1_2regs: + case INS_st2: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + break; + + case INS_st1_3regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + 
result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_st1_4regs: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // Q-form + assert(id->idOpSize() == EA_16BYTE); + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st3: + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_st4: + if (id->idOpSize() == EA_8BYTE) + { + // D-form + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + else + { + assert(id->idOpSize() == EA_16BYTE); + if (optGetElemsize(id->idInsOpt()) == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_5C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + } + break; + + default: + unreached(); } - else + break; + + case IF_LS_2F: + case IF_LS_2G: + case IF_LS_3G: + // Load/Store single structure + switch (ins) { - // Q-form - assert(id->idOpSize() == EA_16BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency = PERFSCORE_LATENCY_4C; + case INS_ld1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + break; + + case INS_ld2: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_3C; + } + break; + + case INS_ld3: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_3C; + result.insLatency = PERFSCORE_LATENCY_5C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = 
PERFSCORE_LATENCY_4C; + } + break; + + case INS_ld4: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_4C; + result.insLatency = PERFSCORE_LATENCY_6C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_4C; + } + break; + + case INS_st1: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + break; + + case INS_st2: + if (id->idOpSize() == EA_8BYTE) + { + // D + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + } + else + { + // B/H/S + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_1C; + } + break; + + case INS_st3: + case INS_st4: + result.insThroughput = PERFSCORE_THROUGHPUT_2C; + result.insLatency = PERFSCORE_LATENCY_2C; + break; + + default: + unreached(); } break; From c7af7d2e3b26529c3b0812d286fc8215b5654279 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Fri, 13 Mar 2020 11:55:14 -0700 Subject: [PATCH 24/24] Address Tanner's feedback on GitHub. --- src/coreclr/src/jit/emitarm64.cpp | 78 +++++++++++++++++++------------ src/coreclr/src/jit/emitarm64.h | 5 +- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index 015d46a4f014e..e31a9ab43b5bc 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -1264,7 +1264,15 @@ static const char * const bRegNames[] = // clang-format on //------------------------------------------------------------------------ -// emitRegName: Returns a string that a general-purpose register name or SIMD and floating-point scalar register name +// emitRegName: Returns a general-purpose register name or SIMD and floating-point scalar register name. +// +// Arguments: +// reg - A general-purpose register or SIMD and floating-point register. +// size - A register size. 
+// varName - unused parameter. +// +// Return value: +// A string that represents a general-purpose register name or SIMD and floating-point scalar register name. // const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) { @@ -1302,7 +1310,13 @@ const char* emitter::emitRegName(regNumber reg, emitAttr size, bool varName) } //------------------------------------------------------------------------ -// emitVectorRegName: Returns a string that represents a SIMD vector register name +// emitVectorRegName: Returns a SIMD vector register name. +// +// Arguments: +// reg - A SIMD and floating-point register. +// +// Return value: +// A string that represents a SIMD vector register name. // const char* emitter::emitVectorRegName(regNumber reg) { @@ -3230,18 +3244,24 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) } //------------------------------------------------------------------------ -// insGetLoadStoreVectorSelem: Returns a number of structure elements for a Load/Store Vector instruction +// insGetLoadStoreRegisterListSize: Returns a size of the register list a given instruction operates on. // -/*static*/ unsigned emitter::insGetLoadStoreVectorSelem(instruction ins) +// Arguments: +// ins - A Load/Store Vector instruction (e.g. ld1 (2 registers), ld1r, st1). +// +// Return value: +// A number of consecutive SIMD and floating-point registers the instruction loads to/store from. 
+// +/*static*/ unsigned emitter::insGetLoadStoreRegisterListSize(instruction ins) { - unsigned selem = 0; + unsigned registerListSize = 0; switch (ins) { case INS_ld1: case INS_ld1r: case INS_st1: - selem = 1; + registerListSize = 1; break; case INS_ld1_2regs: @@ -3249,7 +3269,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) case INS_ld2r: case INS_st1_2regs: case INS_st2: - selem = 2; + registerListSize = 2; break; case INS_ld1_3regs: @@ -3257,7 +3277,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) case INS_ld3r: case INS_st1_3regs: case INS_st3: - selem = 3; + registerListSize = 3; break; case INS_ld1_4regs: @@ -3265,7 +3285,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) case INS_ld4r: case INS_st1_4regs: case INS_st4: - selem = 4; + registerListSize = 4; break; default: @@ -3273,7 +3293,7 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) break; } - return selem; + return registerListSize; } // For the given 'arrangement' returns the 'datasize' specified by the vector register arrangement @@ -4707,7 +4727,7 @@ void emitter::emitIns_R_R_I( { bool canEncode; bitMaskImm bmi; - unsigned selem; + unsigned registerListSize; case INS_mov: // Check for the 'mov' aliases for the vector registers @@ -5128,10 +5148,10 @@ void emitter::emitIns_R_R_I( if (insOptsAnyArrangement(opt)) { - selem = insGetLoadStoreVectorSelem(ins); + registerListSize = insGetLoadStoreRegisterListSize(ins); assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - assert((size * selem) == imm); + assert((size * registerListSize) == imm); // Load/Store multiple structures post-indexed by an immediate fmt = IF_LS_2E; @@ -5161,9 +5181,9 @@ void emitter::emitIns_R_R_I( assert(isValidVectorDatasize(size)); assert(isValidArrangement(size, opt)); - elemsize = optGetElemsize(opt); - selem = insGetLoadStoreVectorSelem(ins); - assert((elemsize * selem) == imm); + elemsize = 
optGetElemsize(opt); + registerListSize = insGetLoadStoreRegisterListSize(ins); + assert((elemsize * registerListSize) == imm); // Load single structure and replicate post-indexed by an immediate reg2 = encodingSPtoZR(reg2); @@ -6298,7 +6318,7 @@ void emitter::emitIns_R_R_I_I( int lsb; int width; bitMaskImm bmi; - unsigned selem; + unsigned registerListSize; case INS_bfm: case INS_sbfm: @@ -6380,8 +6400,8 @@ void emitter::emitIns_R_R_I_I( assert(isValidVectorElemsize(elemsize)); assert(isValidVectorIndex(EA_16BYTE, elemsize, imm1)); - selem = insGetLoadStoreVectorSelem(ins); - assert((elemsize * selem) == (unsigned)imm2); + registerListSize = insGetLoadStoreRegisterListSize(ins); + assert((elemsize * registerListSize) == (unsigned)imm2); assert(insOptsPostIndex(opt)); // Load/Store single structure post-indexed by an immediate @@ -11431,7 +11451,7 @@ void emitter::emitDispIns( emitAttr dstsize; ssize_t index; ssize_t index2; - unsigned selem; + unsigned registerListSize; case IF_BI_0A: // BI_0A ......iiiiiiiiii iiiiiiiiiiiiiiii simm26:00 case IF_BI_0B: // BI_0B ......iiiiiiiiii iiiiiiiiiii..... simm19:00 @@ -11608,8 +11628,8 @@ void emitter::emitDispIns( case IF_LS_2D: // LS_2D .Q.............. ....ssnnnnnttttt Vt Rn case IF_LS_2E: // LS_2E .Q.............. ....ssnnnnnttttt Vt Rn - selem = insGetLoadStoreVectorSelem(id->idIns()); - emitDispVectorRegList(id->idReg1(), selem, id->idInsOpt(), true); + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); + emitDispVectorRegList(id->idReg1(), registerListSize, id->idInsOpt(), true); if (fmt == IF_LS_2D) { @@ -11627,9 +11647,9 @@ void emitter::emitDispIns( case IF_LS_2F: // LS_2F .Q.............. ...Sssnnnnnttttt Vt[] Rn case IF_LS_2G: // LS_2G .Q.............. 
...Sssnnnnnttttt Vt[] Rn - selem = insGetLoadStoreVectorSelem(id->idIns()); - elemsize = id->idOpSize(); - emitDispVectorElemList(id->idReg1(), selem, elemsize, id->idSmallCns(), true); + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); + elemsize = id->idOpSize(); + emitDispVectorElemList(id->idReg1(), registerListSize, elemsize, id->idSmallCns(), true); if (fmt == IF_LS_2F) { @@ -11639,7 +11659,7 @@ void emitter::emitDispIns( else { // Load/Store single structure post-indexed by an immediate - emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, (selem * elemsize)); + emitDispAddrRI(id->idReg2(), INS_OPTS_POST_INDEX, (registerListSize * elemsize)); } break; @@ -11691,19 +11711,19 @@ void emitter::emitDispIns( case IF_LS_3F: // LS_3F .Q.........mmmmm ....ssnnnnnttttt Vt Rn Rm case IF_LS_3G: // LS_3G .Q.........mmmmm ...Sssnnnnnttttt Vt[] Rn Rm - selem = insGetLoadStoreVectorSelem(id->idIns()); + registerListSize = insGetLoadStoreRegisterListSize(id->idIns()); if (fmt == IF_LS_3F) { // Load/Store multiple structures post-indexed by a register // Load single structure and replicate post-indexed by a register - emitDispVectorRegList(id->idReg1(), selem, id->idInsOpt(), true); + emitDispVectorRegList(id->idReg1(), registerListSize, id->idInsOpt(), true); } else { // Load/Store single structure post-indexed by a register elemsize = id->idOpSize(); - emitDispVectorElemList(id->idReg1(), selem, elemsize, id->idSmallCns(), true); + emitDispVectorElemList(id->idReg1(), registerListSize, elemsize, id->idSmallCns(), true); } printf("["); diff --git a/src/coreclr/src/jit/emitarm64.h b/src/coreclr/src/jit/emitarm64.h index f26eb010d3d08..f6286a6062360 100644 --- a/src/coreclr/src/jit/emitarm64.h +++ b/src/coreclr/src/jit/emitarm64.h @@ -448,8 +448,9 @@ static emitAttr optGetSrcsize(insOpts conversion); // for an element of size 'elemsize' in a vector register of size 'datasize' static bool isValidVectorIndex(emitAttr datasize, emitAttr elemsize, ssize_t index); 
-// For a given Load/Store Vector instruction 'ins' returns a number of structure elements -static unsigned insGetLoadStoreVectorSelem(instruction ins); +// For a given Load/Store Vector instruction 'ins' returns a number of consecutive SIMD registers +// the instruction loads to/store from. +static unsigned insGetLoadStoreRegisterListSize(instruction ins); /************************************************************************/ /* Public inline informational methods */