Skip to content

ARM64-SVE: Use non predicated instruction when mask is all true #114431

@a74nh

Description

@a74nh

Consider:

// Repro 1: the select mask is produced by Sve.CreateTrueMaskInt32().
// The selected-when-true operand is Sve.Add(op1, op2); the fallback operand is op1.
static void truecndselect1(Vector<int> op1, Vector<int> op2) {
  var result1 = Sve.ConditionalSelect(Sve.CreateTrueMaskInt32(), Sve.Add(op1, op2), op1);
  Consume(result1); // hand the result to Consume (defined outside this snippet)
}

;; JIT disassembly for truecndselect1: no predicate register is set up and the add is unpredicated.
G_M21589_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!      // save fp/lr, pre-decrementing sp by 16
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M21589_IG02:  ;; offset=0x0008
            add     z0.s, z0.s, z1.s      // unpredicated SVE add on 32-bit lanes: z0 = z0 + z1
            movz    x0, #0x72B8      // code for CSharpTutorials.Program:Consume[System.Numerics.Vector`1[int]](System.Numerics.Vector`1[int])
            movk    x0, #0x2218 LSL #16
            movk    x0, #0xE088 LSL #32
            ldr     x0, [x0]      // x0 = value stored at the address assembled by movz/movk
            blr     x0      // indirect call through x0
						;; size=24 bbWeight=1 PerfScore 7.50
G_M21589_IG03:  ;; offset=0x0020
            ldp     fp, lr, [sp], #0x10      // restore fp/lr, post-incrementing sp by 16
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00



// Repro 2: same select as truecndselect1, but the mask operand is the constant Vector<int>.AllBitsSet
// instead of Sve.CreateTrueMaskInt32().
static void truecndselect2(Vector<int> op1, Vector<int> op2) {
  var result2 = Sve.ConditionalSelect(Vector<int>.AllBitsSet, Sve.Add(op1, op2), op1);
  Consume(result2); // hand the result to Consume (defined outside this snippet)
}

;; JIT disassembly for truecndselect2: the all-bits-set vector is converted to a predicate at run time
;; and the add is emitted in its predicated (merging) form.
G_M25078_IG01:  ;; offset=0x0000
            stp     fp, lr, [sp, #-0x10]!      // save fp/lr, pre-decrementing sp by 16
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M25078_IG02:  ;; offset=0x0008
            ptrue   p0.s      // p0 = all-true predicate for 32-bit lanes
            mvni    v16.4s, #0      // every 32-bit lane of v16 = ~0 (all bits set)
            cmpne   p0.s, p0/z, z16.s, #0      // p0 = lanes where z16 != 0, evaluated under the all-true p0
            add     z0.s, p0/m, z0.s, z1.s      // predicated add: active lanes z0 = z0 + z1, inactive lanes keep z0
            movz    x0, #0x72B8      // code for CSharpTutorials.Program:Consume[System.Numerics.Vector`1[int]](System.Numerics.Vector`1[int])
            movk    x0, #0x2218 LSL #16
            movk    x0, #0xE088 LSL #32
            ldr     x0, [x0]      // x0 = value stored at the address assembled by movz/movk
            blr     x0      // indirect call through x0
						;; size=36 bbWeight=1 PerfScore 12.00
G_M25078_IG03:  ;; offset=0x002C
            ldp     fp, lr, [sp], #0x10      // restore fp/lr, post-incrementing sp by 16
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

For both of these, a non-predicated ADD can be used, optimising away the mask. The first case already does this; the second should compile to the same code:

            stp     fp, lr, [sp, #-0x10]!      // save fp/lr, pre-decrementing sp by 16
            mov     fp, sp
            add     z0.s, z0.s, z1.s      // desired form: unpredicated add, no ptrue/mvni/cmpne before it
            movz    x0, #0x72B8      // code for CSharpTutorials.Program:Consume[System.Numerics.Vector`1[int]](System.Numerics.Vector`1[int])
            movk    x0, #0x2218 LSL #16
            movk    x0, #0xE088 LSL #32
            ldr     x0, [x0]      // x0 = value stored at the address assembled by movz/movk
            blr     x0      // indirect call through x0
            ldp     fp, lr, [sp], #0x10      // restore fp/lr, post-incrementing sp by 16
            ret     lr

This should be possible for all HW_Flag_OptionalEmbeddedMaskedOperation instructions: ADD, AND, BIC, ORR, SUB, and EOR.

Metadata

Metadata

Assignees

Labels

- area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
- arm-sve: Work related to arm64 SVE/SVE2 support
- in-pr: There is an active PR which will close this issue when it is merged

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions