-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Convert all user calls to indirect early on arm64 #89213
Conversation
Ugh, I guess I should not CSE/Hoist |
[Benchmark]
public void NonVirtual()
{
var x = Y;
for (var i = 0; i < 100000000; i++)
{
x.Dispose(); // non-virtual non-inlineable call
x.Dispose();
}
} Main: ; Assembly listing for method BasicBenchmark.Program+BasicBenchmark:NonVirtual():this (Tier1)
stp fp, lr, [sp, #-0x30]!
stp x19, x20, [sp, #0x18]
str x21, [sp, #0x28]
mov fp, sp
movz x0, #0x81A8
movk x0, #0x5800 LSL #16
movk x0, #1 LSL #32
ldr x19, [x0]
mov w20, wzr
movz w21, #0xE100
movk w21, #0x5F5 LSL #16
G_M000_IG03:
mov x0, x19
movz x1, #0x3B48
movk x1, #0x668 LSL #16
movk x1, #1 LSL #32
ldr x1, [x1]
ldr wzr, [x0]
blr x1
mov x0, x19
movz x1, #0x3B48
movk x1, #0x668 LSL #16
movk x1, #1 LSL #32
ldr x1, [x1]
blr x1
add w20, w20, #1
cmp w20, w21
blt G_M000_IG03
ldr x21, [sp, #0x28]
ldp x19, x20, [sp, #0x18]
ldp fp, lr, [sp], #0x30
ret lr
; Total bytes of code 124 PR: ; Assembly listing for method BasicBenchmark.Program+BasicBenchmark:NonVirtual():this (Tier1-OSR)
stp fp, lr, [sp, #-0x30]!
stp x19, x20, [sp, #0x10]
stp x21, x22, [sp, #0x20]
mov fp, sp
ldr x20, [fp, #0x50]
ldr w19, [fp, #0x4C]
movz w21, #0xE100
movk w21, #0x5F5 LSL #16
cmp w19, w21
bge G_M000_IG05
movz x22, #0x3918
movk x22, #0x4D4 LSL #16
movk x22, #1 LSL #32
G_M000_IG04:
mov x0, x20
ldr x1, [x22]
ldr wzr, [x0]
blr x1
mov x0, x20
ldr x1, [x22]
blr x1
add w19, w19, #1
cmp w19, w21
blt G_M000_IG04
G_M000_IG05:
ldp x21, x22, [sp, #0x20]
ldp x19, x20, [sp, #0x10]
ldp fp, lr, [sp], #0x30
add sp, sp, #48
ret lr
; Total bytes of code 112 Codegen diff: https://www.diffchecker.com/dSO0krs3/ It does look like my PR is supposed to make it faster since we hoist 2 constants from the loop body - but the benchmark says it's not 😢:
Apparently, CPU is more comforable with calls where target is populated right before the call? Anyway, so far I wasn't able to see perf wins from this... |
@a74nh - could you please check on your end why we don't see the expected wins here? |
Closes #88807
Codegen diff (on arm64): https://www.diffchecker.com/vHceD9dT/
(target address of
Console.WriteLine
is not CSE'd and hoisted from the loop)Seeing some nice diffs locally on arm64 SPMI:
Same PR but limitted only to loops (blocks with "backward-branch" flag) to mitigate TP regressions: