- 
                Notifications
    You must be signed in to change notification settings 
- Fork 5.2k
Unify unroll limits in a single entry point #83274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak Issue Detailsnull 
 | 
| Fixes #82529? | 
| 
 Ah, didn't see this one. Yeah, it does. It zeroes:
        xor      eax, eax
       vxorps   ymm0, ymm0
       vmovdqu  ymmword ptr[rdx], ymm0
       vmovdqu  ymmword ptr[rdx+20H], ymm0
       vmovdqu  ymmword ptr[rdx+40H], ymm0
       vmovdqu  ymmword ptr[rdx+60H], ymm0
       vmovdqu  ymmword ptr[rdx+80H], ymm0
       mov      qword ptr [rdx+A0H], rax
but only with AVX or on arm64 | 
| Benchmarks:
Memset
public unsafe class MemsetBenchmarks
{
    // Shared 1024-byte buffer that every benchmark below writes into.
    private static readonly byte[] Data1 = new byte[1024];
    // Each benchmark fills the first N bytes of Data1 with the value 0 via
    // Unsafe.InitBlockUnaligned, where N is a compile-time constant matching the
    // method name. Sizes range from 8 to 512 bytes (always within the buffer) and
    // include values at and just past powers of two (16/17, 32/33, 64/65, 128/129,
    // 256/257) -- presumably to probe the JIT's unroll thresholds; confirm against
    // the PR discussion.
    [Benchmark] public void Memset8() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 8);
    [Benchmark] public void Memset10() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 10);
    [Benchmark] public void Memset14() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 14);
    [Benchmark] public void Memset16() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 16);
    [Benchmark] public void Memset17() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 17);
    [Benchmark] public void Memset20() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 20);
    [Benchmark] public void Memset32() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 32);
    [Benchmark] public void Memset33() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 33);
    [Benchmark] public void Memset40() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 40);
    [Benchmark] public void Memset50() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 50);
    [Benchmark] public void Memset64() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 64);
    [Benchmark] public void Memset65() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 65);
    [Benchmark] public void Memset80() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 80);
    [Benchmark] public void Memset90() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 90);
    [Benchmark] public void Memset110() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 110);
    [Benchmark] public void Memset128() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 128);
    [Benchmark] public void Memset129() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 129);
    [Benchmark] public void Memset200() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 200);
    [Benchmark] public void Memset256() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 256);
    [Benchmark] public void Memset257() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 257);
    [Benchmark] public void Memset300() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 300);
    [Benchmark] public void Memset400() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 400);
    [Benchmark] public void Memset512() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 512);
}
Memcpy
public unsafe class MemcpyBenchmarks
{
    // Two separate 1024-byte buffers used as the operands of every copy below.
    private static readonly byte[] Data1 = new byte[1024];
    private static readonly byte[] Data2 = new byte[1024];
    // Each benchmark calls Unsafe.CopyBlockUnaligned with a compile-time constant
    // byte count N matching the method name. Data1[0] is passed as the first ref
    // argument and Data2[0] as the second; per the Unsafe.CopyBlockUnaligned
    // signature (destination, source, byteCount) that copies N bytes from Data2
    // into Data1. Sizes range from 8 to 512 bytes (always within the buffers) and
    // include values at and just past powers of two -- presumably to probe the
    // JIT's unroll thresholds; confirm against the PR discussion.
    [Benchmark] public void Memcpy8() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 8);
    [Benchmark] public void Memcpy10() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 10);
    [Benchmark] public void Memcpy14() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 14);
    [Benchmark] public void Memcpy16() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 16);
    [Benchmark] public void Memcpy17() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 17);
    [Benchmark] public void Memcpy20() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 20);
    [Benchmark] public void Memcpy32() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 32);
    [Benchmark] public void Memcpy33() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 33);
    [Benchmark] public void Memcpy40() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 40);
    [Benchmark] public void Memcpy50() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 50);
    [Benchmark] public void Memcpy64() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 64);
    [Benchmark] public void Memcpy65() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 65);
    [Benchmark] public void Memcpy80() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 80);
    [Benchmark] public void Memcpy90() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 90);
    [Benchmark] public void Memcpy110() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 110);
    [Benchmark] public void Memcpy128() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 128);
    [Benchmark] public void Memcpy129() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 129);
    [Benchmark] public void Memcpy200() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 200);
    [Benchmark] public void Memcpy256() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 256);
    [Benchmark] public void Memcpy257() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 257);
    [Benchmark] public void Memcpy300() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 300);
    [Benchmark] public void Memcpy400() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 400);
    [Benchmark] public void Memcpy512() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 512);
}
Verified on: Core i7 8700K and Core i9 9980HK; planning to test on Ryzen 7950X | 
| @dotnet/jit-contrib PTAL Visible things this PR fixes: 
 Diffs are not too big outside of coreclr_tests collection - around +2k-3k for libraries.pmi: diffs To improve some of them I filed: 
 A typical size regression looks like this:
        mov      qword ptr [rbp-C8H], rdx
        mov      rdx, bword ptr [rbp+18H]
        ; byrRegs +[rdx]
-       lea      rcx, bword ptr [rbp-B8H]
-       ; byrRegs +[rcx]
-       mov      r8d, 80
-       call     CORINFO_HELP_MEMCPY
-       ; byrRegs -[rcx rdx]
+       vmovdqu  ymm0, ymmword ptr[rdx]
+       vmovdqu  ymmword ptr[rbp-B8H], ymm0
+       vmovdqu  ymm0, ymmword ptr[rdx+20H]
+       vmovdqu  ymmword ptr[rbp-98H], ymm0
+       vmovdqu  xmm0, xmmword ptr [rdx+40H]
+       vmovdqu  xmmword ptr [rbp-78H], xmm0
        mov      rdx, qword ptr [rbp-C0H]
+       ; byrRegs -[rdx]
        mov      r8, qword ptr [rbp-C8H]
        lea      r9, [rbp-B8H]
        lea      rcx, [rbp-40H]
which is 2x faster on all machines I tested. There are several cases where unrolling produces more compact code than […]. Diffs are mostly negative for ARM64, e.g.: | 
| @tannergooding @dotnet/jit-contrib PTAL | 




The current limits were a bit odd, e.g. a hard limit of 128 bytes on x64 regardless of whether the machine supports AVX (which needs 2x fewer instructions).
Also, the new limits broadly match what native compilers do when unrolling memset/memcpy at -Os (size-aware): https://godbolt.org/z/dW1qqaP9a
Closes #82529