Skip to content

[JIT] Unable to elide bound checks from Creating/Copying vectors from/to spans #121987

@En3Tho

Description

@En3Tho

A few examples:

/// <summary>
/// Adds the first eight elements of <paramref name="values"/> to the following eight,
/// element-wise, and writes the eight sums to the start of <paramref name="output"/>.
/// </summary>
/// <param name="values">Input span; must hold at least 16 elements for any work to happen.</param>
/// <param name="output">Destination span; must hold at least 8 elements for any work to happen.</param>
/// <remarks>
/// Returns without writing anything when either span is too short.
/// </remarks>
public static void Sum(Span<int> values, Span<int> output)
    {
        // The literals assume AVX2-sized vectors: 8 ints per Vector256<int>, two vectors of input.
        if (values.Length >= 16 && output.Length >= 8)
        {
            Vector256<int> lower = Vector256.Create(values);
            Vector256<int> upper = Vector256.Create(values[8..]);
            Vector256<int> total = lower + upper;

            total.CopyTo(output);
        }
    }

    /// <summary>
    /// Adds the first <c>Vector256&lt;int&gt;.Count</c> elements of <paramref name="values"/> to the
    /// next <c>Vector256&lt;int&gt;.Count</c> elements and writes the sums to <paramref name="output"/>.
    /// </summary>
    /// <param name="values">Input span; needs at least two vectors' worth of elements.</param>
    /// <param name="output">Destination span; needs at least one vector's worth of elements.</param>
    /// <remarks>
    /// Returns without writing anything when either span is too short.
    /// </remarks>
    public static void Sum2(Span<int> values, Span<int> output)
    {
        bool inputTooShort = values.Length < Vector256<int>.Count * 2;
        bool outputTooShort = output.Length < Vector256<int>.Count;
        if (inputTooShort || outputTooShort)
        {
            return;
        }

        // Vector256<int>.Count is 8, so this is the same slice as values[8..].
        Vector256<int> first = Vector256.Create(values);
        Vector256<int> second = Vector256.Create(values[Vector256<int>.Count..]);

        Vector256<int> sum = first + second;
        sum.CopyTo(output);
    }

    /// <summary>
    /// Adds the first <c>Vector&lt;int&gt;.Count</c> elements of <paramref name="values"/> to the
    /// next <c>Vector&lt;int&gt;.Count</c> elements and writes the sums to <paramref name="output"/>.
    /// </summary>
    /// <param name="values">Input span; needs at least <c>Vector&lt;int&gt;.Count * 2</c> elements.</param>
    /// <param name="output">Destination span; needs at least <c>Vector&lt;int&gt;.Count</c> elements.</param>
    /// <remarks>
    /// Returns without writing anything when either span is too short.
    /// </remarks>
    public static void Sum3(Span<int> values, Span<int> output)
    {
        if (values.Length < Vector<int>.Count * 2 || output.Length < Vector<int>.Count)
        {
            return;
        }

        var v1 = Vector.Create(values);

        // Slice by the vector width rather than a literal 8: the guard above is expressed in
        // Vector<int>.Count, so the second load must start exactly one vector in to stay inside
        // the validated range on hardware where Vector<int>.Count is not 8.
        var v2 = Vector.Create(values[Vector<int>.Count..]);

        (v1 + v2).CopyTo(output);
    }

All of these produce the same assembly on my machine (Ryzen 5950X, AVX2):

; Actual JIT output: after the two length guards (IG02/IG03) there are two further
; range checks that branch to the throw helpers in IG05 and IG06.
G_M000_IG01:                ;; offset=0x0000
       ; Reserve 40 bytes of stack.
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       ; Length guard: 32-bit field at [rcx+8] (presumably values.Length) must be >= 16.
       mov      eax, dword ptr [rcx+0x08]
       cmp      eax, 16
       jl       SHORT G_M000_IG04
 
G_M000_IG03:                ;; offset=0x000C
       ; Length guard: 32-bit field at [rdx+8] (presumably output.Length) must be >= 8.
       mov      r8d, dword ptr [rdx+0x08]
       cmp      r8d, 8
       jl       SHORT G_M000_IG04
       ; Load the pointer stored at [rcx] and read the first 32 bytes into ymm0.
       mov      rcx, bword ptr [rcx]
       mov      r8, rcx
       vmovups  ymm0, ymmword ptr [r8]
       ; Range check #1: r8d = length - 8, r10 = r8d + 8 (64-bit), throw via IG05 if r10 > length.
       ; Looks like the slice check for values[8..]; the guard above already implies it passes.
       lea      r8d, [rax-0x08]
       mov      r10d, r8d
       add      r10, 8
       mov      eax, eax
       cmp      r10, rax
       ja       SHORT G_M000_IG05
       ; Advance the pointer by 32 bytes, then range check #2: throw via IG06 if length - 8 < 8.
       add      rcx, 32
       cmp      r8d, 8
       jl       SHORT G_M000_IG06
       ; Add the next 32 bytes straight from memory and store the result through the pointer at [rdx].
       vpaddd   ymm0, ymm0, ymmword ptr [rcx]
       mov      rcx, bword ptr [rdx]
       vmovups  ymmword ptr [rcx], ymm0
 
G_M000_IG04:                ;; offset=0x0048
       ; Common exit; also the target of both length guards.
       vzeroupper 
       add      rsp, 40
       ret      
 
G_M000_IG05:                ;; offset=0x0050
       ; Throw path for range check #1.
       call     [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
       int3     
 
G_M000_IG06:                ;; offset=0x0057
       ; Throw path for range check #2; passes 6 in ecx to the throw helper.
       mov      ecx, 6
       call     [System.ThrowHelper:ThrowArgumentOutOfRangeException(int)]
       int3

Ideally, those bounds checks would be elided and the codegen would be closer to the following:

; Desired JIT output: only the two length guards remain; there are no further
; range checks, no throw-helper blocks, and no stack adjustment.
G_M000_IG01:                ;; offset=0x0000
 
G_M000_IG02:                ;; offset=0x0000
       ; Load the pointer at [rcx] and the 32-bit field at [rcx+8] (presumably values.Length);
       ; bail out if the latter is < 16.
       mov      rax, bword ptr [rcx]
       mov      ecx, dword ptr [rcx+0x08]
       cmp      ecx, 16
       jl       SHORT G_M000_IG04
 
G_M000_IG03:                ;; offset=0x000B
       ; Length guard on the 32-bit field at [rdx+8] (presumably output.Length): must be >= 8.
       cmp      dword ptr [rdx+0x08], 8
       jl       SHORT G_M000_IG04
       ; Two 32-byte loads at offsets 0 and 0x20, with no checks in between.
       vmovups  ymm0, ymmword ptr [rax]
       vmovups  ymm1, ymmword ptr [rax+0x20]
       ; Add the two vectors and store the result through the pointer at [rdx].
       mov      rax, bword ptr [rdx]
       vpaddd   ymm0, ymm0, ymm1
       vmovups  ymmword ptr [rax], ymm0
 
G_M000_IG04:                ;; offset=0x0025
       ; Common exit; also the target of both length guards.
       vzeroupper 
       ret

Metadata

Metadata

Assignees

Labels

area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI), tenet-performance (Performance related issue)

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions