Skip to content

Use splat parameter to put Tuples in large Array constants #15495

Merged
straight-shoota merged 1 commit into crystal-lang:master from
HertzDevil:refactor/large-array-put-splat
Feb 22, 2025
Merged

Use splat parameter to put Tuples in large Array constants #15495
straight-shoota merged 1 commit into crystal-lang:master from
HertzDevil:refactor/large-array-put-splat

Conversation

@HertzDevil
Copy link
Contributor

We use a special put function instead of an array literal for populating large Array constants (see #4516), since this eliminates the individual allocas for each array element. However, this only holds for simple literals, and tuple literals are not among them. The following snippet:

# Appends `values` to `array` as a single element.
#
# `values` is an ordinary parameter here, so a caller that passes a tuple
# literal has to build that tuple at the call site before the call is made.
def put(array : Array, values) : Nil
  array << values
end

N = 2

# Builds an array of `{Int32, Int32, Int32}` tuples and adds N entries to it
# through `put`.
def foo
  data = Array({Int32, Int32, Int32}).new(N)
  # The macro loop is expanded at compile time into N consecutive `put`
  # calls, each of which passes a tuple literal.
  {% for _ in 0...N %}
    put(data, {1, 2, 3})
  {% end %}
end

foo

produces:

LLVM IR
define internal void @"*foo:Nil"() #0 {
alloca:
  %data = alloca ptr, align 8
  %0 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  %1 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  %2 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)::new<Int32>:Array(Tuple(Int32, Int32, Int32))"(i32 844, i32 2)
  store ptr %2, ptr %data, align 8
  %3 = load ptr, ptr %data, align 8
  %4 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 0
  store i32 1, ptr %4, align 4
  %5 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 1
  store i32 2, ptr %5, align 4
  %6 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 2
  store i32 3, ptr %6, align 4
  %7 = load %"Tuple(Int32, Int32, Int32)", ptr %0, align 4
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %3, %"Tuple(Int32, Int32, Int32)" %7)
  %8 = load ptr, ptr %data, align 8
  %9 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 0
  store i32 1, ptr %9, align 4
  %10 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 1
  store i32 2, ptr %10, align 4
  %11 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 2
  store i32 3, ptr %11, align 4
  %12 = load %"Tuple(Int32, Int32, Int32)", ptr %1, align 4
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %8, %"Tuple(Int32, Int32, Int32)" %12)
  ret void
}

; Function Attrs: uwtable
define internal void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %array, %"Tuple(Int32, Int32, Int32)" %values) #0 {
alloca:
  %values1 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  store %"Tuple(Int32, Int32, Int32)" %values, ptr %values1, align 4
  %0 = load %"Tuple(Int32, Int32, Int32)", ptr %values1, align 4
  %1 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)#<<<Tuple(Int32, Int32, Int32)>:Array(Tuple(Int32, Int32, Int32))"(ptr %array, %"Tuple(Int32, Int32, Int32)" %0)
  ret void
}

whereas this:

# Appends the trailing arguments to `array` as a single tuple element.
#
# The splat parameter gathers those arguments into `values`, so the tuple is
# assembled inside this method rather than at every call site.
def put(array : Array, *values) : Nil
  array << values
end

N = 2

# Builds an array of `{Int32, Int32, Int32}` tuples and adds N entries to it
# through `put`.
def foo
  data = Array({Int32, Int32, Int32}).new(N)
  # The macro loop is expanded at compile time into N consecutive `put`
  # calls, each of which passes the three values as separate arguments.
  {% for _ in 0...N %}
    put(data, 1, 2, 3)
  {% end %}
end

foo

produces:

LLVM IR
; Function Attrs: uwtable
define internal void @"*foo:Nil"() #0 {
alloca:
  %data = alloca ptr, align 8
  br label %entry

entry:                                            ; preds = %alloca
  %0 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)::new<Int32>:Array(Tuple(Int32, Int32, Int32))"(i32 844, i32 2)
  store ptr %0, ptr %data, align 8
  %1 = load ptr, ptr %data, align 8
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %1, i32 1, i32 2, i32 3)
  %2 = load ptr, ptr %data, align 8
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %2, i32 1, i32 2, i32 3)
  ret void
}

; Function Attrs: uwtable
define internal void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %array, i32 %__temp_1436, i32 %__temp_1437, i32 %__temp_1438) #0 {
alloca:
  %values = alloca %"Tuple(Int32, Int32, Int32)", align 8
  %0 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  %1 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 0
  store i32 %__temp_1436, ptr %1, align 4
  %2 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 1
  store i32 %__temp_1437, ptr %2, align 4
  %3 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 2
  store i32 %__temp_1438, ptr %3, align 4
  %4 = load %"Tuple(Int32, Int32, Int32)", ptr %0, align 4
  store %"Tuple(Int32, Int32, Int32)" %4, ptr %values, align 4
  %5 = load %"Tuple(Int32, Int32, Int32)", ptr %values, align 4
  %6 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)#<<<Tuple(Int32, Int32, Int32)>:Array(Tuple(Int32, Int32, Int32))"(ptr %array, %"Tuple(Int32, Int32, Int32)" %5)
  ret void
}

The alloca and getelementptr instructions are moved into the put function. If we set N = 10000 instead, this subtle change could reduce the bytecode generation phase's time for this snippet by as much as 0.7s on my Windows machine.

Copy link
Member

@straight-shoota straight-shoota left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is quite intriguing because one would expect that both variants essentially do the same thing and there's not much room for semantic difference. But apparently there is...

@straight-shoota straight-shoota added this to the 1.16.0 milestone Feb 21, 2025
@straight-shoota straight-shoota merged commit 2998ccf into crystal-lang:master Feb 22, 2025
34 checks passed
@HertzDevil HertzDevil deleted the refactor/large-array-put-splat branch February 22, 2025 13:07
kojix2 pushed a commit to kojix2/crystal that referenced this pull request Feb 23, 2025
…al-lang#15495)

We use a special `put` function instead of an array literal for populating large `Array` constants (see crystal-lang#4516), since this eliminates the individual `alloca`s for each array element. However, this only holds for simple literals, and tuple literals are not among them. The following snippet:

```crystal
# Appends `values` to `array` as a single element.
#
# `values` is an ordinary parameter here, so a caller that passes a tuple
# literal has to build that tuple at the call site before the call is made.
def put(array : Array, values) : Nil
  array << values
end

N = 2

# Builds an array of `{Int32, Int32, Int32}` tuples and adds N entries to it
# through `put`.
def foo
  data = Array({Int32, Int32, Int32}).new(N)
  # The macro loop is expanded at compile time into N consecutive `put`
  # calls, each of which passes a tuple literal.
  {% for _ in 0...N %}
    put(data, {1, 2, 3})
  {% end %}
end

foo
```

produces:

```llvm
define internal void @"*foo:Nil"() #0 {
alloca:
  %data = alloca ptr, align 8
  %0 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  %1 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  %2 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)::new<Int32>:Array(Tuple(Int32, Int32, Int32))"(i32 844, i32 2)
  store ptr %2, ptr %data, align 8
  %3 = load ptr, ptr %data, align 8
  %4 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 0
  store i32 1, ptr %4, align 4
  %5 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 1
  store i32 2, ptr %5, align 4
  %6 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 2
  store i32 3, ptr %6, align 4
  %7 = load %"Tuple(Int32, Int32, Int32)", ptr %0, align 4
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %3, %"Tuple(Int32, Int32, Int32)" %7)
  %8 = load ptr, ptr %data, align 8
  %9 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 0
  store i32 1, ptr %9, align 4
  %10 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 1
  store i32 2, ptr %10, align 4
  %11 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %1, i32 0, i32 2
  store i32 3, ptr %11, align 4
  %12 = load %"Tuple(Int32, Int32, Int32)", ptr %1, align 4
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %8, %"Tuple(Int32, Int32, Int32)" %12)
  ret void
}

; Function Attrs: uwtable
define internal void @"*put<Array(Tuple(Int32, Int32, Int32)), Tuple(Int32, Int32, Int32)>:Nil"(ptr %array, %"Tuple(Int32, Int32, Int32)" %values) #0 {
alloca:
  %values1 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  store %"Tuple(Int32, Int32, Int32)" %values, ptr %values1, align 4
  %0 = load %"Tuple(Int32, Int32, Int32)", ptr %values1, align 4
  %1 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)#<<<Tuple(Int32, Int32, Int32)>:Array(Tuple(Int32, Int32, Int32))"(ptr %array, %"Tuple(Int32, Int32, Int32)" %0)
  ret void
}
```

whereas this:

```crystal
# Appends the trailing arguments to `array` as a single tuple element.
#
# The splat parameter gathers those arguments into `values`, so the tuple is
# assembled inside this method rather than at every call site.
def put(array : Array, *values) : Nil
  array << values
end

N = 2

# Builds an array of `{Int32, Int32, Int32}` tuples and adds N entries to it
# through `put`.
def foo
  data = Array({Int32, Int32, Int32}).new(N)
  # The macro loop is expanded at compile time into N consecutive `put`
  # calls, each of which passes the three values as separate arguments.
  {% for _ in 0...N %}
    put(data, 1, 2, 3)
  {% end %}
end

foo
```

produces:

```llvm
; Function Attrs: uwtable
define internal void @"*foo:Nil"() #0 {
alloca:
  %data = alloca ptr, align 8
  br label %entry

entry:                                            ; preds = %alloca
  %0 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)::new<Int32>:Array(Tuple(Int32, Int32, Int32))"(i32 844, i32 2)
  store ptr %0, ptr %data, align 8
  %1 = load ptr, ptr %data, align 8
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %1, i32 1, i32 2, i32 3)
  %2 = load ptr, ptr %data, align 8
  call void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %2, i32 1, i32 2, i32 3)
  ret void
}

; Function Attrs: uwtable
define internal void @"*put<Array(Tuple(Int32, Int32, Int32)), Int32, Int32, Int32>:Nil"(ptr %array, i32 %__temp_1436, i32 %__temp_1437, i32 %__temp_1438) #0 {
alloca:
  %values = alloca %"Tuple(Int32, Int32, Int32)", align 8
  %0 = alloca %"Tuple(Int32, Int32, Int32)", align 8
  br label %entry

entry:                                            ; preds = %alloca
  %1 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 0
  store i32 %__temp_1436, ptr %1, align 4
  %2 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 1
  store i32 %__temp_1437, ptr %2, align 4
  %3 = getelementptr inbounds %"Tuple(Int32, Int32, Int32)", ptr %0, i32 0, i32 2
  store i32 %__temp_1438, ptr %3, align 4
  %4 = load %"Tuple(Int32, Int32, Int32)", ptr %0, align 4
  store %"Tuple(Int32, Int32, Int32)" %4, ptr %values, align 4
  %5 = load %"Tuple(Int32, Int32, Int32)", ptr %values, align 4
  %6 = call ptr @"*Array(Tuple(Int32, Int32, Int32))@Array(T)#<<<Tuple(Int32, Int32, Int32)>:Array(Tuple(Int32, Int32, Int32))"(ptr %array, %"Tuple(Int32, Int32, Int32)" %5)
  ret void
}
```

The `alloca` and `getelementptr` instructions are moved into the `put` function. If we set `N = 10000` instead, this subtle change could reduce the bytecode generation phase's time for this snippet by as much as 0.7s on my Windows machine.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants