Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2292,9 +2292,11 @@ static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, unsigned nb, Align alig
// if it cannot find something better to do, which is terrible for performance.
// However, if we emit this with an element size equal to the alignment, it will instead split it into aligned chunks
// which is great for performance and vectorization.
if (alignTo(nb, align) == align.value()) // don't bother with making an array of length 1
return emit_static_alloca(ctx, ctx.builder.getIntNTy(align.value() * 8), align);
return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(align.value() * 8), alignTo(nb, align) / align.value()), align);
// Cap element size at 64 bits since not all backends support larger integers.
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(nb, elsize) == elsize) // don't bother with making an array of length 1
return emit_static_alloca(ctx, ctx.builder.getIntNTy(elsize * 8), align);
return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(elsize * 8), alignTo(nb, elsize) / elsize), align);
}

static AllocaInst *emit_static_roots(jl_codectx_t &ctx, unsigned nroots)
Expand Down
37 changes: 23 additions & 14 deletions src/llvm-alloc-opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,11 +678,8 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
// The allocation does not escape or get used in a phi node so none of the derived
// SSA from it are live when we run the allocation again.
// It is now safe to promote the allocation to an entry block alloca.
size_t align = 1;
// TODO: This is overly conservative. May want to instead pass this as a
// parameter to the allocation function directly.
if (sz > 1)
align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz));
// Inherit alignment from the original allocation, with GC alignment as minimum.
Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels loosely unsound, since this is the minimum known alignment, and not the required alignment. The JL_SMALL_BYTE_ALIGNMENT value is the largest value that julia.gc_alloc_obj is permitted to return, so it is sometimes reasonable that we can use this as a hint, but we should be sure to clarify that this overalignment is merely a hint to the layout (although being more than 16 will penalize performance since it requires a more expensive stack adjustment on entry)

    (unsigned)orig_inst->getRetAlign().valueOrOne().value()

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if the allocation required a larger alignment, wouldn't we inherit that? That's why we prefer to inherit and just baseline to GC align.

Copy link
Member

@vtjnash vtjnash Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getRetAlign is not the required alignment, it is the minimum, so if it requires it, this would introduce a bug here

but that said, the function isn't capable of giving more than JL_SMALL_BYTE_ALIGNMENT (16) so having getRetAlign return more than 16 here would be a miscompile, so this always gives the correct answer anyways (and increasing from there is only a runtime performance penalty, not a correctness issue)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It’s the alignment we emitted, no? If it required more, it would already be a bug, no? Unless you mean a GC-aligned alloc that wouldn’t tell LLVM.

// No debug info for prolog instructions
IRBuilder<> prolog_builder(&F.getEntryBlock().front());
AllocaInst *buff;
Expand All @@ -698,17 +695,21 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
const DataLayout &DL = F.getParent()->getDataLayout();
auto asize = ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz / DL.getTypeAllocSize(pass.T_prjlvalue));
buff = prolog_builder.CreateAlloca(pass.T_prjlvalue, asize);
buff->setAlignment(Align(align));
buff->setAlignment(align);
ptr = cast<Instruction>(buff);
}
else {
// Use alignment-sized chunks so SROA splits the alloca into aligned pieces
// which is better for performance and vectorization (see emit_static_alloca).
// Cap element size at 64 bits since not all backends support larger integers.
Type *buffty;
if (pass.DL->isLegalInteger(sz * 8))
buffty = Type::getIntNTy(pass.getLLVMContext(), sz * 8);
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(sz, elsize) == elsize)
buffty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
else
buffty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), sz);
buffty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(sz, elsize) / elsize);
buff = prolog_builder.CreateAlloca(buffty);
buff->setAlignment(Align(align));
buff->setAlignment(align);
ptr = cast<Instruction>(buff);
}
insertLifetime(ptr, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz), orig_inst);
Expand Down Expand Up @@ -979,6 +980,8 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
uint32_t size;
};
SmallVector<SplitSlot,8> slots;
// Inherit alignment from the original allocation, with GC alignment as minimum.
Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
for (auto memop: use_info.memops) {
auto offset = memop.first;
auto &field = memop.second;
Expand All @@ -994,12 +997,18 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
else if (field.elty && !field.multiloc) {
allocty = field.elty;
}
else if (pass.DL->isLegalInteger(field.size * 8)) {
allocty = Type::getIntNTy(pass.getLLVMContext(), field.size * 8);
} else {
allocty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), field.size);
else {
// Use alignment-sized chunks so SROA splits the alloca into aligned pieces
// which is better for performance and vectorization (see emit_static_alloca).
// Cap element size at 64 bits since not all backends support larger integers.
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(field.size, elsize) == elsize)
allocty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
else
allocty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(field.size, elsize) / elsize);
}
slot.slot = prolog_builder.CreateAlloca(allocty);
slot.slot->setAlignment(align);
IRBuilder<> builder(orig_inst);
insertLifetime(slot.slot, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), field.size), orig_inst);
initializeAlloca(builder, slot.slot, use_info.allockind);
Expand Down
2 changes: 1 addition & 1 deletion test/llvmpasses/alloc-opt-gcframe-addrspaces.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ declare {}* @julia.pointer_from_objref({} addrspace(11)*)

; CHECK-LABEL: @non_zero_addrspace

; OPAQUE: %var1 = alloca i32, align 8, addrspace(5)
; OPAQUE: %var1 = alloca i64, align 16, addrspace(5)
; OPAQUE: %1 = addrspacecast ptr addrspace(5) %var1 to ptr
; OPAQUE: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %var1)

Expand Down
9 changes: 5 additions & 4 deletions test/llvmpasses/alloc-opt-gcframe.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ define {} addrspace(10)* @return_obj() {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @return_load
; CHECK: alloca i64
; When the element type is known (i64), splitOnStack preserves it
; CHECK: alloca i64, align 16
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
; OPAQUE: call void @llvm.lifetime.start{{.*}}(i64 8, ptr
Expand Down Expand Up @@ -62,7 +63,7 @@ define void @ccall_obj(i8* %fptr) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @ccall_ptr
; CHECK: alloca i64
; CHECK: alloca i64, align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
Expand Down Expand Up @@ -105,7 +106,7 @@ define void @ccall_unknown_bundle(i8* %fptr) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @lifetime_branches
; CHECK: alloca i64
; CHECK: alloca i64, align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK: L1:
; CHECK-NEXT: call void @llvm.lifetime.start{{.*}}(i64 8,
Expand Down Expand Up @@ -166,7 +167,7 @@ define void @object_field({} addrspace(10)* %field) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @memcpy_opt
; CHECK: alloca [16 x i8], align 16
; CHECK: alloca [2 x i64], align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
Expand Down
45 changes: 38 additions & 7 deletions test/llvmpasses/alloc-opt-pass.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ declare ptr addrspace(10) @external_function2()


; CHECK-LABEL: @legal_int_types
; CHECK: alloca [12 x i8]
; CHECK-NOT: alloca i96
; Test that allocations use i64 chunks (capped at 64 bits for backend compatibility)
; A 12-byte allocation rounds up to 16 bytes, giving [2 x i64]
; CHECK: alloca [2 x i64], align 16
; CHECK: call void @llvm.memset.p0.i64(ptr align 16 %var1,
; CHECK: ret void
define void @legal_int_types() {
Expand Down Expand Up @@ -151,11 +152,10 @@ define void @lifetime_no_preserve_end(ptr noalias nocapture noundef nonnull sret


; CHECK-LABEL: @initializers
; CHECK: alloca [1 x i8]
; CHECK-DAG: alloca [2 x i8]
; CHECK-DAG: alloca [3 x i8]
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 1 %var1,
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 4 %var7,
; Small allocations (1, 2, 3 bytes) all round up to 8 bytes, giving i64
; CHECK-DAG: alloca i64, align 16
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var1,
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var7,
; CHECK: ret void
define void @initializers() {
%pgcstack = call ptr @julia.get_pgcstack()
Expand Down Expand Up @@ -268,6 +268,37 @@ define swiftcc i64 @"atomicrmw"(ptr nonnull swiftself "gcstack" %0) #0 {
ret i64 %19
}

; Test that higher alignment from the original allocation is inherited
; 8 bytes with 32-byte alignment uses i64 (element size capped at 64 bits)
; CHECK-LABEL: @align_inherit
; CHECK: alloca i64, align 32
; CHECK: ret void
; Non-escaping 8-byte GC allocation carrying an explicit `align 32` attribute.
; The only uses of %var1 are an addrspacecast and a pointer_from_objref whose
; result is dead, so alloc-opt can promote it to a stack alloca; the CHECK
; above expects the alloca to inherit the 32-byte alignment from the call.
define void @align_inherit() {
%pgcstack = call ptr @julia.get_pgcstack()
%ptls = call ptr @julia.ptls_states()
; no-op ptr-to-ptr bitcast, kept to mirror the other tests in this file
%ptls_i8 = bitcast ptr %ptls to ptr
; 8-byte allocation; `align 32` on the call is the alignment under test
%var1 = call align 32 ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
%var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
; result unused: exercises pointer_from_objref handling without escaping %var1
%var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
ret void
}
; CHECK-LABEL: }{{$}}

; Test that 8-byte allocation uses i64 with GC alignment
; CHECK-LABEL: @legal_int_i64
; CHECK: alloca i64, align 16
; CHECK: ret void
; Non-escaping 8-byte GC allocation with no explicit align attribute on the
; call. The CHECK above expects alloc-opt to emit `alloca i64, align 16`,
; i.e. an i64-typed slot at the GC minimum alignment (JL_SMALL_BYTE_ALIGNMENT).
define void @legal_int_i64() {
%pgcstack = call ptr @julia.get_pgcstack()
%ptls = call ptr @julia.ptls_states()
; no-op ptr-to-ptr bitcast, kept to mirror the other tests in this file
%ptls_i8 = bitcast ptr %ptls to ptr
; 8-byte allocation; no align attribute, so only the GC baseline applies
%var1 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
%var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
; result unused: exercises pointer_from_objref handling without escaping %var1
%var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
ret void
}
; CHECK-LABEL: }{{$}}

declare ptr @julia.ptls_states()

declare ptr @julia.pointer_from_objref(ptr addrspace(11))
Expand Down