diff --git a/src/codegen.cpp b/src/codegen.cpp
index c8865aa668d37..ebfd1446d1011 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -2292,9 +2292,11 @@ static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, unsigned nb, Align alig
     // if it cannot find something better to do, which is terrible for performance.
     // However, if we emit this with an element size equal to the alignment, it will instead split it into aligned chunks
     // which is great for performance and vectorization.
-    if (alignTo(nb, align) == align.value()) // don't bother with making an array of length 1
-        return emit_static_alloca(ctx, ctx.builder.getIntNTy(align.value() * 8), align);
-    return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(align.value() * 8), alignTo(nb, align) / align.value()), align);
+    // Cap element size at 64 bits since not all backends support larger integers.
+    unsigned elsize = std::min(align.value(), (uint64_t)8);
+    if (alignTo(nb, elsize) == elsize) // don't bother with making an array of length 1
+        return emit_static_alloca(ctx, ctx.builder.getIntNTy(elsize * 8), align);
+    return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(elsize * 8), alignTo(nb, elsize) / elsize), align);
 }
 
 static AllocaInst *emit_static_roots(jl_codectx_t &ctx, unsigned nroots)
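Note (reviewer sketch, not part of the patch): the codegen change above maps a byte
count plus an alignment to an element width and an element count. The standalone
helper below reproduces that arithmetic in isolation; chunked_alloca_shape and
align_to are hypothetical names standing in for the inline code and LLVM's alignTo.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Round nb up to a multiple of elsize (mirrors LLVM's alignTo for these inputs).
static uint64_t align_to(uint64_t nb, uint64_t elsize) {
    return (nb + elsize - 1) / elsize * elsize;
}

// The shape emit_static_alloca now picks: element size is min(alignment, 8)
// bytes, capped at 64 bits since not all backends support wider integers.
struct Shape { unsigned bits; uint64_t count; };
static Shape chunked_alloca_shape(uint64_t nb, uint64_t align) {
    uint64_t elsize = std::min(align, (uint64_t)8);
    return {unsigned(elsize * 8), align_to(nb, elsize) / elsize};
}

int main() {
    assert(chunked_alloca_shape(12, 16).bits == 64);  // 12 bytes, align 16 -> [2 x i64]
    assert(chunked_alloca_shape(12, 16).count == 2);
    assert(chunked_alloca_shape(4, 8).count == 1);    // 4 bytes, align 8 -> plain i64
    return 0;
}

With nb = 12 and align = 16 this yields two i64 chunks, which is exactly what the
updated @legal_int_types expectations below check.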
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index 63f1d0a786448..2881a8c42c1c7 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -678,11 +678,8 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
     // The allocation does not escape or get used in a phi node so none of the derived
     // SSA from it are live when we run the allocation again.
     // It is now safe to promote the allocation to an entry block alloca.
-    size_t align = 1;
-    // TODO: This is overly conservative. May want to instead pass this as a
-    // parameter to the allocation function directly.
-    if (sz > 1)
-        align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz));
+    // Inherit alignment from the original allocation, with GC alignment as minimum.
+    Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
     // No debug info for prolog instructions
     IRBuilder<> prolog_builder(&F.getEntryBlock().front());
     AllocaInst *buff;
@@ -698,17 +695,21 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
         const DataLayout &DL = F.getParent()->getDataLayout();
         auto asize = ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz / DL.getTypeAllocSize(pass.T_prjlvalue));
         buff = prolog_builder.CreateAlloca(pass.T_prjlvalue, asize);
-        buff->setAlignment(Align(align));
+        buff->setAlignment(align);
         ptr = cast<Instruction>(buff);
     }
     else {
+        // Use alignment-sized chunks so SROA splits the alloca into aligned pieces
+        // which is better for performance and vectorization (see emit_static_alloca).
+        // Cap element size at 64 bits since not all backends support larger integers.
         Type *buffty;
-        if (pass.DL->isLegalInteger(sz * 8))
-            buffty = Type::getIntNTy(pass.getLLVMContext(), sz * 8);
+        unsigned elsize = std::min(align.value(), (uint64_t)8);
+        if (alignTo(sz, elsize) == elsize)
+            buffty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
         else
-            buffty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), sz);
+            buffty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(sz, elsize) / elsize);
         buff = prolog_builder.CreateAlloca(buffty);
-        buff->setAlignment(Align(align));
+        buff->setAlignment(align);
         ptr = cast<Instruction>(buff);
     }
     insertLifetime(ptr, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz), orig_inst);
@@ -979,6 +980,8 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
         uint32_t size;
     };
     SmallVector<SplitSlot, 8> slots;
+    // Inherit alignment from the original allocation, with GC alignment as minimum.
+    Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
     for (auto memop: use_info.memops) {
         auto offset = memop.first;
         auto &field = memop.second;
@@ -994,12 +997,18 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
         else if (field.elty && !field.multiloc) {
             allocty = field.elty;
         }
-        else if (pass.DL->isLegalInteger(field.size * 8)) {
-            allocty = Type::getIntNTy(pass.getLLVMContext(), field.size * 8);
-        } else {
-            allocty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), field.size);
+        else {
+            // Use alignment-sized chunks so SROA splits the alloca into aligned pieces
+            // which is better for performance and vectorization (see emit_static_alloca).
+            // Cap element size at 64 bits since not all backends support larger integers.
+            unsigned elsize = std::min(align.value(), (uint64_t)8);
+            if (alignTo(field.size, elsize) == elsize)
+                allocty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
+            else
+                allocty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(field.size, elsize) / elsize);
         }
         slot.slot = prolog_builder.CreateAlloca(allocty);
+        slot.slot->setAlignment(align);
         IRBuilder<> builder(orig_inst);
         insertLifetime(slot.slot, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), field.size), orig_inst);
         initializeAlloca(builder, slot.slot, use_info.allockind);
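Note (reviewer sketch, not part of the patch): moveToStack and splitOnStack now derive
the slot alignment identically: the align return attribute of the original
julia.gc_alloc_obj call, raised to at least JL_SMALL_BYTE_ALIGNMENT. A minimal model
of that selection, assuming JL_SMALL_BYTE_ALIGNMENT is 16 as on current Julia builds:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the pass logic: pick the stack slot alignment from
// the call's align return attribute (0 when the attribute is absent).
static uint64_t slot_alignment(uint64_t ret_align) {
    const uint64_t gc_min_align = 16;        // JL_SMALL_BYTE_ALIGNMENT
    uint64_t a = ret_align ? ret_align : 1;  // models MaybeAlign::valueOrOne()
    return std::max(a, gc_min_align);
}

int main() {
    assert(slot_alignment(32) == 32);  // call align 32 ... keeps its 32-byte alignment
    assert(slot_alignment(8) == 16);   // smaller alignments are raised to the GC minimum
    assert(slot_alignment(0) == 16);   // a missing attribute defaults to the minimum
    return 0;
}

This is what the @align_inherit test added below exercises: an 8-byte allocation
carrying align 32 becomes alloca i64, align 32.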
diff --git a/test/llvmpasses/alloc-opt-gcframe-addrspaces.ll b/test/llvmpasses/alloc-opt-gcframe-addrspaces.ll
index b96c9385e38eb..c66cb815ea8b9 100644
--- a/test/llvmpasses/alloc-opt-gcframe-addrspaces.ll
+++ b/test/llvmpasses/alloc-opt-gcframe-addrspaces.ll
@@ -16,7 +16,7 @@ declare {}* @julia.pointer_from_objref({} addrspace(11)*)
 
 ; CHECK-LABEL: @non_zero_addrspace
 
-; OPAQUE: %var1 = alloca i32, align 8, addrspace(5)
+; OPAQUE: %var1 = alloca i64, align 16, addrspace(5)
 ; OPAQUE: %1 = addrspacecast ptr addrspace(5) %var1 to ptr
 ; OPAQUE: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %var1)
 
diff --git a/test/llvmpasses/alloc-opt-gcframe.ll b/test/llvmpasses/alloc-opt-gcframe.ll
index f53a4d5c01df7..44714b702d7bc 100644
--- a/test/llvmpasses/alloc-opt-gcframe.ll
+++ b/test/llvmpasses/alloc-opt-gcframe.ll
@@ -25,7 +25,8 @@ define {} addrspace(10)* @return_obj() {
 ; CHECK-LABEL: }{{$}}
 
 ; CHECK-LABEL: @return_load
-; CHECK: alloca i64
+; When the element type is known (i64), splitOnStack preserves it
+; CHECK: alloca i64, align 16
 ; CHECK-NOT: @julia.gc_alloc_obj
 ; CHECK-NOT: @jl_gc_small_alloc
 ; OPAQUE: call void @llvm.lifetime.start{{.*}}(i64 8, ptr
@@ -62,7 +63,7 @@ define void @ccall_obj(i8* %fptr) {
 ; CHECK-LABEL: }{{$}}
 
 ; CHECK-LABEL: @ccall_ptr
-; CHECK: alloca i64
+; CHECK: alloca i64, align 16
 ; OPAQUE: call ptr @julia.get_pgcstack()
 ; CHECK-NOT: @julia.gc_alloc_obj
 ; CHECK-NOT: @jl_gc_small_alloc
@@ -105,7 +106,7 @@ define void @ccall_unknown_bundle(i8* %fptr) {
 ; CHECK-LABEL: }{{$}}
 
 ; CHECK-LABEL: @lifetime_branches
-; CHECK: alloca i64
+; CHECK: alloca i64, align 16
 ; OPAQUE: call ptr @julia.get_pgcstack()
 ; CHECK: L1:
 ; CHECK-NEXT: call void @llvm.lifetime.start{{.*}}(i64 8,
@@ -166,7 +167,7 @@ define void @object_field({} addrspace(10)* %field) {
 ; CHECK-LABEL: }{{$}}
 
 ; CHECK-LABEL: @memcpy_opt
-; CHECK: alloca [16 x i8], align 16
+; CHECK: alloca [2 x i64], align 16
 ; OPAQUE: call ptr @julia.get_pgcstack()
 ; CHECK-NOT: @julia.gc_alloc_obj
 ; CHECK-NOT: @jl_gc_small_alloc
diff --git a/test/llvmpasses/alloc-opt-pass.ll b/test/llvmpasses/alloc-opt-pass.ll
index c6c279ae36fc6..97f998f7593ca 100644
--- a/test/llvmpasses/alloc-opt-pass.ll
+++ b/test/llvmpasses/alloc-opt-pass.ll
@@ -79,8 +79,9 @@ declare ptr addrspace(10) @external_function2()
 
 
 ; CHECK-LABEL: @legal_int_types
-; CHECK: alloca [12 x i8]
-; CHECK-NOT: alloca i96
+; Test that allocations use i64 chunks (capped at 64 bits for backend compatibility)
+; A 12-byte allocation rounds up to 16 bytes, giving [2 x i64]
+; CHECK: alloca [2 x i64], align 16
 ; CHECK: call void @llvm.memset.p0.i64(ptr align 16 %var1,
 ; CHECK: ret void
 define void @legal_int_types() {
@@ -151,11 +152,10 @@ define void @lifetime_no_preserve_end(ptr noalias nocapture noundef nonnull sret
 ; CHECK-LABEL: }{{$}}
 
 ; CHECK-LABEL: @initializers
-; CHECK: alloca [1 x i8]
-; CHECK-DAG: alloca [2 x i8]
-; CHECK-DAG: alloca [3 x i8]
-; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 1 %var1,
-; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 4 %var7,
+; Small allocations (1, 2, 3 bytes) all round up to 8 bytes, giving i64
+; CHECK-DAG: alloca i64, align 16
+; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var1,
+; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var7,
 ; CHECK: ret void
 define void @initializers() {
   %pgcstack = call ptr @julia.get_pgcstack()
@@ -268,6 +268,37 @@ define swiftcc i64 @"atomicrmw"(ptr nonnull swiftself "gcstack" %0) #0 {
   ret i64 %19
 }
 
+; Test that higher alignment from the original allocation is inherited
+; 8 bytes with 32-byte alignment uses i64 (element size capped at 64 bits)
+; CHECK-LABEL: @align_inherit
+; CHECK: alloca i64, align 32
+; CHECK: ret void
+define void @align_inherit() {
+  %pgcstack = call ptr @julia.get_pgcstack()
+  %ptls = call ptr @julia.ptls_states()
+  %ptls_i8 = bitcast ptr %ptls to ptr
+  %var1 = call align 32 ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
+  %var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
+  %var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
+  ret void
+}
+; CHECK-LABEL: }{{$}}
+
+; Test that 8-byte allocation uses i64 with GC alignment
+; CHECK-LABEL: @legal_int_i64
+; CHECK: alloca i64, align 16
+; CHECK: ret void
+define void @legal_int_i64() {
+  %pgcstack = call ptr @julia.get_pgcstack()
+  %ptls = call ptr @julia.ptls_states()
+  %ptls_i8 = bitcast ptr %ptls to ptr
+  %var1 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
+  %var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
+  %var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
+  ret void
+}
+; CHECK-LABEL: }{{$}}
+
 
 declare ptr @julia.ptls_states()
 declare ptr @julia.pointer_from_objref(ptr addrspace(11))
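Note (reviewer sketch, not part of the patch): the updated FileCheck expectations all
follow from the capped-chunk rule. A quick cross-check, with sizes and alignments
taken from the tests above:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Expected element count for an alloca under the capped-chunk rule:
// elsize = min(align, 8) bytes; count = round_up(size, elsize) / elsize.
static uint64_t chunks(uint64_t size, uint64_t align) {
    uint64_t elsize = std::min(align, (uint64_t)8);
    return (size + elsize - 1) / elsize;
}

int main() {
    assert(chunks(4, 16) == 1);   // @non_zero_addrspace: 4 bytes -> alloca i64
    assert(chunks(16, 16) == 2);  // @memcpy_opt: 16 bytes -> alloca [2 x i64]
    assert(chunks(12, 16) == 2);  // @legal_int_types: 12 bytes -> alloca [2 x i64]
    assert(chunks(8, 32) == 1);   // @align_inherit: 8 bytes -> alloca i64, align 32
    return 0;
}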