Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2292,9 +2292,11 @@ static AllocaInst *emit_static_alloca(jl_codectx_t &ctx, unsigned nb, Align alig
// if it cannot find something better to do, which is terrible for performance.
// However, if we emit this with an element size equal to the alignment, it will instead split it into aligned chunks
// which is great for performance and vectorization.
if (alignTo(nb, align) == align.value()) // don't bother with making an array of length 1
return emit_static_alloca(ctx, ctx.builder.getIntNTy(align.value() * 8), align);
return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(align.value() * 8), alignTo(nb, align) / align.value()), align);
// Cap element size at 64 bits since not all backends support larger integers.
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(nb, elsize) == elsize) // don't bother with making an array of length 1
return emit_static_alloca(ctx, ctx.builder.getIntNTy(elsize * 8), align);
return emit_static_alloca(ctx, ArrayType::get(ctx.builder.getIntNTy(elsize * 8), alignTo(nb, elsize) / elsize), align);
}

static AllocaInst *emit_static_roots(jl_codectx_t &ctx, unsigned nroots)
Expand Down
37 changes: 23 additions & 14 deletions src/llvm-alloc-opt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,11 +678,8 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
// The allocation does not escape or get used in a phi node so none of the derived
// SSA from it are live when we run the allocation again.
// It is now safe to promote the allocation to an entry block alloca.
size_t align = 1;
// TODO: This is overly conservative. May want to instead pass this as a
// parameter to the allocation function directly.
if (sz > 1)
align = MinAlign(JL_SMALL_BYTE_ALIGNMENT, NextPowerOf2(sz));
// Inherit alignment from the original allocation, with GC alignment as minimum.
Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This feels loosely unsound, since this is the minimum known alignment, and not the required alignment. The JL_SMALL_BYTE_ALIGNMENT value is the largest value that julia.gc_alloc_obj is permitted to return, so it is sometimes reasonable that we can use this as a hint, but we should be sure to clarify that this overalignment is merely a hint to the layout (although being more than 16 will penalize performance since it requires a more expensive stack adjustment on entry)

    (unsigned)orig_inst->getRetAlign().valueOrOne().value()

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if the allocation required a larger alignment, wouldn't we inherit that? That's why we prefer to inherit and just baseline to GC align.

Copy link
Member

@vtjnash vtjnash Jan 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getRetAlign is not the required alignment, it is the minimum, so if it requires it, this would introduce a bug here

but that said, the function isn't capable of giving more than JL_SMALL_BYTE_ALIGNMENT (16) so having getRetAlign return more than 16 here would be a miscompile, so this always gives the correct answer anyways (and increasing from there is only a runtime performance penalty, not a correctness issue)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It’s the alignment we emitted, no? If it required more, it would already be a bug, no? Unless you mean a GC-aligned alloc that wouldn’t tell LLVM.

// No debug info for prolog instructions
IRBuilder<> prolog_builder(&F.getEntryBlock().front());
AllocaInst *buff;
Expand All @@ -698,17 +695,21 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
const DataLayout &DL = F.getParent()->getDataLayout();
auto asize = ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz / DL.getTypeAllocSize(pass.T_prjlvalue));
buff = prolog_builder.CreateAlloca(pass.T_prjlvalue, asize);
buff->setAlignment(Align(align));
buff->setAlignment(align);
ptr = cast<Instruction>(buff);
}
else {
// Use alignment-sized chunks so SROA splits the alloca into aligned pieces
// which is better for performance and vectorization (see emit_static_alloca).
// Cap element size at 64 bits since not all backends support larger integers.
Type *buffty;
if (pass.DL->isLegalInteger(sz * 8))
buffty = Type::getIntNTy(pass.getLLVMContext(), sz * 8);
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(sz, elsize) == elsize)
buffty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
else
buffty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), sz);
buffty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(sz, elsize) / elsize);
buff = prolog_builder.CreateAlloca(buffty);
buff->setAlignment(Align(align));
buff->setAlignment(align);
ptr = cast<Instruction>(buff);
}
insertLifetime(ptr, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), sz), orig_inst);
Expand Down Expand Up @@ -979,6 +980,8 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
uint32_t size;
};
SmallVector<SplitSlot,8> slots;
// Inherit alignment from the original allocation, with GC alignment as minimum.
Align align(std::max((unsigned)orig_inst->getRetAlign().valueOrOne().value(), (unsigned)JL_SMALL_BYTE_ALIGNMENT));
for (auto memop: use_info.memops) {
auto offset = memop.first;
auto &field = memop.second;
Expand All @@ -994,12 +997,18 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
else if (field.elty && !field.multiloc) {
allocty = field.elty;
}
else if (pass.DL->isLegalInteger(field.size * 8)) {
allocty = Type::getIntNTy(pass.getLLVMContext(), field.size * 8);
} else {
allocty = ArrayType::get(Type::getInt8Ty(pass.getLLVMContext()), field.size);
else {
// Use alignment-sized chunks so SROA splits the alloca into aligned pieces
// which is better for performance and vectorization (see emit_static_alloca).
// Cap element size at 64 bits since not all backends support larger integers.
unsigned elsize = std::min(align.value(), (uint64_t)8);
if (alignTo(field.size, elsize) == elsize)
allocty = Type::getIntNTy(pass.getLLVMContext(), elsize * 8);
else
allocty = ArrayType::get(Type::getIntNTy(pass.getLLVMContext(), elsize * 8), alignTo(field.size, elsize) / elsize);
}
slot.slot = prolog_builder.CreateAlloca(allocty);
slot.slot->setAlignment(align);
IRBuilder<> builder(orig_inst);
insertLifetime(slot.slot, ConstantInt::get(Type::getInt64Ty(prolog_builder.getContext()), field.size), orig_inst);
initializeAlloca(builder, slot.slot, use_info.allockind);
Expand Down
2 changes: 1 addition & 1 deletion test/llvmpasses/alloc-opt-gcframe-addrspaces.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ declare {}* @julia.pointer_from_objref({} addrspace(11)*)

; CHECK-LABEL: @non_zero_addrspace

; OPAQUE: %var1 = alloca i32, align 8, addrspace(5)
; OPAQUE: %var1 = alloca i64, align 16, addrspace(5)
; OPAQUE: %1 = addrspacecast ptr addrspace(5) %var1 to ptr
; OPAQUE: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %var1)

Expand Down
9 changes: 5 additions & 4 deletions test/llvmpasses/alloc-opt-gcframe.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ define {} addrspace(10)* @return_obj() {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @return_load
; CHECK: alloca i64
; When the element type is known (i64), splitOnStack preserves it
; CHECK: alloca i64, align 16
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
; OPAQUE: call void @llvm.lifetime.start{{.*}}(i64 8, ptr
Expand Down Expand Up @@ -62,7 +63,7 @@ define void @ccall_obj(i8* %fptr) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @ccall_ptr
; CHECK: alloca i64
; CHECK: alloca i64, align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
Expand Down Expand Up @@ -105,7 +106,7 @@ define void @ccall_unknown_bundle(i8* %fptr) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @lifetime_branches
; CHECK: alloca i64
; CHECK: alloca i64, align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK: L1:
; CHECK-NEXT: call void @llvm.lifetime.start{{.*}}(i64 8,
Expand Down Expand Up @@ -166,7 +167,7 @@ define void @object_field({} addrspace(10)* %field) {
; CHECK-LABEL: }{{$}}

; CHECK-LABEL: @memcpy_opt
; CHECK: alloca [16 x i8], align 16
; CHECK: alloca [2 x i64], align 16
; OPAQUE: call ptr @julia.get_pgcstack()
; CHECK-NOT: @julia.gc_alloc_obj
; CHECK-NOT: @jl_gc_small_alloc
Expand Down
45 changes: 38 additions & 7 deletions test/llvmpasses/alloc-opt-pass.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ declare ptr addrspace(10) @external_function2()


; CHECK-LABEL: @legal_int_types
; CHECK: alloca [12 x i8]
; CHECK-NOT: alloca i96
; Test that allocations use i64 chunks (capped at 64 bits for backend compatibility)
; A 12-byte allocation rounds up to 16 bytes, giving [2 x i64]
; CHECK: alloca [2 x i64], align 16
; CHECK: call void @llvm.memset.p0.i64(ptr align 16 %var1,
; CHECK: ret void
define void @legal_int_types() {
Expand Down Expand Up @@ -151,11 +152,10 @@ define void @lifetime_no_preserve_end(ptr noalias nocapture noundef nonnull sret


; CHECK-LABEL: @initializers
; CHECK: alloca [1 x i8]
; CHECK-DAG: alloca [2 x i8]
; CHECK-DAG: alloca [3 x i8]
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 1 %var1,
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 4 %var7,
; Small allocations (1, 2, 3 bytes) all round up to 8 bytes, giving i64
; CHECK-DAG: alloca i64, align 16
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var1,
; CHECK-DAG: call void @llvm.memset.p0.i64(ptr align 16 %var7,
; CHECK: ret void
define void @initializers() {
%pgcstack = call ptr @julia.get_pgcstack()
Expand Down Expand Up @@ -268,6 +268,37 @@ define swiftcc i64 @"atomicrmw"(ptr nonnull swiftself "gcstack" %0) #0 {
ret i64 %19
}

; Test that higher alignment from the original allocation is inherited
; 8 bytes with 32-byte alignment uses i64 (element size capped at 64 bits)
; CHECK-LABEL: @align_inherit
; CHECK: alloca i64, align 32
; CHECK: ret void
; Non-escaping 8-byte GC allocation carrying an explicit `align 32` attribute.
; The only uses of %var1 are an addrspacecast and a pointer_from_objref whose
; result is dead, so alloc-opt can promote it to a stack alloca; the CHECK
; above expects the alloca to inherit the 32-byte alignment from the call.
define void @align_inherit() {
%pgcstack = call ptr @julia.get_pgcstack()
%ptls = call ptr @julia.ptls_states()
; no-op ptr-to-ptr bitcast, kept to mirror the other tests in this file
%ptls_i8 = bitcast ptr %ptls to ptr
; 8-byte allocation; `align 32` on the call is the alignment under test
%var1 = call align 32 ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
%var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
; result unused: exercises pointer_from_objref handling without escaping %var1
%var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
ret void
}
; CHECK-LABEL: }{{$}}

; Test that 8-byte allocation uses i64 with GC alignment
; CHECK-LABEL: @legal_int_i64
; CHECK: alloca i64, align 16
; CHECK: ret void
; Non-escaping 8-byte GC allocation with no explicit align attribute on the
; call. The CHECK above expects alloc-opt to emit `alloca i64, align 16`,
; i.e. an i64-typed slot at the GC minimum alignment (JL_SMALL_BYTE_ALIGNMENT).
define void @legal_int_i64() {
%pgcstack = call ptr @julia.get_pgcstack()
%ptls = call ptr @julia.ptls_states()
; no-op ptr-to-ptr bitcast, kept to mirror the other tests in this file
%ptls_i8 = bitcast ptr %ptls to ptr
; 8-byte allocation; no align attribute, so only the GC baseline applies
%var1 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 8, ptr addrspace(10) @tag)
%var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11)
; result unused: exercises pointer_from_objref handling without escaping %var1
%var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2)
ret void
}
; CHECK-LABEL: }{{$}}

declare ptr @julia.ptls_states()

declare ptr @julia.pointer_from_objref(ptr addrspace(11))
Expand Down