SROA optimizations depend on alloca type

While debugging a regression in Julia's codegen, I came across this interesting behaviour (reproducer: https://godbolt.org/z/KeEnhjc6G).

Sadly, I wasn't able to minimize it much further than that, as the behaviour seems brittle. Basically, SROA generates meaningfully different code depending on the type of the alloca. Because SROA runs quite early for us (and in general), this ends up making a big difference in how LLVM optimizes this code, generating either
```llvm
; Listing 1: the 8 bytes at %1 are loaded as a single i64 and stay packed in
; that integer across the loop. Every float access is a shift/trunc + bitcast
; (extract) or a bitcast/zext/shl/and/or sequence (insert), and the loop spans
; three basic blocks (%9, %19, %16).
define swiftcc float @julia_f_3398(ptr nonnull swiftself "gcstack" %0, ptr nocapture noundef nonnull readonly align 4 dereferenceable(8) %1) local_unnamed_addr #0 {
  ; Two dependent pointer loads (byte offset 16 from %0, then byte offset 16
  ; from that result, the second one atomic monotonic), followed by a volatile
  ; i64 load through the final pointer between two singlethread fences.
  ; NOTE(review): presumably the GC safepoint poll -- confirm.
  %3 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %4 = load ptr, ptr %3, align 8, !tbaa !2
  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
  %6 = load atomic ptr, ptr %5 monotonic, align 8, !tbaa !6, !invariant.load !8
  fence syncscope("singlethread") seq_cst
  %7 = load volatile i64, ptr %6, align 8
  fence syncscope("singlethread") seq_cst
  ; Single 8-byte integer load of the whole argument (only 4-byte aligned).
  %.sroa.06.0.copyload7 = load i64, ptr %1, align 4, !tbaa !9, !alias.scope !10, !noalias !14
  ; Extract bits 32..63 and reinterpret them as a float: this is the initial
  ; value of the float that the loop updates.
  %.sroa.3.0.extract.shift12 = lshr i64 %.sroa.06.0.copyload7, 32
  %.sroa.3.0.extract.trunc13 = trunc nuw i64 %.sroa.3.0.extract.shift12 to i32
  %8 = bitcast i32 %.sroa.3.0.extract.trunc13 to float
  br label %9

; Loop header: %10 is the current value of the updated float, %11 the counter
; (starts at 1), and %.sroa.06.014 the packed i64 carried around the loop.
9:                                                ; preds = %16, %2
  %10 = phi float [ %8, %2 ], [ %18, %16 ]
  %11 = phi i64 [ 1, %2 ], [ %17, %16 ]
  %.sroa.06.014 = phi i64 [ %.sroa.06.0.copyload7, %2 ], [ %.sroa.0.1, %16 ]
  ; Counter even (low bit clear) -> %19 performs the increment;
  ; counter odd -> go straight to the latch %16.
  %12 = and i64 %11, 1
  %.not = icmp eq i64 %12, 0
  br i1 %.not, label %19, label %16

; Exit: reinterpret the low 32 bits of the packed value as a float and add the
; float extracted from bits 32..63 (%18).
13:                                               ; preds = %16
  %.sroa.010.0.extract.trunc = trunc i64 %.sroa.0.1 to i32
  %14 = bitcast i32 %.sroa.010.0.extract.trunc to float
  %15 = fadd float %14, %18
  ret float %15

; Latch: merge the packed value (rewritten in %19, or unchanged from %9), bump
; the counter, and re-extract bits 32..63 as a float for the next iteration.
16:                                               ; preds = %19, %9
  %.sroa.0.1 = phi i64 [ %.sroa.09.0.insert.insert, %19 ], [ %.sroa.06.014, %9 ]
  %17 = add nuw nsw i64 %11, 1
  %.sroa.3.0.extract.shift = lshr i64 %.sroa.0.1, 32
  %.sroa.3.0.extract.trunc = trunc nuw i64 %.sroa.3.0.extract.shift to i32
  %18 = bitcast i32 %.sroa.3.0.extract.trunc to float
  ; Leave once the incremented counter reaches 1001, i.e. after 1000 iterations.
  %exitcond = icmp eq i64 %17, 1001
  br i1 %exitcond, label %13, label %9

; Conditional update: add 1.0 to the float, then splice its bit pattern back
; into bits 32..63 of the packed i64 while keeping the low 32 bits unchanged.
19:                                               ; preds = %9
  %20 = fadd float %10, 1.000000e+00
  %21 = bitcast float %20 to i32
  %.sroa.2.0.insert.ext = zext i32 %21 to i64
  %.sroa.2.0.insert.shift = shl nuw i64 %.sroa.2.0.insert.ext, 32
  %.sroa.09.0.insert.ext = and i64 %.sroa.06.014, 4294967295
  %.sroa.09.0.insert.insert = or disjoint i64 %.sroa.2.0.insert.shift, %.sroa.09.0.insert.ext
  br label %16
}
```
or
```llvm
; Listing 2: the 8 bytes at %1 are loaded as two separate floats (byte offsets
; 0 and 4). The loop is a single basic block in which the conditional increment
; is expressed as a select; no integer shift/mask operations are involved.
define swiftcc float @julia_f_4090(ptr nonnull swiftself %0, ptr nocapture noundef nonnull readonly align 4 dereferenceable(8) %1) local_unnamed_addr #0 {
  ; Two dependent pointer loads (byte offset 16 from %0, then byte offset 16
  ; from that result), followed by a call to @julia.safepoint on the final
  ; pointer between two singlethread fences.
  %3 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %4 = load ptr, ptr %3, align 8, !tbaa !3
  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
  %6 = load ptr, ptr %5, align 8, !tbaa !7, !invariant.load !9
  fence syncscope("singlethread") seq_cst
  call void @julia.safepoint(ptr %6)
  fence syncscope("singlethread") seq_cst
  ; Float at offset 0: only read again in the exit block.
  %.sroa.013.0.copyload = load float, ptr %1, align 4, !tbaa !10, !alias.scope !11, !noalias !15
  ; Float at offset 4: the initial value of the float updated by the loop.
  %.sroa.614.0..sroa_idx = getelementptr inbounds nuw i8, ptr %1, i64 4
  %.sroa.614.0.copyload = load float, ptr %.sroa.614.0..sroa_idx, align 4, !tbaa !10, !alias.scope !11, !noalias !15
  br label %7

; Loop body: %8 is the counter (starts at 1) and %.sroa.614.015 is the
; loop-carried float.
7:                                                ; preds = %7, %2
  %8 = phi i64 [ 1, %2 ], [ %11, %7 ]
  %.sroa.614.015 = phi float [ %.sroa.614.0.copyload, %2 ], [ %spec.select, %7 ]
  %9 = and i64 %8, 1
  %.not = icmp eq i64 %9, 0
  ; The +1.0 is computed on every iteration; the select keeps it only when the
  ; counter is even (low bit clear) and otherwise carries the old value forward.
  %10 = fadd float %.sroa.614.015, 1.000000e+00
  %spec.select = select i1 %.not, float %10, float %.sroa.614.015
  %11 = add nuw nsw i64 %8, 1
  ; Leave once the incremented counter reaches 1001, i.e. after 1000 iterations.
  %exitcond = icmp eq i64 %11, 1001
  br i1 %exitcond, label %12, label %7

; Exit: add the offset-0 float to %10, the incremented value from the final
; iteration (the last counter value, 1000, is even, so %10 equals %spec.select).
12:                                               ; preds = %7
  %13 = fadd float %.sroa.013.0.copyload, %10
  ret float %13
}
```
The second version is significantly faster to execute on the CPU this was targeting (apple-m3).

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

SROA optimizations depend on alloca type #164308

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Uh oh!

SROA optimizations depend on alloca type #164308

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions