Skip to content

SROA optimizations depend on alloca type #164308

@gbaraldi

Description

@gbaraldi

While debugging a regression in Julia's codegen, I saw this interesting behaviour: https://godbolt.org/z/KeEnhjc6G

I wasn't able to minimize it much further than that, sadly; it seems brittle. Basically, SROA generates meaningfully different code depending on the type of the alloca. Because SROA runs quite early for us (and in general), this ends up making a big difference in how LLVM optimizes this code, generating either

; julia_f_3398: output variant in which the 8 bytes behind %1 are carried
; through the function as one packed i64. The upper and lower 32-bit halves are
; extracted and re-inserted with lshr/trunc/and/shl/or, and the loop keeps a
; conditional branch across three blocks (9, 16, 19).
define swiftcc float @julia_f_3398(ptr nonnull swiftself "gcstack" %0, ptr nocapture noundef nonnull readonly align 4 dereferenceable(8) %1) local_unnamed_addr #0 {
  %3 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %4 = load ptr, ptr %3, align 8, !tbaa !2
  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
  %6 = load atomic ptr, ptr %5 monotonic, align 8, !tbaa !6, !invariant.load !8
  fence syncscope("singlethread") seq_cst
  %7 = load volatile i64, ptr %6, align 8
  fence syncscope("singlethread") seq_cst
  ; The 8 bytes at %1 are read with a single i64 load; the upper 32 bits are
  ; then shifted down, truncated and reinterpreted as a float (%8).
  %.sroa.06.0.copyload7 = load i64, ptr %1, align 4, !tbaa !9, !alias.scope !10, !noalias !14
  %.sroa.3.0.extract.shift12 = lshr i64 %.sroa.06.0.copyload7, 32
  %.sroa.3.0.extract.trunc13 = trunc nuw i64 %.sroa.3.0.extract.shift12 to i32
  %8 = bitcast i32 %.sroa.3.0.extract.trunc13 to float
  br label %9

; Loop header: %11 is a counter starting at 1, %10 is the float taken from the
; upper half, %.sroa.06.014 is the packed i64. Even counter values branch to
; block 19 (which performs the add); odd values go straight to block 16.
9:                                                ; preds = %16, %2
  %10 = phi float [ %8, %2 ], [ %18, %16 ]
  %11 = phi i64 [ 1, %2 ], [ %17, %16 ]
  %.sroa.06.014 = phi i64 [ %.sroa.06.0.copyload7, %2 ], [ %.sroa.0.1, %16 ]
  %12 = and i64 %11, 1
  %.not = icmp eq i64 %12, 0
  br i1 %.not, label %19, label %16

; Exit: reinterpret the low 32 bits of the packed value as a float and add the
; most recent upper-half float (%18).
13:                                               ; preds = %16
  %.sroa.010.0.extract.trunc = trunc i64 %.sroa.0.1 to i32
  %14 = bitcast i32 %.sroa.010.0.extract.trunc to float
  %15 = fadd float %14, %18
  ret float %15

; Latch: select the (possibly re-packed) i64, increment the counter, re-extract
; the upper half as a float, and leave the loop once the counter reaches 1001.
16:                                               ; preds = %19, %9
  %.sroa.0.1 = phi i64 [ %.sroa.09.0.insert.insert, %19 ], [ %.sroa.06.014, %9 ]
  %17 = add nuw nsw i64 %11, 1
  %.sroa.3.0.extract.shift = lshr i64 %.sroa.0.1, 32
  %.sroa.3.0.extract.trunc = trunc nuw i64 %.sroa.3.0.extract.shift to i32
  %18 = bitcast i32 %.sroa.3.0.extract.trunc to float
  %exitcond = icmp eq i64 %17, 1001
  br i1 %exitcond, label %13, label %9

; Add 1.0 to the upper-half float, then re-insert it into the packed i64:
; shift the new bits up by 32 and OR them with the preserved low 32 bits.
19:                                               ; preds = %9
  %20 = fadd float %10, 1.000000e+00
  %21 = bitcast float %20 to i32
  %.sroa.2.0.insert.ext = zext i32 %21 to i64
  %.sroa.2.0.insert.shift = shl nuw i64 %.sroa.2.0.insert.ext, 32
  %.sroa.09.0.insert.ext = and i64 %.sroa.06.014, 4294967295
  %.sroa.09.0.insert.insert = or disjoint i64 %.sroa.2.0.insert.shift, %.sroa.09.0.insert.ext
  br label %16
}

or

; julia_f_4090: output variant in which the 8 bytes behind %1 are read as two
; separate floats (byte offsets 0 and 4). There is no integer packing, and the
; loop is a single block that uses a select instead of a conditional branch.
define swiftcc float @julia_f_4090(ptr nonnull swiftself %0, ptr nocapture noundef nonnull readonly align 4 dereferenceable(8) %1) local_unnamed_addr #0 {
  %3 = getelementptr inbounds nuw i8, ptr %0, i64 16
  %4 = load ptr, ptr %3, align 8, !tbaa !3
  %5 = getelementptr inbounds nuw i8, ptr %4, i64 16
  %6 = load ptr, ptr %5, align 8, !tbaa !7, !invariant.load !9
  fence syncscope("singlethread") seq_cst
  call void @julia.safepoint(ptr %6)
  fence syncscope("singlethread") seq_cst
  ; The two floats are loaded individually, at byte offsets 0 and 4 from %1.
  %.sroa.013.0.copyload = load float, ptr %1, align 4, !tbaa !10, !alias.scope !11, !noalias !15
  %.sroa.614.0..sroa_idx = getelementptr inbounds nuw i8, ptr %1, i64 4
  %.sroa.614.0.copyload = load float, ptr %.sroa.614.0..sroa_idx, align 4, !tbaa !10, !alias.scope !11, !noalias !15
  br label %7

; Single-block loop: %8 is a counter starting at 1. The offset-4 float is the
; only loop-carried float; when %8 is even it is replaced (via select) by
; itself plus 1.0, otherwise it is kept. Exits once %8 + 1 reaches 1001.
7:                                                ; preds = %7, %2
  %8 = phi i64 [ 1, %2 ], [ %11, %7 ]
  %.sroa.614.015 = phi float [ %.sroa.614.0.copyload, %2 ], [ %spec.select, %7 ]
  %9 = and i64 %8, 1
  %.not = icmp eq i64 %9, 0
  %10 = fadd float %.sroa.614.015, 1.000000e+00
  %spec.select = select i1 %.not, float %10, float %.sroa.614.015
  %11 = add nuw nsw i64 %8, 1
  %exitcond = icmp eq i64 %11, 1001
  br i1 %exitcond, label %12, label %7

; Exit: add the offset-0 float to %10. On the final iteration %8 is 1000
; (even), so %10 equals %spec.select there.
12:                                               ; preds = %7
  %13 = fadd float %.sroa.013.0.copyload, %10
  ret float %13
}

This second option is significantly faster to execute on the CPU this was targeting (apple-m3).

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions