[X86] Align i128 to 16 bytes in x86 datalayouts

This is an attempt at rebooting https://reviews.llvm.org/D28990 I've included AutoUpgrade changes to modify the data layout to satisfy the compatible layout check. But this does mean alloca, loads, stores, etc in old IR will automatically get this new alignment. This should fix PR46320. Reviewed By: echristo, rnk, tmgross Differential Revision: https://reviews.llvm.org/D86310
llvm · Oct 11, 2023 · a21abc7 · a21abc7
1 parent 84ff49d
commit a21abc7
Show file tree

Hide file tree

Showing 47 changed files with 614 additions and 958 deletions.
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
@@ -817,10 +817,10 @@ class LLVM_LIBRARY_VISIBILITY NaClTargetInfo : public OSTargetInfo<Target> {
       // Handled in ARM's setABI().
     } else if (Triple.getArch() == llvm::Triple::x86) {
       this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-"
-                            "i64:64-n8:16:32-S128");
+                            "i64:64-i128:128-n8:16:32-S128");
     } else if (Triple.getArch() == llvm::Triple::x86_64) {
       this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-"
-                            "i64:64-n8:16:32:64-S128");
+                            "i64:64-i128:128-n8:16:32:64-S128");
     } else if (Triple.getArch() == llvm::Triple::mipsel) {
       // Handled on mips' setDataLayout.
     } else {

diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
@@ -431,13 +431,12 @@ class LLVM_LIBRARY_VISIBILITY X86_32TargetInfo : public X86TargetInfo {
     LongDoubleWidth = 96;
     LongDoubleAlign = 32;
     SuitableAlign = 128;
-    resetDataLayout(
-        Triple.isOSBinFormatMachO()
-            ? "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-"
-              "f80:32-n8:16:32-S128"
-            : "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-"
-              "f80:32-n8:16:32-S128",
-        Triple.isOSBinFormatMachO() ? "_" : "");
+    resetDataLayout(Triple.isOSBinFormatMachO()
+                        ? "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:"
+                          "128-f64:32:64-f80:32-n8:16:32-S128"
+                        : "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:"
+                          "128-f64:32:64-f80:32-n8:16:32-S128",
+                    Triple.isOSBinFormatMachO() ? "_" : "");
     SizeType = UnsignedInt;
     PtrDiffType = SignedInt;
     IntPtrType = SignedInt;
@@ -542,8 +541,9 @@ class LLVM_LIBRARY_VISIBILITY DarwinI386TargetInfo
       UseSignedCharForObjCBool = false;
     SizeType = UnsignedLong;
     IntPtrType = SignedLong;
-    resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-"
-                    "f80:128-n8:16:32-S128", "_");
+    resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-"
+                    "f64:32:64-f80:128-n8:16:32-S128",
+                    "_");
     HasAlignMac68kSupport = true;
   }
 
@@ -570,7 +570,7 @@ class LLVM_LIBRARY_VISIBILITY WindowsX86_32TargetInfo
         getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
     bool IsMSVC = getTriple().isWindowsMSVCEnvironment();
     std::string Layout = IsWinCOFF ? "e-m:x" : "e-m:e";
-    Layout += "-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-";
+    Layout += "-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-";
     Layout += IsMSVC ? "f80:128" : "f80:32";
     Layout += "-n8:16:32-a:0:32-S32";
     resetDataLayout(Layout, IsWinCOFF ? "_" : "");
@@ -621,8 +621,8 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_32TargetInfo : public X86_32TargetInfo {
       : X86_32TargetInfo(Triple, Opts) {
     this->WCharType = TargetInfo::UnsignedShort;
     DoubleAlign = LongLongAlign = 64;
-    resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:"
-                    "32-n8:16:32-a:0:32-S32",
+    resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-"
+                    "i128:128-f80:32-n8:16:32-a:0:32-S32",
                     "_");
   }
 
@@ -660,8 +660,8 @@ class LLVM_LIBRARY_VISIBILITY MCUX86_32TargetInfo : public X86_32TargetInfo {
       : X86_32TargetInfo(Triple, Opts) {
     LongDoubleWidth = 64;
     LongDoubleFormat = &llvm::APFloat::IEEEdouble();
-    resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:32-f64:"
-                    "32-f128:32-n8:16:32-a:0:32-S32");
+    resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:32-"
+                    "f64:32-f128:32-n8:16:32-a:0:32-S32");
     WIntType = UnsignedInt;
   }
 
@@ -721,11 +721,11 @@ class LLVM_LIBRARY_VISIBILITY X86_64TargetInfo : public X86TargetInfo {
 
     // Pointers are 32-bit in x32.
     resetDataLayout(IsX32 ? "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-"
-                            "i64:64-f80:128-n8:16:32:64-S128"
-                          : IsWinCOFF ? "e-m:w-p270:32:32-p271:32:32-p272:64:"
-                                        "64-i64:64-f80:128-n8:16:32:64-S128"
-                                      : "e-m:e-p270:32:32-p271:32:32-p272:64:"
-                                        "64-i64:64-f80:128-n8:16:32:64-S128");
+                            "i64:64-i128:128-f80:128-n8:16:32:64-S128"
+                    : IsWinCOFF ? "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:"
+                                  "64-i128:128-f80:128-n8:16:32:64-S128"
+                                : "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:"
+                                  "64-i128:128-f80:128-n8:16:32:64-S128");
 
     // Use fpret only for long double.
     RealTypeUsesObjCFPRetMask = (unsigned)FloatModeKind::LongDouble;
@@ -922,8 +922,9 @@ class LLVM_LIBRARY_VISIBILITY DarwinX86_64TargetInfo
     llvm::Triple T = llvm::Triple(Triple);
     if (T.isiOS())
       UseSignedCharForObjCBool = false;
-    resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:"
-                    "16:32:64-S128", "_");
+    resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+                    "f80:128-n8:16:32:64-S128",
+                    "_");
   }
 
   bool handleTargetFeatures(std::vector<std::string> &Features,

diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
@@ -1,26 +1,26 @@
 // RUN: %clang_cc1 -triple i686-unknown-unknown -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=I686-UNKNOWN %s
-// I686-UNKNOWN: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128"
+// I686-UNKNOWN: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
 
 // RUN: %clang_cc1 -triple i686-apple-darwin9 -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=I686-DARWIN %s
-// I686-DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:128-n8:16:32-S128"
+// I686-DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128"
 
 // RUN: %clang_cc1 -triple i686-unknown-win32 -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=I686-WIN32 %s
-// I686-WIN32: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32"
+// I686-WIN32: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32"
 
 // RUN: %clang_cc1 -triple i686-unknown-cygwin -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=I686-CYGWIN %s
-// I686-CYGWIN: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
+// I686-CYGWIN: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:32-n8:16:32-a:0:32-S32"
 
 // RUN: %clang_cc1 -triple i686-pc-macho -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=I686-MACHO %s
-// I686-MACHO: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128"
+// I686-MACHO: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
 
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=X86_64 %s
-// X86_64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+// X86_64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 
 // RUN: %clang_cc1 -triple xcore-unknown-unknown -emit-llvm -o - %s | \
 // RUN:     FileCheck --check-prefix=XCORE %s
@@ -92,11 +92,11 @@
 
 // RUN: %clang_cc1 -triple i686-nacl -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=I686-NACL
-// I686-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-n8:16:32-S128"
+// I686-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32-S128"
 
 // RUN: %clang_cc1 -triple x86_64-nacl -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=X86_64-NACL
-// X86_64-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-n8:16:32:64-S128"
+// X86_64-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32:64-S128"
 
 // RUN: %clang_cc1 -triple arm-nacl -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=ARM-NACL

diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
@@ -129,6 +129,11 @@ Changes to the Windows Target
 Changes to the X86 Backend
 --------------------------
 
+* The ``i128`` type now matches GCC and clang's ``__int128`` type. This mainly
+  benefits external projects such as Rust which aim to be binary compatible
+  with C, but also fixes code generation where LLVM already assumed that the
+  type matched and called into libgcc helper functions.
+
 Changes to the OCaml bindings
 -----------------------------
 

diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5201,13 +5201,29 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
   // If the datalayout matches the expected format, add pointer size address
   // spaces to the datalayout.
   std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64";
-  if (!DL.contains(AddrSpaces)) {
+  if (StringRef Ref = Res; !Ref.contains(AddrSpaces)) {
     SmallVector<StringRef, 4> Groups;
     Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)");
-    if (R.match(DL, &Groups))
+    if (R.match(Res, &Groups))
       Res = (Groups[1] + AddrSpaces + Groups[3]).str();
   }
 
+  // i128 values need to be 16-byte-aligned. LLVM already called into libgcc
+  // for i128 operations prior to this being reflected in the data layout, and
+  // clang mostly produced LLVM IR that already aligned i128 to 16 byte
+  // boundaries, so although this is a breaking change, the upgrade is expected
+  // to fix more IR than it breaks.
+  // Intel MCU is an exception and uses 4-byte-alignment.
+  if (!T.isOSIAMCU()) {
+    std::string I128 = "-i128:128";
+    if (StringRef Ref = Res; !Ref.contains(I128)) {
+      SmallVector<StringRef, 4> Groups;
+      Regex R("^(e(-[mpi][^-]*)*)((-[^mpi][^-]*)*)$");
+      if (R.match(Res, &Groups))
+        Res = (Groups[1] + I128 + Groups[3]).str();
+    }
+  }
+
   // For 32-bit MSVC targets, raise the alignment of f80 values to 16 bytes.
   // Raising the alignment is safe because Clang did not produce f80 values in
   // the MSVC environment before this upgrade was added.

diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -130,12 +130,14 @@ static std::string computeDataLayout(const Triple &TT) {
   Ret += "-p270:32:32-p271:32:32-p272:64:64";
 
   // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32.
+  // 128 bit integers are not specified in the 32-bit ABIs but are used
+  // internally for lowering f128, so we match the alignment to that.
   if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl())
-    Ret += "-i64:64";
+    Ret += "-i64:64-i128:128";
   else if (TT.isOSIAMCU())
     Ret += "-i64:32-f64:32";
   else
-    Ret += "-f64:32:64";
+    Ret += "-i128:128-f64:32:64";
 
   // Some ABIs align long double to 128 bits, others to 32.
   if (TT.isOSNaCl() || TT.isOSIAMCU())

diff --git a/llvm/test/Bitcode/upgrade-datalayout.ll b/llvm/test/Bitcode/upgrade-datalayout.ll
@@ -5,5 +5,5 @@
 target datalayout = "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Bitcode/upgrade-datalayout2.ll b/llvm/test/Bitcode/upgrade-datalayout2.ll
@@ -2,6 +2,12 @@
 ; match a possible x86 datalayout.
 ;
 ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s
+;
+; XFAIL: *
+; No implementation of the data layout upgrade ever checked whether the data
+; layout was a possible x86 data layout, so the logic that this test aims to
+; check was never implemented. We always upgraded data layouts that were not
+; possible x86 data layouts, we merely did not previously upgrade this one.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

diff --git a/llvm/test/Bitcode/upgrade-datalayout3.ll b/llvm/test/Bitcode/upgrade-datalayout3.ll
@@ -5,4 +5,4 @@
 target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
 target triple = "i686-pc-windows-msvc"
 
-; CHECK: target datalayout = "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-S32"
+; CHECK: target datalayout = "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-S32"
diff --git a/llvm/test/Bitcode/upgrade-datalayout4.ll b/llvm/test/Bitcode/upgrade-datalayout4.ll
@@ -5,4 +5,4 @@
 target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32"
 target triple = "i686-pc-windows-msvc"
 
-; CHECK: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32"
+; CHECK: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32"
diff --git a/llvm/test/Bitcode/upgrade-datalayout5.ll b/llvm/test/Bitcode/upgrade-datalayout5.ll
@@ -0,0 +1,8 @@
+; Test to make sure datalayout is automatically upgraded.
+;
+; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-pc-linux-gnu"
+
+; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
diff --git a/llvm/test/CodeGen/X86/AMX/amx-config.ll b/llvm/test/CodeGen/X86/AMX/amx-config.ll
@@ -79,10 +79,10 @@ define <4 x i32> @test_api(i32 %0, i16 signext %1, i16 signext %2, <4 x i32> %xm
 ; AVX1-LABEL: test_api:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT:    movups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT:    movw %dx, -{{[0-9]+}}(%rsp)

diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -186,8 +186,8 @@ define void @split_i128(ptr %sret, i128 %x) {
 ; CHECK-NEXT:    pushl %ebx
 ; CHECK-NEXT:    pushl %edi
 ; CHECK-NEXT:    pushl %esi
-; CHECK-NEXT:    andl $-8, %esp
-; CHECK-NEXT:    subl $32, %esp
+; CHECK-NEXT:    andl $-16, %esp
+; CHECK-NEXT:    subl $48, %esp
 ; CHECK-NEXT:    movl 12(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl 16(%ebp), %ebx

diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -182,12 +182,10 @@ define i128 @or128(ptr %p) {
 ; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
-; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    andl $-8, %esp
-; X86-SSE2-NEXT:    subl $16, %esp
-; X86-SSE2-NEXT:    .cfi_offset %esi, -16
-; X86-SSE2-NEXT:    .cfi_offset %edi, -12
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $32, %esp
+; X86-SSE2-NEXT:    .cfi_offset %esi, -12
 ; X86-SSE2-NEXT:    movl 8(%ebp), %esi
 ; X86-SSE2-NEXT:    movl %esp, %eax
 ; X86-SSE2-NEXT:    pushl $0
@@ -198,18 +196,11 @@ define i128 @or128(ptr %p) {
 ; X86-SSE2-NEXT:    pushl %eax
 ; X86-SSE2-NEXT:    calll __sync_fetch_and_or_16
 ; X86-SSE2-NEXT:    addl $20, %esp
-; X86-SSE2-NEXT:    movl (%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl %edi, 8(%esi)
-; X86-SSE2-NEXT:    movl %edx, 12(%esi)
-; X86-SSE2-NEXT:    movl %eax, (%esi)
-; X86-SSE2-NEXT:    movl %ecx, 4(%esi)
+; X86-SSE2-NEXT:    movaps (%esp), %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, (%esi)
 ; X86-SSE2-NEXT:    movl %esi, %eax
-; X86-SSE2-NEXT:    leal -8(%ebp), %esp
+; X86-SSE2-NEXT:    leal -4(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
-; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-SSE2-NEXT:    retl $4
@@ -223,7 +214,7 @@ define i128 @or128(ptr %p) {
 ; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-SLM-NEXT:    pushl %edi
 ; X86-SLM-NEXT:    pushl %esi
-; X86-SLM-NEXT:    andl $-8, %esp
+; X86-SLM-NEXT:    andl $-16, %esp
 ; X86-SLM-NEXT:    subl $16, %esp
 ; X86-SLM-NEXT:    .cfi_offset %esi, -16
 ; X86-SLM-NEXT:    .cfi_offset %edi, -12
@@ -263,7 +254,7 @@ define i128 @or128(ptr %p) {
 ; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-ATOM-NEXT:    pushl %edi
 ; X86-ATOM-NEXT:    pushl %esi
-; X86-ATOM-NEXT:    andl $-8, %esp
+; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    .cfi_offset %esi, -16
 ; X86-ATOM-NEXT:    .cfi_offset %edi, -12
@@ -528,8 +519,8 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-SSE2-NEXT:    .cfi_offset %ebp, -8
 ; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    .cfi_def_cfa_register %ebp
-; X86-SSE2-NEXT:    andl $-8, %esp
-; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $32, %esp
 ; X86-SSE2-NEXT:    movl %esp, %eax
 ; X86-SSE2-NEXT:    pushl $0
 ; X86-SSE2-NEXT:    pushl $0
@@ -551,8 +542,8 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-SLM-NEXT:    .cfi_offset %ebp, -8
 ; X86-SLM-NEXT:    movl %esp, %ebp
 ; X86-SLM-NEXT:    .cfi_def_cfa_register %ebp
-; X86-SLM-NEXT:    andl $-8, %esp
-; X86-SLM-NEXT:    subl $16, %esp
+; X86-SLM-NEXT:    andl $-16, %esp
+; X86-SLM-NEXT:    subl $32, %esp
 ; X86-SLM-NEXT:    movl 8(%ebp), %eax
 ; X86-SLM-NEXT:    movl %esp, %ecx
 ; X86-SLM-NEXT:    pushl $0
@@ -575,7 +566,7 @@ define void @or128_nouse_seq_cst(ptr %p) {
 ; X86-ATOM-NEXT:    .cfi_offset %ebp, -8
 ; X86-ATOM-NEXT:    leal (%esp), %ebp
 ; X86-ATOM-NEXT:    .cfi_def_cfa_register %ebp
-; X86-ATOM-NEXT:    andl $-8, %esp
+; X86-ATOM-NEXT:    andl $-16, %esp
 ; X86-ATOM-NEXT:    leal -{{[0-9]+}}(%esp), %esp
 ; X86-ATOM-NEXT:    movl 8(%ebp), %eax
 ; X86-ATOM-NEXT:    movl %esp, %ecx