diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index c3cf714ce6c1..2dc562932845 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -19,6 +19,7 @@
 PROJECT_DEPENDENCIES = {
     "llvm": set(),
     "clang": {"llvm"},
+    "CIR": {"clang", "mlir"},
     "bolt": {"clang", "lld", "llvm"},
     "clang-tools-extra": {"clang", "llvm"},
     "compiler-rt": {"clang", "lld"},
@@ -55,6 +56,7 @@
     ".ci": {
         "llvm",
         "clang",
+        "CIR",
         "lld",
         "lldb",
         "bolt",
@@ -128,6 +130,7 @@
     "lldb": "check-lldb",
     "llvm": "check-llvm",
     "clang": "check-clang",
+    "CIR": "check-clang-cir",
     "bolt": "check-bolt",
     "lld": "check-lld",
     "flang": "check-flang",
@@ -141,6 +144,23 @@
 
 RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"}
 
+# Meta projects are projects that need explicit handling but do not reside
+# in their own top-level folder. To add a meta project, map the leading
+# components of its path (as a tuple) to the name of the project below; a "*"
+# component matches any one path component. Multiple paths can share a project.
+META_PROJECTS = {
+    ("clang", "lib", "CIR"): "CIR",
+    ("clang", "test", "CIR"): "CIR",
+    ("clang", "include", "clang", "CIR"): "CIR",
+    ("*", "docs"): "docs",
+    ("llvm", "utils", "gn"): "gn",
+    (".github", "workflows", "premerge.yaml"): ".ci",
+    ("third-party",): ".ci",
+}
+
+# Projects whose changes should not run any tests. Must be META_PROJECTS values.
+SKIP_PROJECTS = ["docs", "gn"]
+
 
 def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
     projects_with_dependents = set(projects)
@@ -233,21 +253,34 @@ def _compute_runtimes_to_build(
     return _exclude_projects(runtimes_to_build, platform)
 
 
+def _path_matches(matcher: tuple[str, ...], file_path: tuple[str, ...]) -> bool:
+    """Return True if matcher is a prefix of file_path; "*" matches any part."""
+    # A path shorter than the matcher cannot contain it as a prefix.
+    if len(file_path) < len(matcher):
+        return False
+    return all(
+        match_part == "*" or file_part == "*" or match_part == file_part
+        for match_part, file_part in zip(matcher, file_path)
+    )
+
+
+def _get_modified_projects_for_file(modified_file: str) -> Set[str]:
+    """Return the top-level project and any meta projects for modified_file."""
+    path_parts = pathlib.Path(modified_file).parts
+    modified_projects = {path_parts[0]}
+    for path_prefix, meta_project in META_PROJECTS.items():
+        if _path_matches(path_prefix, path_parts):
+            # Files of skipped meta projects (e.g. docs) contribute nothing.
+            if meta_project in SKIP_PROJECTS:
+                return set()
+            modified_projects.add(meta_project)
+    return modified_projects
+
+
 def _get_modified_projects(modified_files: list[str]) -> Set[str]:
     modified_projects = set()
     for modified_file in modified_files:
-        path_parts = pathlib.Path(modified_file).parts
-        # Exclude files in the docs directory. They do not impact an test
-        # targets and there is a separate workflow used for ensuring the
-        # documentation builds.
-        if len(path_parts) > 2 and path_parts[1] == "docs":
-            continue
-        # Exclude files for the gn build. We do not test it within premerge
-        # and changes occur often enough that they otherwise take up
-        # capacity.
-        if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"):
-            continue
-        modified_projects.add(pathlib.Path(modified_file).parts[0])
+        modified_projects.update(_get_modified_projects_for_file(modified_file))
     return modified_projects
 
 
@@ -267,6 +300,13 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
         runtimes_to_test_needs_reconfig
     )
+
+    # CIR is used as a pseudo-project in this script. It is built as part of the
+    # clang build, but it requires an explicit option to enable. We set that
+    # option here, and remove it from the projects_to_build list.
+    enable_cir = "ON" if "CIR" in projects_to_build else "OFF"
+    projects_to_build.discard("CIR")
+
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;). We use spaces to separate the check targets as they end up getting
@@ -279,6 +319,7 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "runtimes_check_targets_needs_reconfig": " ".join(
             sorted(runtimes_check_targets_needs_reconfig)
         ),
+        "enable_cir": enable_cir,
     }
 
 
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6299931e1ec3..11c4aea9b4e3 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -1,7 +1,7 @@
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Does some stuff."""
+"""Tests for compute_projects.py"""
 
 import unittest
 
@@ -104,6 +104,10 @@ def test_clang(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(
+            env_variables["enable_cir"],
+            "OFF",
+        )
 
     def test_clang_windows(self):
         env_variables = compute_projects.get_env_variables(
@@ -126,6 +130,32 @@ def test_clang_windows(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(env_variables["enable_cir"], "OFF")
+
+    def test_cir(self):
+        env_variables = compute_projects.get_env_variables(
+            ["clang/lib/CIR/CMakeLists.txt"], "Linux"  # a CIR meta project path
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "clang;clang-tools-extra;lld;llvm;mlir",  # CIR itself is never listed
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-clang check-clang-cir check-clang-tools",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+        self.assertEqual(env_variables["enable_cir"], "ON")
 
     def test_bolt(self):
         env_variables = compute_projects.get_env_variables(
@@ -158,6 +188,7 @@ def test_mlir(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")
 
     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -168,10 +199,11 @@ def test_flang(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")
 
     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
-            ["third-party/benchmark/CMakeLists.txt"], "Linux"
+            ["llvm-libgcc/CMakeLists.txt"], "Linux"
         )
         self.assertEqual(env_variables["projects_to_build"], "")
         self.assertEqual(env_variables["project_check_targets"], "")
@@ -237,7 +269,7 @@ def test_ci(self):
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
         )
         self.assertEqual(
             env_variables["runtimes_to_build"],
@@ -276,6 +308,66 @@ def test_clang_tools_extra(self):
         self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
+    def test_premerge_workflow(self):
+        env_variables = compute_projects.get_env_variables(
+            [".github/workflows/premerge.yaml"], "Linux"  # maps to ".ci"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"],
+            "compiler-rt;libc;libcxx;libcxxabi;libunwind",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt check-libc",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+
+    def test_other_github_workflow(self):
+        env_variables = compute_projects.get_env_variables(
+            [".github/workflows/docs.yml"], "Linux"  # matches no meta project
+        )
+        self.assertEqual(env_variables["projects_to_build"], "")
+        self.assertEqual(env_variables["project_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_to_build"], "")
+        self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+
+    def test_third_party_benchmark(self):
+        env_variables = compute_projects.get_env_variables(
+            ["third-party/benchmark/CMakeLists.txt"], "Linux"  # maps to ".ci"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"],
+            "compiler-rt;libc;libcxx;libcxxabi;libunwind",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt check-libc",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 143e6ab4cf46..26fdeef1913a 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,3 +1,13 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Collects Github metrics and uploads them to Grafana.
+
+This script contains machinery that will pull metrics periodically from Github
+about workflow runs. It will upload the collected metrics to the specified
+Grafana instance.
+"""
+
 import collections
 import datetime
 import github
diff --git a/.ci/metrics/metrics_test.py b/.ci/metrics/metrics_test.py
new file mode 100644
index 000000000000..259e55f81793
--- /dev/null
+++ b/.ci/metrics/metrics_test.py
@@ -0,0 +1,75 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tests for metrics.py"""
+
+from dataclasses import dataclass
+import requests
+import unittest
+import unittest.mock
+
+import metrics
+
+
+class TestMetrics(unittest.TestCase):
+    def test_upload_gauge_metric(self):
+        """Test that we can upload a gauge metric correctly.
+
+        Also verify that we pass around parameters like API keys and user IDs
+        correctly to the HTTP POST request.
+        """
+        test_metrics = [metrics.GaugeMetric("gauge_test", 5, 1000)]
+        return_value = requests.Response()
+        return_value.status_code = 204
+        with unittest.mock.patch(
+            "requests.post", return_value=return_value
+        ) as post_mock:
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+            self.assertSequenceEqual(post_mock.call_args.args, [metrics.GRAFANA_URL])
+            self.assertEqual(
+                post_mock.call_args.kwargs["data"], "gauge_test value=5 1000"
+            )
+            self.assertEqual(
+                post_mock.call_args.kwargs["auth"], ("test_userid", "test_api_key")
+            )
+
+    def test_upload_job_metric(self):
+        """Test that we can upload a job metric correctly."""
+        test_metrics = [
+            metrics.JobMetrics("test_job", 5, 10, 1, 1000, 7, "test_workflow")
+        ]
+        return_value = requests.Response()
+        return_value.status_code = 204
+        with unittest.mock.patch(
+            "requests.post", return_value=return_value
+        ) as post_mock:
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+            self.assertEqual(
+                post_mock.call_args.kwargs["data"],
+                "test_job queue_time=5,run_time=10,status=1 1000",
+            )
+
+    def test_upload_unknown_metric(self):
+        """Test we report an error if we encounter an unknown metric type."""
+
+        @dataclass
+        class FakeMetric:
+            fake_data: str
+
+        test_metrics = [FakeMetric("test")]
+
+        with self.assertRaises(ValueError):
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+
+    def test_bad_response_code(self):
+        """Test that we gracefully handle HTTP response errors."""
+        test_metrics = [metrics.GaugeMetric("gauge_test", 5, 1000)]
+        return_value = requests.Response()
+        return_value.status_code = 403
+        # Just assert that we continue running here and do not raise anything.
+        with unittest.mock.patch("requests.post", return_value=return_value):
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 8d1faab13986..6db24d894eb7 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -21,12 +21,7 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"
 INSTALL_DIR="${BUILD_DIR}/install"
 rm -rf "${BUILD_DIR}"
 
-ccache --zero-stats
-
-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing cache"
-  ccache --clear
-fi
+sccache --zero-stats
 
 mkdir -p artifacts/reproducers
 
@@ -36,7 +31,7 @@ export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers`
 function at-exit {
   retcode=$?
 
-  ccache --print-stats > artifacts/ccache_stats.txt
+  sccache --show-stats > artifacts/sccache_stats.txt
   cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log
   cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || :
 
@@ -53,6 +48,7 @@ targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
 runtime_targets_needs_reconfig="${5}"
+enable_cir="${6}"
 
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"
 
@@ -72,13 +68,15 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -G Ninja \
       -D CMAKE_PREFIX_PATH="${HOME}/.local" \
       -D CMAKE_BUILD_TYPE=Release \
+      -D CLANG_ENABLE_CIR=${enable_cir} \
       -D LLVM_ENABLE_ASSERTIONS=ON \
       -D LLVM_BUILD_EXAMPLES=ON \
       -D COMPILER_RT_BUILD_LIBFUZZER=OFF \
       -D LLVM_LIT_ARGS="${lit_args}" \
       -D LLVM_ENABLE_LLD=ON \
       -D CMAKE_CXX_FLAGS=-gmlt \
-      -D LLVM_CCACHE_BUILD=ON \
+      -D CMAKE_C_COMPILER_LAUNCHER=sccache \
+      -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
       -D LIBCXX_CXX_ABI=libcxxabi \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
       -D LLDB_ENABLE_PYTHON=ON \
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 176350fac604..50a741677d73 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -21,11 +21,6 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"
 
 rm -rf "${BUILD_DIR}"
 
-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing sccache"
-  rm -rf "$SCCACHE_DIR"
-fi
-
 sccache --zero-stats
 function at-exit {
   retcode=$?
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
deleted file mode 100644
index ce34d2337e9c..000000000000
--- a/.github/workflows/README.md
+++ /dev/null
@@ -1 +0,0 @@
-Github action workflows should be stored in this directory.
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
deleted file mode 100644
index 59079f057d02..000000000000
--- a/.github/workflows/build-ci-container-windows.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-name: Build Windows CI Container
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-ci-container-windows.yml
-      - '.github/workflows/containers/github-action-ci-windows/**'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-ci-container-windows.yml
-      - '.github/workflows/containers/github-action-ci-windows/**'
-
-jobs:
-  build-ci-container-windows:
-    if: github.repository_owner == 'llvm'
-    runs-on: windows-2022
-    outputs:
-      container-name: ${{ steps.vars.outputs.container-name }}
-      container-name-tag: ${{ steps.vars.outputs.container-name-tag }}
-      container-filename: ${{ steps.vars.outputs.container-filename }}
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: .github/workflows/containers/github-action-ci-windows
-      - name: Write Variables
-        id: vars
-        run: |
-          $tag = [int64](Get-Date -UFormat %s)
-          $container_name="ghcr.io/$env:GITHUB_REPOSITORY_OWNER/ci-windows-2022"
-          echo "container-name=${container_name}" >> $env:GITHUB_OUTPUT
-          echo "container-name-tag=${container_name}:${tag}" >> $env:GITHUB_OUTPUT
-          echo "container-filename=ci-windows-${tag}.tar" >> $env:GITHUB_OUTPUT
-      - name: Build Container
-        working-directory: .github/workflows/containers/github-action-ci-windows
-        run: |
-          docker build -t ${{ steps.vars.outputs.container-name-tag }} .
-      - name: Save container image
-        run: |
-          docker save  ${{ steps.vars.outputs.container-name-tag }} >  ${{ steps.vars.outputs.container-filename }}
-      - name: Upload container image
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: container
-          path: ${{ steps.vars.outputs.container-filename }}
-          retention-days: 14
-  
-  push-ci-container:
-    if: github.event_name == 'push'
-    needs:
-      - build-ci-container-windows
-    permissions:
-      packages: write
-    runs-on: windows-2022
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-      - name: Download container
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
-        with:
-          name: container
-      - name: Push Container
-        run: |
-          docker load -i ${{ needs.build-ci-container-windows.outputs.container-filename }}
-          docker tag ${{ needs.build-ci-container-windows.outputs.container-name-tag }} ${{ needs.build-ci-container-windows.outputs.container-name }}:latest
-          docker login -u ${{ github.actor }} -p $env:GITHUB_TOKEN ghcr.io
-          docker push ${{ needs.build-ci-container-windows.outputs.container-name-tag }}
-          docker push ${{ needs.build-ci-container-windows.outputs.container-name }}:latest
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
deleted file mode 100644
index 3159aae32ca5..000000000000
--- a/.github/workflows/build-ci-container.yml
+++ /dev/null
@@ -1,119 +0,0 @@
-name: Build CI Container
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-ci-container.yml
-      - '.github/workflows/containers/github-action-ci/**'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-ci-container.yml
-      - '.github/workflows/containers/github-action-ci/**'
-
-jobs:
-  build-ci-container:
-    if: github.repository_owner == 'llvm'
-    runs-on: ${{ matrix.runs-on }}
-    strategy:
-      matrix:
-        include:
-          # The arch names should match the names used on dockerhub.
-          # See https://github.com/docker-library/official-images#architectures-other-than-amd64
-          - arch: amd64
-            runs-on: depot-ubuntu-24.04-16
-          - arch: arm64v8
-            runs-on: depot-ubuntu-24.04-arm-16
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: .github/workflows/containers/github-action-ci/
-      # podman is not installed by default on the ARM64 images.
-      - name: Install Podman
-        if: runner.arch == 'ARM64'
-        run: |
-          sudo apt-get install podman
-      - name: Write Variables
-        id: vars
-        run: |
-          tag=$(git rev-parse --short=12 HEAD)
-          container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/${{ matrix.arch }}/ci-ubuntu-24.04"
-          echo "container-name=$container_name" >> $GITHUB_OUTPUT
-          echo "container-name-agent=$container_name-agent" >> $GITHUB_OUTPUT
-          echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
-          echo "container-name-agent-tag=$container_name-agent:$tag" >> $GITHUB_OUTPUT
-          echo "container-filename=$(echo $container_name:$tag  | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-          echo "container-agent-filename=$(echo $container_name-agent:$tag  | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-      - name: Build container
-        working-directory: ./.github/workflows/containers/github-action-ci/
-        run: |
-          podman build --target ci-container -t ${{ steps.vars.outputs.container-name-tag }} .
-          podman build --target ci-container-agent -t ${{ steps.vars.outputs.container-name-agent-tag }} .
-
-      # Save the container so we have it in case the push fails.  This also
-      # allows us to separate the push step into a different job so we can
-      # maintain minimal permissions while building the container.
-      - name: Save container image
-        run: |
-          podman save ${{ steps.vars.outputs.container-name-tag }}  >  ${{ steps.vars.outputs.container-filename }}
-          podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }}
-
-      - name: Upload container image
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: container-${{ matrix.arch }}
-          path: "*.tar"
-          retention-days: 14
-
-      - name: Test Container
-        run: |
-          for image in ${{ steps.vars.outputs.container-name-tag }}; do
-            # Use --pull=never to ensure we are testing the just built image.
-            podman run --pull=never --rm -it $image /usr/bin/bash -x -c 'cd $HOME && printf '\''#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }'\'' | clang++ -x c++ - && ./a.out | grep Hello'
-          done
-
-  push-ci-container:
-    if: github.event_name == 'push'
-    needs:
-      - build-ci-container
-    permissions:
-      packages: write
-    runs-on: ubuntu-24.04
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-      - name: Download container
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
-
-      - name: Push Container
-        run: |
-          function push_container {
-            image_name=$1
-            latest_name=$(echo $image_name | sed 's/:[a-f0-9]\+$/:latest/g')
-            podman tag $image_name $latest_name
-            echo "Pushing $image_name ..."
-            podman push $image_name
-            echo "Pushing $latest_name ..."
-            podman push $latest_name
-          }
-
-          podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
-          for f in $(find . -iname *.tar); do
-            image_name=$(podman load -q -i $f | sed 's/Loaded image: //g')
-            push_container $image_name
-
-            if echo $image_name | grep '/amd64/'; then
-              # For amd64, create an alias with the arch component removed.
-              # This matches the convention used on dockerhub.
-              default_image_name=$(echo $(dirname $(dirname $image_name))/$(basename $image_name))
-              podman tag $image_name $default_image_name
-              push_container $default_image_name
-            fi
-          done
diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml
deleted file mode 100644
index af4d599f7641..000000000000
--- a/.github/workflows/build-metrics-container.yml
+++ /dev/null
@@ -1,78 +0,0 @@
-name: Build Metrics Container
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-metrics-container.yml
-      - '.ci/metrics/**'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - .github/workflows/build-metrics-container.yml
-      - '.ci/metrics/**'
-
-jobs:
-  build-metrics-container:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    outputs:
-      container-name: ${{ steps.vars.outputs.container-name }}
-      container-name-tag: ${{ steps.vars.outputs.container-name-tag }}
-      container-filename: ${{ steps.vars.outputs.container-filename }}
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: .ci/metrics/
-      - name: Write Variables
-        id: vars
-        run: |
-          tag=`date +%s`
-          container_name="ghcr.io/$GITHUB_REPOSITORY_OWNER/metrics"
-          echo "container-name=$container_name" >> $GITHUB_OUTPUT
-          echo "container-name-tag=$container_name:$tag" >> $GITHUB_OUTPUT
-          echo "container-filename=$(echo $container_name:$tag  | sed -e 's/\//-/g' -e 's/:/-/g').tar" >> $GITHUB_OUTPUT
-      - name: Build Container
-        working-directory: ./.ci/metrics
-        run: |
-          podman build -t ${{ steps.vars.outputs.container-name-tag }} -f Dockerfile .
-      # Save the container so we have it in case the push fails.  This also
-      # allows us to separate the push step into a different job so we can
-      # maintain minimal permissions while building the container.
-      - name: Save Container Image
-        run: |
-          podman save  ${{ steps.vars.outputs.container-name-tag }} >  ${{ steps.vars.outputs.container-filename }}
-      - name: Upload Container Image
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: container
-          path: ${{ steps.vars.outputs.container-filename }}
-          retention-days: 14
-  
-  push-metrics-container:
-    if: github.event_name == 'push'
-    needs:
-      - build-metrics-container
-    permissions:
-      packages: write
-    runs-on: ubuntu-24.04
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-      - name: Download Container
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
-        with:
-          name: container
-      - name: Push Container
-        run: |
-          podman load -i ${{ needs.build-metrics-container.outputs.container-filename }}
-          podman tag ${{ needs.build-metrics-container.outputs.container-name-tag }} ${{ needs.build-metrics-container.outputs.container-name }}:latest
-          podman login -u ${{ github.actor }} -p $GITHUB_TOKEN ghcr.io
-          podman push ${{ needs.build-metrics-container.outputs.container-name-tag }}
-          podman push ${{ needs.build-metrics-container.outputs.container-name }}:latest
diff --git a/.github/workflows/ci-post-commit-analyzer-run.py b/.github/workflows/ci-post-commit-analyzer-run.py
deleted file mode 100644
index e5f52d3b2fa6..000000000000
--- a/.github/workflows/ci-post-commit-analyzer-run.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import json
-import multiprocessing
-import os
-import re
-import subprocess
-import sys
-
-
-def run_analyzer(data):
-    os.chdir(data["directory"])
-    command = (
-        data["command"]
-        + f" --analyze --analyzer-output html -o analyzer-results -Xclang -analyzer-config -Xclang max-nodes=75000"
-    )
-    print(command)
-    subprocess.run(command, shell=True, check=True)
-
-
-def pool_error(e):
-    print("Error analyzing file:", e)
-
-
-def main():
-    db_path = sys.argv[1]
-    database = json.load(open(db_path))
-
-    with multiprocessing.Pool() as pool:
-        pool.map_async(run_analyzer, [k for k in database], error_callback=pool_error)
-        pool.close()
-        pool.join()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
deleted file mode 100644
index b8074859d23a..000000000000
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: Post-Commit Static Analyzer
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'clang/**'
-      - 'llvm/**'
-      - '.github/workflows/ci-post-commit-analyzer.yml'
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      - closed
-    paths:
-      - '.github/workflows/ci-post-commit-analyzer.yml'
-      - '.github/workflows/ci-post-commit-analyzer-run.py'
-  schedule:
-    - cron: '30 0 * * *'
-
-concurrency:
-  group: >-
-    llvm-project-${{ github.workflow }}-${{ github.event_name == 'pull_request' &&
-      ( github.event.pull_request.number || github.ref) }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  post-commit-analyzer:
-    if: >-
-      github.repository_owner == 'llvm' &&
-      github.event.action != 'closed'
-    runs-on: ubuntu-24.04
-    container:
-      image: 'ghcr.io/llvm/ci-ubuntu-24.04:latest'
-    env:
-      LLVM_VERSION: 18
-    steps:
-      - name: Checkout Source
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          # A full build of llvm, clang, lld, and lldb takes about 250MB
-          # of ccache space. There's not much reason to have more than this,
-          # because we usually won't need to save cache entries from older
-          # builds.  Also, there is an overall 10GB cache limit, and each
-          # run creates a new cache entry so we want to ensure that we have
-          # enough cache space for all the tests to run at once and still
-          # fit under the 10 GB limit.
-          # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
-          max-size: 2G
-          key: post-commit-analyzer
-          variant: sccache
-
-      - name: Configure
-        run: |
-              cmake -B build -S llvm -G Ninja \
-                  -DLLVM_ENABLE_ASSERTIONS=ON \
-                  -DLLVM_ENABLE_PROJECTS=clang \
-                  -DLLVM_BUILD_LLVM_DYLIB=ON \
-                  -DLLVM_LINK_LLVM_DYLIB=ON \
-                  -DCMAKE_CXX_COMPILER=clang++ \
-                  -DCMAKE_C_COMPILER=clang \
-                  -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
-                  -DCMAKE_C_COMPILER_LAUNCHER=sccache \
-                  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-                  -DLLVM_INCLUDE_TESTS=OFF \
-                  -DCLANG_INCLUDE_TESTS=OFF \
-                  -DCMAKE_BUILD_TYPE=Release
-
-      - name: Build
-        run: |
-          # FIXME: We need to build all the generated header files in order to be able to run
-          # the analyzer on every file.  Building libLLVM and libclang is probably overkill for
-          # this, but it's better than building every target.
-          ninja -v -C build libLLVM.so libclang.so
-
-          # Run the analyzer.
-          python3 .github/workflows/ci-post-commit-analyzer-run.py build/compile_commands.json
-
-          scan-build --generate-index-only build/analyzer-results
-
-      - name: Upload Results
-        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-        if: always()
-        with:
-          name: analyzer-results
-          path: 'build/analyzer-results/*'
-
diff --git a/.github/workflows/commit-access-greeter.yml b/.github/workflows/commit-access-greeter.yml
deleted file mode 100644
index a5fbbbb94e22..000000000000
--- a/.github/workflows/commit-access-greeter.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: Commit Access Greeter
-
-on:
-  issues:
-    types:
-      - labeled
-
-permissions:
-  contents: read
-
-jobs:
-  commit-access-greeter:
-    permissions:
-      issues: write
-      pull-requests: read
-    if: >-
-      github.repository_owner == 'llvm' &&
-      github.event.label.name == 'infra:commit-access-request'
-    runs-on: ubuntu-24.04
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
-        with:
-          sparse-checkout: llvm/utils/git/
-
-      - name: Setup Automation Script
-        working-directory: ./llvm/utils/git/
-        run: |
-          pip install --require-hashes -r requirements.txt
-
-      - name: Add comments to issue
-        working-directory: ./llvm/utils/git/
-        env:
-          LABEL_NAME: ${{ github.event.label.name }}
-          GITHUB_TOKEN: ${{ github.token }}
-          ISSUE_NUMBER: ${{ github.event.issue.number }}
-        run: |
-          python3 ./github-automation.py \
-            --token $GITHUB_TOKEN \
-             commit-request-greeter \
-             --issue-number $ISSUE_NUMBER
diff --git a/.github/workflows/commit-access-review.py b/.github/workflows/commit-access-review.py
deleted file mode 100644
index 4f539fe98004..000000000000
--- a/.github/workflows/commit-access-review.py
+++ /dev/null
@@ -1,402 +0,0 @@
-#!/usr/bin/env python3
-# ===-- commit-access-review.py  --------------------------------------------===#
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# ===------------------------------------------------------------------------===#
-#
-# ===------------------------------------------------------------------------===#
-
-import datetime
-import github
-import re
-import requests
-import time
-import sys
-import re
-
-
-class User:
-    THRESHOLD = 5
-
-    def __init__(self, name, triage_list):
-        self.name = name
-        self.authored = 0
-        self.merged = 0
-        self.reviewed = 0
-        self.triage_list = triage_list
-
-    def add_authored(self, val=1):
-        self.authored += val
-        if self.meets_threshold():
-            print(self.name, "meets the threshold with authored commits")
-            del self.triage_list[self.name]
-
-    def set_authored(self, val):
-        self.authored = 0
-        self.add_authored(val)
-
-    def add_merged(self, val=1):
-        self.merged += val
-        if self.meets_threshold():
-            print(self.name, "meets the threshold with merged commits")
-            del self.triage_list[self.name]
-
-    def add_reviewed(self, val=1):
-        self.reviewed += val
-        if self.meets_threshold():
-            print(self.name, "meets the threshold with reviewed commits")
-            del self.triage_list[self.name]
-
-    def get_total(self):
-        return self.authored + self.merged + self.reviewed
-
-    def meets_threshold(self):
-        return self.get_total() >= self.THRESHOLD
-
-    def __repr__(self):
-        return "{} : a: {} m: {} r: {}".format(
-            self.name, self.authored, self.merged, self.reviewed
-        )
-
-
-def check_manual_requests(
-    gh: github.Github, start_date: datetime.datetime
-) -> list[str]:
-    """
-    Return a list of users who have been asked since ``start_date`` if they
-    want to keep their commit access or if they have applied for commit
-    access since ``start_date``
-    """
-
-    query = """
-        query ($query: String!, $after: String) {
-          search(query: $query, type: ISSUE, first: 100, after: $after) {
-            nodes {
-              ... on Issue {
-                author {
-                  login
-                }
-                body
-              }
-            }
-            pageInfo {
-              hasNextPage
-              endCursor
-            }
-          }
-        }
-        """
-    formatted_start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S")
-    variables = {
-        "query": f"type:issue created:>{formatted_start_date} org:llvm repo:llvm-project label:infra:commit-access,infra:commit-access-request"
-    }
-
-    has_next_page = True
-    users = []
-    while has_next_page:
-        res_header, res_data = gh._Github__requester.graphql_query(
-            query=query, variables=variables
-        )
-        data = res_data["data"]
-        for issue in data["search"]["nodes"]:
-            users.extend([user[1:] for user in re.findall("@[^ ,\n]+", issue["body"])])
-            if issue["author"]:
-                users.append(issue["author"]["login"])
-        has_next_page = data["search"]["pageInfo"]["hasNextPage"]
-        if has_next_page:
-            variables["after"] = data["search"]["pageInfo"]["endCursor"]
-    return users
-
-
-def get_num_commits(gh: github.Github, user: str, start_date: datetime.datetime) -> int:
-    """
-    Get number of commits that ``user`` has been made since ``start_date`.
-    """
-    variables = {
-        "owner": "llvm",
-        "user": user,
-        "start_date": start_date.strftime("%Y-%m-%dT%H:%M:%S"),
-    }
-
-    user_query = """
-        query ($user: String!) {
-          user(login: $user) {
-            id
-          }
-        }
-    """
-
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=user_query, variables=variables
-    )
-    data = res_data["data"]
-    variables["user_id"] = data["user"]["id"]
-
-    query = """
-        query ($owner: String!, $user_id: ID!, $start_date: GitTimestamp!){
-          organization(login: $owner) {
-            teams(query: "llvm-committers" first:1) {
-              nodes {
-                repositories {
-                  nodes {
-                    ref(qualifiedName: "main") {
-                      target {
-                        ... on Commit {
-                          history(since: $start_date, author: {id: $user_id }) {
-                            totalCount
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-     """
-    count = 0
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=query, variables=variables
-    )
-    data = res_data["data"]
-    for repo in data["organization"]["teams"]["nodes"][0]["repositories"]["nodes"]:
-        count += int(repo["ref"]["target"]["history"]["totalCount"])
-        if count >= User.THRESHOLD:
-            break
-    return count
-
-
-def is_new_committer_query_repo(
-    gh: github.Github, user: str, start_date: datetime.datetime
-) -> bool:
-    """
-    Determine if ``user`` is a new committer.  A new committer can keep their
-    commit access even if they don't meet the criteria.
-    """
-    variables = {
-        "user": user,
-    }
-
-    user_query = """
-        query ($user: String!) {
-          user(login: $user) {
-            id
-          }
-        }
-    """
-
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=user_query, variables=variables
-    )
-    data = res_data["data"]
-    variables["owner"] = "llvm"
-    variables["user_id"] = data["user"]["id"]
-    variables["start_date"] = start_date.strftime("%Y-%m-%dT%H:%M:%S")
-
-    query = """
-        query ($owner: String!, $user_id: ID!){
-          organization(login: $owner) {
-            repository(name: "llvm-project") {
-              ref(qualifiedName: "main") {
-                target {
-                  ... on Commit {
-                    history(author: {id: $user_id }, first: 5) {
-                      nodes {
-                        committedDate
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-     """
-
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=query, variables=variables
-    )
-    data = res_data["data"]
-    repo = data["organization"]["repository"]
-    commits = repo["ref"]["target"]["history"]["nodes"]
-    if len(commits) == 0:
-        return True
-    committed_date = commits[-1]["committedDate"]
-    if datetime.datetime.strptime(committed_date, "%Y-%m-%dT%H:%M:%SZ") < start_date:
-        return False
-    return True
-
-
-def is_new_committer(
-    gh: github.Github, user: str, start_date: datetime.datetime
-) -> bool:
-    """
-    Wrapper around is_new_commiter_query_repo to handle exceptions.
-    """
-    try:
-        return is_new_committer_query_repo(gh, user, start_date)
-    except:
-        pass
-    return True
-
-
-def get_review_count(
-    gh: github.Github, user: str, start_date: datetime.datetime
-) -> int:
-    """
-    Return the number of reviews that ``user`` has done since ``start_date``.
-    """
-    query = """
-        query ($query: String!) {
-          search(query: $query, type: ISSUE, first: 5) {
-            issueCount
-          }
-        }
-        """
-    formatted_start_date = start_date.strftime("%Y-%m-%dT%H:%M:%S")
-    variables = {
-        "owner": "llvm",
-        "repo": "llvm-project",
-        "user": user,
-        "query": f"type:pr commenter:{user} -author:{user} merged:>{formatted_start_date} org:llvm",
-    }
-
-    res_header, res_data = gh._Github__requester.graphql_query(
-        query=query, variables=variables
-    )
-    data = res_data["data"]
-    return int(data["search"]["issueCount"])
-
-
-def count_prs(gh: github.Github, triage_list: dict, start_date: datetime.datetime):
-    """
-    Fetch all the merged PRs for the project since ``start_date`` and update
-    ``triage_list`` with the number of PRs merged for each user.
-    """
-
-    query = """
-        query ($query: String!, $after: String) {
-          search(query: $query, type: ISSUE, first: 100, after: $after) {
-            issueCount,
-            nodes {
-              ... on PullRequest {
-                 author {
-                   login
-                 }
-                 mergedBy {
-                   login
-                 }
-              }
-            }
-            pageInfo {
-              hasNextPage
-              endCursor
-            }
-          }
-        }
-    """
-    date_begin = start_date
-    date_end = None
-    while date_begin < datetime.datetime.now():
-        date_end = date_begin + datetime.timedelta(days=7)
-        formatted_date_begin = date_begin.strftime("%Y-%m-%dT%H:%M:%S")
-        formatted_date_end = date_end.strftime("%Y-%m-%dT%H:%M:%S")
-        variables = {
-            "query": f"type:pr is:merged merged:{formatted_date_begin}..{formatted_date_end} org:llvm",
-        }
-        has_next_page = True
-        while has_next_page:
-            print(variables)
-            res_header, res_data = gh._Github__requester.graphql_query(
-                query=query, variables=variables
-            )
-            data = res_data["data"]
-            for pr in data["search"]["nodes"]:
-                # Users can be None if the user has been deleted.
-                if not pr["author"]:
-                    continue
-                author = pr["author"]["login"]
-                if author in triage_list:
-                    triage_list[author].add_authored()
-
-                if not pr["mergedBy"]:
-                    continue
-                merger = pr["mergedBy"]["login"]
-                if author == merger:
-                    continue
-                if merger not in triage_list:
-                    continue
-                triage_list[merger].add_merged()
-
-            has_next_page = data["search"]["pageInfo"]["hasNextPage"]
-            if has_next_page:
-                variables["after"] = data["search"]["pageInfo"]["endCursor"]
-        date_begin = date_end
-
-
-def main():
-    token = sys.argv[1]
-    gh = github.Github(login_or_token=token)
-    org = gh.get_organization("llvm")
-    repo = org.get_repo("llvm-project")
-    one_year_ago = datetime.datetime.now() - datetime.timedelta(days=365)
-    triage_list = {}
-    for collaborator in repo.get_collaborators(permission="push"):
-        triage_list[collaborator.login] = User(collaborator.login, triage_list)
-
-    print("Start:", len(triage_list), "triagers")
-    # Step 0 Check if users have requested commit access in the last year.
-    for user in check_manual_requests(gh, one_year_ago):
-        if user in triage_list:
-            print(user, "requested commit access in the last year.")
-            del triage_list[user]
-    print("After Request Check:", len(triage_list), "triagers")
-
-    # Step 1 count all PRs authored or merged
-    count_prs(gh, triage_list, one_year_ago)
-
-    print("After PRs:", len(triage_list), "triagers")
-
-    if len(triage_list) == 0:
-        sys.exit(0)
-
-    # Step 2 check for reviews
-    for user in list(triage_list.keys()):
-        review_count = get_review_count(gh, user, one_year_ago)
-        triage_list[user].add_reviewed(review_count)
-
-    print("After Reviews:", len(triage_list), "triagers")
-
-    if len(triage_list) == 0:
-        sys.exit(0)
-
-    # Step 3 check for number of commits
-    for user in list(triage_list.keys()):
-        num_commits = get_num_commits(gh, user, one_year_ago)
-        # Override the total number of commits to not double count commits and
-        # authored PRs.
-        triage_list[user].set_authored(num_commits)
-
-    print("After Commits:", len(triage_list), "triagers")
-
-    # Step 4 check for new committers
-    for user in list(triage_list.keys()):
-        print("Checking", user)
-        if is_new_committer(gh, user, one_year_ago):
-            print("Removing new committer: ", user)
-            del triage_list[user]
-
-    print("Complete:", len(triage_list), "triagers")
-
-    with open("triagers.log", "w") as triagers_log:
-        for user in triage_list:
-            print(triage_list[user].__repr__())
-            triagers_log.write(user + "\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
deleted file mode 100644
index d401a137737c..000000000000
--- a/.github/workflows/commit-access-review.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: Commit Access Review
-
-on:
-  workflow_dispatch:
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    - cron:  '0 7 1 * *'
-
-permissions:
-  contents: read
-
-jobs:
-  commit-access-review:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      
-      - name: Install dependencies
-        run: |
-          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-      
-      - name: Run Script
-        env:
-          GITHUB_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-        run: |
-          python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN
-
-      - name: Upload Triage List
-        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-        with:
-          name: triagers
-          path: triagers.log
diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile
deleted file mode 100644
index b909d14b4eee..000000000000
--- a/.github/workflows/containers/github-action-ci-windows/Dockerfile
+++ /dev/null
@@ -1,100 +0,0 @@
-# Agent image for LLVM org cluster.
-# .net 4.8 is required by chocolately package manager.
-FROM mcr.microsoft.com/dotnet/framework/sdk:4.8-windowsservercore-ltsc2022
-
-# Restore the default Windows shell for correct batch processing.
-SHELL ["cmd", "/S", "/C"]
-
-# Download the Build Tools bootstrapper.
-ADD https://aka.ms/vs/16/release/vs_buildtools.exe /TEMP/vs_buildtools.exe
-
-RUN powershell -Command Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))
-
-# Download channel for fixed install.
-ARG CHANNEL_URL=https://aka.ms/vs/16/release/channel
-ADD ${CHANNEL_URL} /TEMP/VisualStudio.chman
-
-# Install Build Tools with C++ workload.
-#   - Documentation for docker installation
-#     https://docs.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2019
-#   - Documentation on workloads
-#     https://docs.microsoft.com/en-us/visualstudio/install/workload-component-id-vs-build-tools?view=vs-2019#c-build-tools
-#   - Documentation on flags
-#     https://docs.microsoft.com/en-us/visualstudio/install/use-command-line-parameters-to-install-visual-studio?view=vs-2019
-RUN /TEMP/vs_buildtools.exe --quiet --wait --norestart --nocache \
-    --channelUri C:\TEMP\VisualStudio.chman \
-    --installChannelUri C:\TEMP\VisualStudio.chman \
-    --installPath C:\BuildTools \
-    --add Microsoft.VisualStudio.Workload.VCTools \
-    --add Microsoft.VisualStudio.Component.VC.ATL \
-    --includeRecommended \
-    || IF "%ERRORLEVEL%"=="3010" EXIT 0
-
-# Register DIA dll (Debug Interface Access) so it can be used to symbolize
-# the stack traces. Register dll for 32 and 64 bit.
-# see https://developercommunity.visualstudio.com/content/problem/290674/msdia140dll-is-not-registered-on-vs2017-hosts.html
-
-RUN regsvr32 /S "C:\BuildTools\DIA SDK\bin\amd64\msdia140.dll" & \
-    regsvr32 /S "C:\BuildTools\DIA SDK\bin\msdia140.dll"
-
-# install tools as described in https://llvm.org/docs/GettingStartedVS.html
-# and a few more that were not documented...
-# Pin an older version of Python; the current Python 3.10 fails when
-# doing "pip install" for the other dependencies, as it fails to find libxml
-# while compiling some package.
-# We version pin the other packages as well to ensure the container build is as
-# reproducible as possible to prevent issues when upgrading only part of the
-# container.
-RUN choco install -y ninja --version 1.13.1 && \
-    choco install -y git --version 2.50.1 && \
-    choco install -y sccache --version 0.10.0 && \
-    choco install -y python3 --version 3.9.7
-
-# Testing requires psutil
-RUN pip install psutil
-
-# configure Python encoding
-ENV PYTHONIOENCODING=UTF-8
-
-# update the path variable
-# C:\Program Files\Git\usr\bin contains a usable bash and other unix tools.
-# C:\llvm-mingw\bin contains Clang configured for mingw targets and
-#     corresponding sysroots. Both the 'llvm' package (with Clang defaulting
-#     to MSVC targets) and this directory contains executables named
-#     'clang.exe' - add this last to let the other one have precedence.
-#     To use these compilers, use the triple prefixed form, e.g.
-#     x86_64-w64-mingw32-clang.
-# C:\buildtools and SDK paths are ones that are set by c:\BuildTools\Common7\Tools\VsDevCmd.bat -arch=amd64 -host_arch=amd64
-RUN powershell -Command \
-    [System.Environment]::SetEnvironmentVariable('PATH', \
-    [System.Environment]::GetEnvironmentVariable('PATH', 'machine') + ';C:\Program Files\Git\usr\bin;C:\llvm-mingw\bin' \
-    + ';C:\BuildTools\Common7\IDE\' \
-    + ';C:\BuildTools\Common7\IDE\CommonExt ensions\Microsoft\TeamFoundation\Team Explorer' \
-    + ';C:\BuildTools\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin' \
-    + ';C:\BuildTools\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja' \
-    + ';C:\BuildTools\Common7\IDE\CommonExtensions\Microsoft\TeamFoundation\Team Explorer' \
-    + ';C:\BuildTools\Common7\IDE\CommonExtensions\Microsoft\TestWindow' \
-    + ';C:\BuildTools\Common7\IDE\VC\VCPackages' \
-    + ';C:\BuildTools\Common7\Tools\' \
-    + ';C:\BuildTools\Common7\Tools\devinit' \
-    + ';C:\BuildTools\MSBuild\Current\Bin' \
-    + ';C:\BuildTools\MSBuild\Current\bin\Roslyn' \
-    + ';C:\BuildTools\VC\Tools\MSVC\14.29.30133\bin\HostX64\x64' \
-    + ';C:\Program Files (x86)\Microsoft SDKs\Windows\v10.0A\bin\NETFX 4.8 Tools\x64\' \
-    + ';C:\Program Files (x86)\Windows Kits\10\bin\10.0.19041.0\x64' \
-    + ';C:\Program Files (x86)\Windows Kits\10\bin\x64' \
-    + ';C:\Windows\Microsoft.NET\Framework64\v4.0.30319' \
-    ,'machine')
-
-# support long file names during git checkout
-RUN git config --system core.longpaths true & \
-    git config --global core.autocrlf false
-
-ARG RUNNER_VERSION=2.326.0
-ENV RUNNER_VERSION=$RUNNER_VERSION
-
-RUN powershell -Command \
-    Invoke-WebRequest -Uri https://github.com/actions/runner/releases/download/v${env:RUNNER_VERSION}/actions-runner-win-x64-${env:RUNNER_VERSION}.zip -OutFile actions-runner-win.zip ; \
-    Add-Type -AssemblyName System.IO.Compression.FileSystem ; \
-    [System.IO.Compression.ZipFile]::ExtractToDirectory('actions-runner-win.zip', $PWD) ;\
-    rm actions-runner-win.zip
diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile
deleted file mode 100644
index efe08ebc221c..000000000000
--- a/.github/workflows/containers/github-action-ci/Dockerfile
+++ /dev/null
@@ -1,96 +0,0 @@
-FROM docker.io/library/ubuntu:24.04 as base
-ENV LLVM_SYSROOT=/opt/llvm
-
-FROM base as stage1-toolchain
-ENV LLVM_VERSION=20.1.8
-
-RUN apt-get update && \
-    apt-get install -y \
-    wget \
-    gcc \
-    g++ \
-    cmake \
-    ninja-build \
-    python3 \
-    git \
-    curl \
-    zlib1g-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -O -L https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-$LLVM_VERSION.tar.gz && \
-  tar -xf llvmorg-$LLVM_VERSION.tar.gz && \
-  rm -f llvmorg-$LLVM_VERSION.tar.gz
-
-WORKDIR /llvm-project-llvmorg-$LLVM_VERSION
-
-RUN cmake -B ./build -G Ninja ./llvm \
-  -C ./clang/cmake/caches/BOLT-PGO.cmake \
-  -DBOOTSTRAP_LLVM_ENABLE_LLD=ON \
-  -DBOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_LLD=ON \
-  -DPGO_INSTRUMENT_LTO=Thin \
-  -DLLVM_ENABLE_RUNTIMES="compiler-rt" \
-  -DCMAKE_INSTALL_PREFIX="$LLVM_SYSROOT" \
-  -DLLVM_ENABLE_PROJECTS="bolt;clang;lld;clang-tools-extra" \
-  -DLLVM_DISTRIBUTION_COMPONENTS="lld;compiler-rt;clang-format;scan-build;llvm-symbolizer" \
-  -DCLANG_DEFAULT_LINKER="lld"
-
-RUN ninja -C ./build stage2-clang-bolt stage2-install-distribution && ninja -C ./build install-distribution
-
-FROM base as ci-container
-
-COPY --from=stage1-toolchain $LLVM_SYSROOT $LLVM_SYSROOT
-
-# Need to install curl for hendrikmuhs/ccache-action
-# Need nodejs for some of the GitHub actions.
-# Need perl-modules for clang analyzer tests.
-# Need git for SPIRV-Tools tests.
-RUN apt-get update && \
-    DEBIAN_FRONTEND=noninteractive apt-get install -y \
-    binutils \
-    cmake \
-    curl \
-    git \
-    libstdc++-11-dev \
-    ninja-build \
-    nodejs \
-    perl-modules \
-    python3-psutil \
-    sudo \
-    # These are needed by the premerge pipeline. Pip is used to install
-    # dependent python packages and ccache is used for build caching. File and
-    # tzdata are used for tests.
-    python3-pip \
-    ccache \
-    file \
-    tzdata \
-    sccache && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-ENV LLVM_SYSROOT=$LLVM_SYSROOT
-ENV PATH=${LLVM_SYSROOT}/bin:${PATH}
-
-# Create a new user to avoid test failures related to a lack of expected
-# permissions issues in some tests. Set the user id to 1001 as that is the
-# user id that Github Actions uses to perform the checkout action.
-RUN useradd gha -u 1001 -m -s /bin/bash
-
-# Also add the user to passwordless sudoers so that we can install software
-# later on without having to rebuild the container.
-RUN adduser gha sudo
-RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
-
-USER gha
-WORKDIR /home/gha
-
-FROM ci-container as ci-container-agent
-
-ENV GITHUB_RUNNER_VERSION=2.326.0
-
-RUN mkdir actions-runner && \
-    cd actions-runner && \
-    curl -O -L https://github.com/actions/runner/releases/download/v$GITHUB_RUNNER_VERSION/actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \
-    tar xzf ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz && \
-    rm ./actions-runner-linux-x64-$GITHUB_RUNNER_VERSION.tar.gz
-
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
deleted file mode 100644
index 3970271e4adb..000000000000
--- a/.github/workflows/docs.yml
+++ /dev/null
@@ -1,215 +0,0 @@
-# LLVM Documentation CI
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-name: "Test documentation build"
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - 'main'
-    paths:
-      - 'llvm/docs/**'
-      - 'clang/docs/**'
-      - 'clang/include/clang/Basic/AttrDocs.td'
-      - 'clang/include/clang/Driver/ClangOptionDocs.td'
-      - 'clang/include/clang/Basic/DiagnosticDocs.td'
-      - 'clang-tools-extra/docs/**'
-      - 'lldb/docs/**'
-      - 'libunwind/docs/**'
-      - 'libcxx/docs/**'
-      - 'libc/docs/**'
-      - 'lld/docs/**'
-      - 'openmp/docs/**'
-      - 'polly/docs/**'
-      - 'flang/docs/**'
-      - 'flang/include/flang/Optimizer/Dialect/FIROps.td'
-      - '.github/workflows/docs.yml'
-  pull_request:
-    paths:
-      - 'llvm/docs/**'
-      - 'clang/docs/**'
-      - 'clang/include/clang/Basic/AttrDocs.td'
-      - 'clang/include/clang/Driver/ClangOptionDocs.td'
-      - 'clang/include/clang/Basic/DiagnosticDocs.td'
-      - 'clang-tools-extra/docs/**'
-      - 'lldb/docs/**'
-      - 'libunwind/docs/**'
-      - 'libcxx/docs/**'
-      - 'libc/docs/**'
-      - 'lld/docs/**'
-      - 'openmp/docs/**'
-      - 'polly/docs/**'
-      - 'flang/docs/**'
-      - 'flang/include/flang/Optimizer/Dialect/FIROps.td'
-      - '.github/workflows/docs.yml'
-
-jobs:
-  check-docs-build:
-    name: "Test documentation build"
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 2
-      - name: Get subprojects that have doc changes
-        id: docs-changed-subprojects
-        uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
-        with:
-          skip_initial_fetch: true
-          base_sha: 'HEAD~1'
-          sha: 'HEAD'
-          files_yaml: |
-            llvm:
-              - 'llvm/docs/**'
-            clang:
-              - 'clang/docs/**'
-              - 'clang/include/clang/Basic/AttrDocs.td'
-              - 'clang/include/clang/Driver/ClangOptionDocs.td'
-              - 'clang/include/clang/Basic/DiagnosticDocs.td'
-            clang-tools-extra:
-              - 'clang-tools-extra/docs/**'
-            lldb:
-              - 'lldb/docs/**'
-            libunwind:
-              - 'libunwind/docs/**'
-            libcxx:
-              - 'libcxx/docs/**'
-            libc:
-              - 'libc/docs/**'
-            lld:
-              - 'lld/docs/**'
-            openmp:
-              - 'openmp/docs/**'
-            polly:
-              - 'polly/docs/**'
-            flang:
-              - 'flang/docs/**'
-              - 'flang/include/flang/Optimizer/Dialect/FIROps.td'
-            workflow:
-              - '.github/workflows/docs.yml'
-      - name: Setup Python env
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-        with:
-          python-version: '3.11'
-          cache: 'pip'
-          cache-dependency-path: 'llvm/docs/requirements-hashed.txt'
-      - name: Install python dependencies
-        run: pip install -r llvm/docs/requirements-hashed.txt
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          # swig and graphviz are lldb specific dependencies
-          sudo apt-get install -y cmake ninja-build swig graphviz
-      - name: Setup output folder
-        run: mkdir built-docs
-      - name: Build LLVM docs
-        if: |
-          steps.docs-changed-subprojects.outputs.llvm_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B llvm-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C llvm-build docs-llvm-html docs-llvm-man
-          mkdir built-docs/llvm
-          cp -r llvm-build/docs/* built-docs/llvm/
-      - name: Build Clang docs
-        if: |
-          steps.docs-changed-subprojects.outputs.clang_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B clang-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C clang-build docs-clang-html docs-clang-man
-          mkdir built-docs/clang
-          cp -r clang-build/docs/* built-docs/clang/
-      - name: Build clang-tools-extra docs
-        if: |
-          steps.docs-changed-subprojects.outputs.clang-tools-extra_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B clang-tools-extra-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C clang-tools-extra-build docs-clang-tools-html docs-clang-tools-man
-          mkdir built-docs/clang-tools-extra
-          cp -r clang-tools-extra-build/docs/* built-docs/clang-tools-extra/
-      - name: Build LLDB docs
-        if: |
-          steps.docs-changed-subprojects.outputs.lldb_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B lldb-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;lldb" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C lldb-build docs-lldb-html docs-lldb-man
-          mkdir built-docs/lldb
-          cp -r lldb-build/docs/* built-docs/lldb/
-      - name: Build libunwind docs
-        if: |
-          steps.docs-changed-subprojects.outputs.libunwind_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B libunwind-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libunwind" -DLLVM_ENABLE_SPHINX=ON ./runtimes
-          TZ=UTC ninja -C libunwind-build docs-libunwind-html
-          mkdir built-docs/libunwind
-          cp -r libunwind-build/libunwind/docs/* built-docs/libunwind
-      - name: Build libcxx docs
-        if: |
-          steps.docs-changed-subprojects.outputs.libcxx_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B libcxx-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libcxxabi;libcxx;libunwind" -DLLVM_ENABLE_SPHINX=ON ./runtimes
-          TZ=UTC ninja -C libcxx-build docs-libcxx-html
-          mkdir built-docs/libcxx
-          cp -r libcxx-build/libcxx/docs/* built-docs/libcxx/
-      - name: Build libc docs
-        if: |
-          steps.docs-changed-subprojects.outputs.libc_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B libc-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libc" -DLLVM_ENABLE_SPHINX=ON ./runtimes
-          TZ=UTC ninja -C libc-build docs-libc-html
-          mkdir built-docs/libc
-          cp -r libc-build/libc/docs/* built-docs/libc/
-      - name: Build LLD docs
-        if: |
-          steps.docs-changed-subprojects.outputs.lld_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B lld-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="lld" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C lld-build docs-lld-html
-          mkdir built-docs/lld
-          cp -r lld-build/docs/* built-docs/lld/
-      - name: Build OpenMP docs
-        if: |
-          steps.docs-changed-subprojects.outputs.openmp_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B openmp-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;openmp" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C openmp-build docs-openmp-html
-          mkdir built-docs/openmp
-          cp -r openmp-build/docs/* built-docs/openmp/
-      - name: Build Polly docs
-        if: |
-          steps.docs-changed-subprojects.outputs.polly_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B polly-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="polly" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C polly-build docs-polly-html docs-polly-man
-          mkdir built-docs/polly
-          cp -r polly-build/docs/* built-docs/polly/
-      - name: Build Flang docs
-        if: |
-          steps.docs-changed-subprojects.outputs.flang_any_changed == 'true' ||
-          steps.docs-changed-subprojects.outputs.workflow_any_changed == 'true'
-        run: |
-          cmake -B flang-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" -DLLVM_ENABLE_SPHINX=ON ./llvm
-          TZ=UTC ninja -C flang-build docs-flang-html docs-flang-man
-          mkdir built-docs/flang
-          cp -r flang-build/docs/* built-docs/flang/
-      - name: Upload docs
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: docs-output
-          path: built-docs/
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
deleted file mode 100644
index 904ad718f97d..000000000000
--- a/.github/workflows/email-check.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-name: "Check for private emails used in PRs"
-
-on:
-  pull_request:
-    types:
-      - opened
-
-permissions:
-  contents: read
-
-jobs:
-  validate_email:
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-
-      - name: Extract author email
-        id: author
-        run: |
-          git log -1
-          echo "EMAIL=$(git show -s --format='%ae' HEAD~0)" >> $GITHUB_OUTPUT
-          # Create empty comment file
-          echo "[]" > comments
-
-      - name: Validate author email
-        if: ${{ endsWith(steps.author.outputs.EMAIL, 'noreply.github.com')  }}
-        env:
-          COMMENT: >-
-            ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.<br/>
-            Please turn off [Keep my email addresses private](https://github.com/settings/emails) setting in your account.<br/>
-            See [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and
-            [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
-        run: |
-          cat << EOF > comments
-          [{"body" : "$COMMENT"}]
-          EOF
-
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-        if: always()
-        with:
-          name: workflow-args
-          path: |
-            comments
diff --git a/.github/workflows/get-llvm-version/action.yml b/.github/workflows/get-llvm-version/action.yml
deleted file mode 100644
index 2218d926fc13..000000000000
--- a/.github/workflows/get-llvm-version/action.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: Get LLVM Version
-description: >-
-  Get the LLVM version from the llvm-project source tree.  This action assumes
-  the llvm-project sources have already been checked out into GITHUB_WORKSPACE.
-
-outputs:
-  major:
-    description: LLVM major version
-    value: ${{ steps.version.outputs.major }}
-  minor:
-    description: LLVM minor version
-    value: ${{ steps.version.outputs.minor }}
-  patch:
-    description: LLVM patch version
-    value: ${{ steps.version.outputs.patch }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Get Version
-      shell: bash
-      id: version
-      run: |
-        for v in major minor patch; do
-          echo "$v=`llvm/utils/release/get-llvm-version.sh --$v`" >> $GITHUB_OUTPUT
-        done
diff --git a/.github/workflows/hlsl-matrix.yaml b/.github/workflows/hlsl-matrix.yaml
deleted file mode 100644
index c63a32acd2b3..000000000000
--- a/.github/workflows/hlsl-matrix.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: HLSL Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - llvm/**/DirectX/**
-      - .github/workflows/hlsl*
-      - clang/*HLSL*/**/*
-      - clang/**/*HLSL*
-      - llvm/**/Frontend/HLSL/**/*
-
-jobs:
-  HLSL-Tests:
-    strategy:
-      fail-fast: false
-      matrix:
-        runs-on:
-          - hlsl-macos
-
-    uses: ./.github/workflows/hlsl-test-all.yaml
-    with:
-      SKU: hlsl-macos
-      TestTarget: check-hlsl-clang-mtl # TODO: This target changes based on SKU
-      LLVM-ref: ${{ github.ref }}
diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
deleted file mode 100644
index b6530fe11b84..000000000000
--- a/.github/workflows/hlsl-test-all.yaml
+++ /dev/null
@@ -1,87 +0,0 @@
-name: HLSL Test
-
-permissions:
-  contents: read
-
-on:
-  workflow_call:
-    inputs:
-      OffloadTest-branch:
-        description: 'Test Suite Branch'
-        required: false
-        default: 'main'
-        type: string
-      LLVM-ref:
-        description: 'LLVM Branch'
-        required: false
-        default: 'main'
-        type: string
-      SKU:
-        required: true
-        type: string
-      TestTarget:
-        required: false
-        default: 'check-hlsl'
-        type: string
-
-jobs:
-  build:
-    runs-on: ${{ inputs.SKU }}
-    steps:
-      - name: Checkout DXC
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          repository: Microsoft/DirectXShaderCompiler
-          ref: main
-          path: DXC
-          submodules: true
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          ref: ${{ inputs.LLVM-branch }}
-          path: llvm-project
-      - name: Checkout OffloadTest
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          repository: llvm/offload-test-suite
-          ref: main
-          path: OffloadTest
-      - name: Checkout Golden Images
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          repository: llvm/offload-golden-images
-          ref: main
-          path: golden-images
-      - name: Setup Windows
-        if: runner.os == 'Windows'
-        uses: llvm/actions/setup-windows@main
-        with:
-          arch: amd64
-      - name: Build DXC
-        run: |
-            cd DXC
-            mkdir build
-            cd build
-            cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -C ${{ github.workspace }}/DXC/cmake/caches/PredefinedParams.cmake -C ${{ github.workspace }}/OffloadTest/cmake/caches/sccache.cmake -DHLSL_DISABLE_SOURCE_GENERATION=On ${{ github.workspace }}/DXC/
-            ninja dxv llvm-dis
-      - name: Build LLVM
-        run: |
-            cd llvm-project
-            mkdir build
-            cd build
-            cmake -G Ninja -DDXIL_DIS=${{ github.workspace }}/DXC/build/bin/llvm-dis -DLLVM_INCLUDE_DXIL_TESTS=On -DCMAKE_BUILD_TYPE=Release -C ${{ github.workspace }}/llvm-project/clang/cmake/caches/HLSL.cmake -C ${{ github.workspace }}/OffloadTest/cmake/caches/sccache.cmake -DDXC_DIR=${{ github.workspace }}/DXC/build/bin -DLLVM_EXTERNAL_OFFLOADTEST_SOURCE_DIR=${{ github.workspace }}/OffloadTest -DLLVM_EXTERNAL_PROJECTS="OffloadTest" -DLLVM_LIT_ARGS="--xunit-xml-output=testresults.xunit.xml -v" -DGOLDENIMAGE_DIR=${{ github.workspace }}/golden-images ${{ github.workspace }}/llvm-project/llvm/
-            ninja hlsl-test-depends llvm-test-depends clang-test-depends
-      - name: Run HLSL Tests
-        run: |
-            cd llvm-project
-            cd build
-            ninja check-llvm
-            ninja check-clang
-            ninja check-hlsl-unit
-            ninja ${{ inputs.TestTarget }}
-      - name: Publish Test Results
-        uses: EnricoMi/publish-unit-test-result-action/macos@170bf24d20d201b842d7a52403b73ed297e6645b # v2
-        if: always() && runner.os == 'macOS'
-        with:
-          comment_mode: off
-          files: llvm-project/build/**/testresults.xunit.xml
diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml
deleted file mode 100644
index efd045990d01..000000000000
--- a/.github/workflows/issue-release-workflow.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-# This contains the workflow definitions that allow users to test backports
-# to the release branch using comments on issues.
-#
-# /cherry-pick <commit> <...>
-#
-# This comment will attempt to cherry-pick the given commits to the latest
-# release branch (release/Y.x) and if successful, push the result to a branch
-# on github.
-#
-# /branch <owner>/<repo>/<branch>
-#
-# This comment will create a pull request from <branch> to the latest release
-# branch.
-
-name: Issue Release Workflow
-
-permissions:
-  contents: read
-
-on:
-  issue_comment:
-    types:
-      - created
-      - edited
-  issues:
-    types:
-      - opened
-
-env:
-  COMMENT_BODY: ${{ github.event.action == 'opened' && github.event.issue.body || github.event.comment.body  }}
-
-jobs:
-  backport-commits:
-    name: Backport Commits
-    runs-on: ubuntu-24.04
-    permissions:
-      issues: write
-      pull-requests: write
-    if: >-
-      (github.repository == 'llvm/llvm-project') &&
-      !startswith(github.event.comment.body, '<!--IGNORE-->') &&
-      contains(github.event.action == 'opened' && github.event.issue.body || github.event.comment.body, '/cherry-pick')
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: llvm/llvm-project
-          # GitHub stores the token used for checkout and uses it for pushes
-          # too, but we want to use a different token for pushing, so we need
-          # to disable persist-credentials here.
-          persist-credentials: false
-          fetch-depth: 0
-
-      - name: Setup Environment
-        run: |
-          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-          ./llvm/utils/git/github-automation.py --token ${{ github.token }} setup-llvmbot-git
-
-      - name: Backport Commits
-        run: |
-          printf "%s" "$COMMENT_BODY" |
-          ./llvm/utils/git/github-automation.py \
-          --repo "$GITHUB_REPOSITORY" \
-          --token "${{ secrets.RELEASE_WORKFLOW_PR_CREATE }}" \
-          release-workflow \
-          --branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \
-          --issue-number ${{ github.event.issue.number }} \
-          --requested-by ${{ (github.event.action == 'opened' && github.event.issue.user.login) || github.event.comment.user.login }} \
-          auto
diff --git a/.github/workflows/issue-subscriber.yml b/.github/workflows/issue-subscriber.yml
deleted file mode 100644
index de1c45c94496..000000000000
--- a/.github/workflows/issue-subscriber.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-name: Issue Subscriber
-
-on:
-  issues:
-    types:
-      - labeled
-
-permissions:
-  contents: read
-
-jobs:
-  auto-subscribe:
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: llvm/utils/git/
-          ref: main
-
-      - name: Setup Automation Script
-        working-directory: ./llvm/utils/git/
-        run: |
-          pip install --require-hashes -r requirements.txt
-
-      - name: Update watchers
-        working-directory: ./llvm/utils/git/
-        # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
-        env:
-          LABEL_NAME: ${{ github.event.label.name }}
-        run: |
-          python3 ./github-automation.py \
-            --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \
-            issue-subscriber \
-            --issue-number '${{ github.event.issue.number }}' \
-            --label-name "$LABEL_NAME"
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
deleted file mode 100644
index a2c4f58d6feb..000000000000
--- a/.github/workflows/issue-write.yml
+++ /dev/null
@@ -1,157 +0,0 @@
-name: Comment on an issue
-
-on:
-  workflow_run:
-    workflows:
-      - "Check code formatting"
-      - "Check for private emails used in PRs"
-      - "PR Request Release Note"
-    types:
-      - completed
-
-permissions:
-  contents: read
-
-jobs:
-  pr-comment:
-    runs-on: ubuntu-24.04
-    permissions:
-      pull-requests: write
-    if: >
-      github.event.workflow_run.event == 'pull_request' &&
-      (
-        github.event.workflow_run.conclusion == 'success' ||
-        github.event.workflow_run.conclusion == 'failure'
-      )
-    steps:
-      - name: Fetch Sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: |
-            .github/workflows/unprivileged-download-artifact/action.yml
-          sparse-checkout-cone-mode: false
-      - name: 'Download artifact'
-        uses: ./.github/workflows/unprivileged-download-artifact
-        id: download-artifact
-        with:
-          run-id: ${{ github.event.workflow_run.id }}
-          artifact-name: workflow-args
-
-      - name: 'Comment on PR'
-        if: steps.download-artifact.outputs.artifact-id != ''
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            var fs = require('fs');
-            const comments = JSON.parse(fs.readFileSync('./comments'));
-            if (!comments || comments.length == 0) {
-              return;
-            }
-
-            let runInfo = await github.rest.actions.getWorkflowRun({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              run_id: context.payload.workflow_run.id
-            });
-
-            console.log(runInfo);
-
-
-            // Query to find the number of the pull request that triggered this job.
-            // The associated pull requests are based off of the branch name, so if
-            // you create a pull request for a branch, close it, and then create
-            // another pull request with the same branch, then this query will return
-            // two associated pull requests.  This is why we have to fetch all the
-            // associated pull requests and then iterate through them to find the
-            // one that is open.
-            const gql_query = `
-              query($repo_owner : String!, $repo_name : String!, $branch: String!) {
-                repository(owner: $repo_owner, name: $repo_name) {
-                  ref (qualifiedName: $branch) {
-                    associatedPullRequests(first: 100) {
-                      nodes {
-                        baseRepository {
-                          owner {
-                            login
-                          }
-                        }
-                        number
-                        state
-                      }
-                    }
-                  }
-                }
-              }
-            `
-            const gql_variables = {
-              repo_owner: runInfo.data.head_repository.owner.login,
-              repo_name: runInfo.data.head_repository.name,
-              branch: runInfo.data.head_branch
-            }
-            const gql_result = await github.graphql(gql_query, gql_variables);
-            console.log(gql_result);
-            // If the branch for the PR was deleted before this job has a chance
-            // to run, then the ref will be null.  This can happen if someone:
-            // 1. Rebase the PR, which triggers some workflow.
-            // 2. Immediately merges the PR and deletes the branch.
-            // 3. The workflow finishes and triggers this job.
-            if (!gql_result.repository.ref) {
-              console.log("Ref has been deleted");
-              return;
-            }
-            console.log(gql_result.repository.ref.associatedPullRequests.nodes);
-
-            var pr_number = 0;
-            gql_result.repository.ref.associatedPullRequests.nodes.forEach((pr) => {
-
-              // The largest PR number is the one we care about.  The only way
-              // to have more than one associated pull requests is if all the
-              // old pull requests are in the closed state.
-              if (pr.baseRepository.owner.login = context.repo.owner && pr.number > pr_number) {
-                pr_number = pr.number;
-              }
-            });
-            if (pr_number == 0) {
-              console.log("Error retrieving pull request number");
-              return;
-            }
-            
-            await comments.forEach(function (comment) {
-              if (comment.id) {
-                // Security check: Ensure that this comment was created by
-                // the github-actions bot, so a malicious input won't overwrite
-                // a user's comment.
-                github.rest.issues.getComment({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  comment_id: comment.id
-                }).then((old_comment) => {
-                  console.log(old_comment);
-                  if (old_comment.data.user.login != "github-actions[bot]") {
-                    console.log("Invalid comment id: " + comment.id);
-                    return;
-                  }
-                  github.rest.issues.updateComment({
-                    owner: context.repo.owner,
-                    repo: context.repo.repo,
-                    issue_number: pr_number,
-                    comment_id: comment.id,
-                    body: comment.body
-                  });
-                });
-              } else {
-                github.rest.issues.createComment({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  issue_number: pr_number,
-                  body: comment.body
-                });
-              }
-            });
-
-      - name: Dump comments file
-        if: >-
-          always() &&
-          steps.download-artifact.outputs.artifact-id != ''
-        run: cat comments
diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml
deleted file mode 100644
index 9b2d8dd57970..000000000000
--- a/.github/workflows/libc-fullbuild-tests.yml
+++ /dev/null
@@ -1,139 +0,0 @@
-# This workflow is for pre-commit testing of the LLVM-libc project.
-name: LLVM-libc Pre-commit Fullbuild Tests
-permissions:
-  contents: read
-on:
-  pull_request:
-    branches: [ "main" ]
-    paths:
-      - 'libc/**'
-      - '.github/workflows/libc-fullbuild-tests.yml'
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      fail-fast: false
-      matrix:
-        # Build basic linux configuration with Debug/Release/MinSizeRel and all
-        # other configurations in Debug only.
-        include:
-          - os: ubuntu-24.04
-            build_type: Debug
-            ccache-variant: sccache
-            c_compiler: clang-21
-            cpp_compiler: clang++-21
-            target: x86_64-unknown-linux-llvm
-            include_scudo: ON
-          - os: ubuntu-24.04
-            build_type: Release
-            ccache-variant: sccache
-            c_compiler: clang-21
-            cpp_compiler: clang++-21
-            target: x86_64-unknown-linux-llvm
-            include_scudo: ON
-          - os: ubuntu-24.04
-            build_type: MinSizeRel
-            ccache-variant: sccache
-            c_compiler: clang-21
-            cpp_compiler: clang++-21
-            target: x86_64-unknown-linux-llvm
-            include_scudo: ON
-          # TODO: remove ccache logic when https://github.com/hendrikmuhs/ccache-action/issues/279 is resolved.
-          - os: ubuntu-24.04-arm
-            build_type: Debug
-            ccache-variant: ccache
-            c_compiler: clang-21
-            cpp_compiler: clang++-21
-            target: aarch64-unknown-linux-llvm
-            include_scudo: ON
-          - os: ubuntu-24.04
-            build_type: Debug
-            ccache-variant: ccache
-            c_compiler: clang-21
-            cpp_compiler: clang++-21
-            target: x86_64-unknown-uefi-llvm
-            include_scudo: OFF
-          # TODO: add back gcc build when it is fixed
-          # - c_compiler: gcc
-          #   cpp_compiler: g++
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    
-    # Libc's build is relatively small comparing with other components of LLVM.
-    # A fresh fullbuild takes about 190MiB of uncompressed disk space, which can
-    # be compressed into ~40MiB. Limiting the cache size to 1G should be enough.
-    # Prefer sccache as it is more modern.
-    # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects
-    # frequent small object writes.
-    - name: Setup ccache
-      uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-      with:
-        max-size: 1G
-        key: libc_fullbuild_${{ matrix.c_compiler }}
-        variant: ${{ matrix.ccache-variant }}
-    
-    # Notice:
-    # - MPFR is required by some of the mathlib tests.
-    # - Debian has a multilib setup, so we need to symlink the asm directory.
-    #   For more information, see https://wiki.debian.org/Multiarch/LibraryPathOverview
-    - name: Prepare dependencies (Ubuntu)
-      run: |
-        wget https://apt.llvm.org/llvm.sh
-        chmod +x llvm.sh
-        sudo ./llvm.sh 21
-        sudo apt-get update
-        sudo apt-get install -y libmpfr-dev libgmp-dev libmpc-dev ninja-build linux-libc-dev
-        sudo ln -sf /usr/include/$(uname -p)-linux-gnu/asm /usr/include/asm
-
-    - name: Set reusable strings
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-        echo "build-install-dir=${{ github.workspace }}/install" >> "$GITHUB_OUTPUT"
-    
-    # Configure libc fullbuild with scudo.
-    # Use MinSizeRel to reduce the size of the build.
-    - name: Configure CMake
-      run: |
-        export RUNTIMES="libc"
-
-        if [[ ${{ matrix.include_scudo}} == "ON" ]]; then
-          export RUNTIMES="$RUNTIMES;compiler-rt"
-          export CMAKE_FLAGS="
-            -DLLVM_LIBC_INCLUDE_SCUDO=ON
-            -DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON
-            -DCOMPILER_RT_BUILD_GWP_ASAN=OFF
-            -DCOMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED=OFF"
-        fi
-
-        cmake -B ${{ steps.strings.outputs.build-output-dir }} \
-        -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} \
-        -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \
-        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-        -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} \
-        -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.ccache-variant }} \
-        -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} \
-        -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \
-        -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \
-        -DLLVM_LIBC_FULL_BUILD=ON \
-        -G Ninja \
-        -S ${{ github.workspace }}/runtimes \
-        $CMAKE_FLAGS
-
-    - name: Build
-      run: >
-        cmake 
-        --build ${{ steps.strings.outputs.build-output-dir }} 
-        --parallel
-        --target install
-
-    - name: Test
-      # Skip UEFI tests until we have testing set up.
-      if: ${{ ! endsWith(matrix.target, '-uefi-llvm') }}
-      run: >
-        cmake 
-        --build ${{ steps.strings.outputs.build-output-dir }} 
-        --parallel
-        --target check-libc
diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
deleted file mode 100644
index f001daae030a..000000000000
--- a/.github/workflows/libc-overlay-tests.yml
+++ /dev/null
@@ -1,120 +0,0 @@
-# This workflow is for pre-commit testing of the LLVM-libc project.
-name: LLVM-libc Pre-commit Overlay Tests
-permissions:
-  contents: read
-on:
-  pull_request:
-    branches: [ "main" ]
-    paths:
-      - 'libc/**'
-      - '.github/workflows/libc-overlay-tests.yml'
-
-jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations.
-      fail-fast: false
-      matrix:
-        os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-14]
-        include:
-          # TODO: add linux gcc when it is fixed
-          - os: ubuntu-24.04
-            ccache-variant: sccache
-            compiler:
-              c_compiler: clang
-              cpp_compiler: clang++
-          # TODO: remove ccache logic when https://github.com/hendrikmuhs/ccache-action/issues/279 is resolved.
-          - os: ubuntu-24.04-arm
-            ccache-variant: ccache
-            compiler:
-              c_compiler: clang
-              cpp_compiler: clang++
-          - os: windows-2022
-            ccache-variant: sccache
-            compiler:
-              c_compiler: clang-cl
-              cpp_compiler: clang-cl
-          - os: windows-2025
-            ccache-variant: sccache
-            compiler:
-              c_compiler: clang-cl
-              cpp_compiler: clang-cl
-          - os: macos-14
-            ccache-variant: sccache
-            compiler:
-              c_compiler: clang
-              cpp_compiler: clang++
-    
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    
-    # Libc's build is relatively small comparing with other components of LLVM.
-    # A fresh linux overlay takes about 180MiB of uncompressed disk space, which can
-    # be compressed into ~40MiB. MacOS and Windows overlay builds are less than 10MiB
-    # after compression. Limiting the cache size to 1G should be enough.
-    # Prefer sccache as it is modern and it has a guarantee to work with MSVC.
-    # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects
-    # frequent small object writes.
-    - name: Setup ccache
-      uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-      with:
-        max-size: 1G
-        key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }}
-        variant: ${{ matrix.ccache-variant }}
-    
-    # MPFR is required by some of the mathlib tests.
-    - name: Prepare dependencies (Ubuntu)
-      if: runner.os == 'Linux'
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y libmpfr-dev libgmp-dev libmpc-dev ninja-build
-    
-    # Chocolatey is shipped with Windows runners. Windows Server 2025 recommends WinGet.
-    # Consider migrating to WinGet when Windows Server 2025 is available.
-    - name: Prepare dependencies (Windows)
-      if: runner.os == 'Windows'
-      run: |
-        choco install ninja
-    
-    - name: Prepare dependencies (macOS)
-      if: runner.os == 'macOS'
-      run: |
-        brew install ninja
-
-    - name: Set reusable strings
-      id: strings
-      shell: bash
-      run: |
-        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
-
-    # Use MinSizeRel to reduce the size of the build.
-    # Notice that CMP0141=NEW and MSVC_DEBUG_INFORMATION_FORMAT=Embedded are required
-    # by the sccache tool.
-    - name: Configure CMake
-      run: >
-        cmake -B ${{ steps.strings.outputs.build-output-dir }}
-        -DCMAKE_CXX_COMPILER=${{ matrix.compiler.cpp_compiler }}
-        -DCMAKE_C_COMPILER=${{ matrix.compiler.c_compiler }}
-        -DCMAKE_BUILD_TYPE=Debug
-        -DCMAKE_C_COMPILER_LAUNCHER=${{ matrix.ccache-variant }}
-        -DCMAKE_CXX_COMPILER_LAUNCHER=${{ matrix.ccache-variant }}
-        -DCMAKE_POLICY_DEFAULT_CMP0141=NEW
-        -DCMAKE_MSVC_DEBUG_INFORMATION_FORMAT=Embedded
-        -DLLVM_ENABLE_RUNTIMES=libc
-        -G Ninja
-        -S ${{ github.workspace }}/runtimes
-
-    - name: Build
-      run: >
-        cmake 
-        --build ${{ steps.strings.outputs.build-output-dir }} 
-        --parallel 
-        --target libc
-
-    - name: Test
-      run: >
-        cmake 
-        --build ${{ steps.strings.outputs.build-output-dir }} 
-        --parallel 
-        --target check-libc
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
deleted file mode 100644
index 4d47c07f4220..000000000000
--- a/.github/workflows/libclang-abi-tests.yml
+++ /dev/null
@@ -1,171 +0,0 @@
-name: libclang ABI Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'clang/**'
-      - '.github/workflows/libclang-abi-tests.yml'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'clang/**'
-      - '.github/workflows/libclang-abi-tests.yml'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  abi-dump-setup:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    outputs:
-      BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
-      ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
-      ABI_LIBS: ${{ steps.vars.outputs.ABI_LIBS }}
-      BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
-      LLVM_VERSION_MAJOR: ${{ steps.version.outputs.major }}
-      LLVM_VERSION_MINOR: ${{ steps.version.outputs.minor }}
-      LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
-    steps:
-      - name: Checkout source
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 250
-
-      - name: Get LLVM version
-        id: version
-        uses: ./.github/workflows/get-llvm-version
-
-      - name: Setup Variables
-        id: vars
-        run: |
-          remote_repo='https://github.com/llvm/llvm-project'
-          if [ ${{ steps.version.outputs.patch }} -eq 0 ]; then
-            major_version=$(( ${{ steps.version.outputs.major }} - 1))
-            baseline_ref="llvmorg-$major_version.1.0"
-
-            # If there is a minor release, we want to use that as the base line.
-            minor_ref=$(git ls-remote --refs -t "$remote_repo" llvmorg-"$major_version".[1-9].[0-9] | tail -n1 | grep -o 'llvmorg-.\+' || true)
-            if [ -n "$minor_ref" ]; then
-               baseline_ref="$minor_ref"
-            else
-              # Check if we have a release candidate
-              rc_ref=$(git ls-remote --refs -t "$remote_repo" llvmorg-"$major_version".[1-9].[0-9]-rc* | tail -n1 | grep -o 'llvmorg-.\+' || true)
-              if [ -n "$rc_ref" ]; then
-                baseline_ref="$rc_ref"
-              fi
-            fi
-            {
-              echo "BASELINE_VERSION_MAJOR=$major_version"
-              echo "BASELINE_REF=$baseline_ref"
-              echo "ABI_HEADERS=clang-c"
-              echo "ABI_LIBS=libclang.so"
-            } >> "$GITHUB_OUTPUT"
-          else
-            {
-              echo "BASELINE_VERSION_MAJOR=${{ steps.version.outputs.major }}"
-              echo "BASELINE_REF=llvmorg-${{ steps.version.outputs.major }}.1.0"
-              echo "ABI_HEADERS=."
-              echo "ABI_LIBS=libclang.so libclang-cpp.so"
-            } >> "$GITHUB_OUTPUT"
-          fi
-
-  abi-dump:
-    if: github.repository_owner == 'llvm'
-    needs: abi-dump-setup
-    runs-on: ubuntu-24.04
-    strategy:
-      matrix:
-        name:
-          - build-baseline
-          - build-latest
-        include:
-          - name: build-baseline
-            llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
-            ref: ${{ needs.abi-dump-setup.outputs.BASELINE_REF }}
-            repo: llvm/llvm-project
-          - name: build-latest
-            llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
-            ref: ${{ github.sha }}
-            repo: ${{ github.repository }}
-    steps:
-      - name: Install Ninja
-        uses: llvm/actions/install-ninja@main
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y abi-dumper autoconf pkg-config
-      - name: Install universal-ctags
-        run: |
-          git clone https://github.com/universal-ctags/ctags.git
-          cd ctags
-          ./autogen.sh
-          ./configure
-          sudo make install
-      - name: Download source code
-        uses: llvm/actions/get-llvm-project-src@main
-        with:
-          ref: ${{ matrix.ref }}
-          repo: ${{ matrix.repo }}
-      - name: Configure
-        run: |
-          mkdir install
-          cmake -B build -S llvm -G Ninja -DLLVM_ENABLE_PROJECTS=clang -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DLLVM_LINK_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g1 -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g1 -Og" -DCMAKE_INSTALL_PREFIX="$(pwd)"/install llvm
-      - name: Build
-        run: ninja -C build/ ${{ needs.abi-dump-setup.outputs.ABI_LIBS }} install-clang-headers
-      - name: Dump ABI
-        run: |
-          parallel abi-dumper -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o {}-${{ matrix.ref }}.abi ./build/lib/{} ::: ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}
-          for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
-            # Remove symbol versioning from dumps, so we can compare across major versions.
-            sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
-          done
-      - name: Upload ABI file
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: ${{ matrix.name }}
-          path: '*${{ matrix.ref }}.abi'
-
-  abi-compare:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    needs:
-      - abi-dump-setup
-      - abi-dump
-    steps:
-      - name: Download baseline
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8
-        with:
-          name: build-baseline
-          path: build-baseline
-      - name: Download latest
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8
-        with:
-          name: build-latest
-          path: build-latest
-
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y abi-compliance-checker
-      - name: Compare ABI
-        run: |
-          for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do
-            abi-compliance-checker -lib $lib -old build-baseline/$lib*.abi -new build-latest/$lib*.abi
-          done
-      - name: Upload ABI Comparison
-        if: always()
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: compat-report-${{ github.sha }}
-          path: compat_reports/
diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml
deleted file mode 100644
index 43b50cec6171..000000000000
--- a/.github/workflows/libclang-python-tests.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-name: Libclang Python Binding Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'main'
-    paths:
-      - 'clang/bindings/python/**'
-      - 'clang/test/bindings/python/**'
-      - 'clang/tools/libclang/**'
-      - '.github/workflows/libclang-python-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-  pull_request:
-    paths:
-      - 'clang/bindings/python/**'
-      - 'clang/test/bindings/python/**'
-      - 'clang/tools/libclang/**'
-      - '.github/workflows/libclang-python-tests.yml'
-      - '.github/workflows/llvm-project-tests.yml'
-
-jobs:
-  check-clang-python:
-    # Build libclang and then run the libclang Python binding's unit tests.
-    name: Build and run Python unit tests
-    if: github.repository == 'llvm/llvm-project'
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: ["3.8", "3.13"]
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-clang-python
-      projects: clang
-      # There is an issue running on "windows-2019".
-      # See https://github.com/llvm/llvm-project/issues/76601#issuecomment-1873049082.
-      os_list: '["ubuntu-24.04"]'
-      python_version: ${{ matrix.python-version }}
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
deleted file mode 100644
index ec937de02ca1..000000000000
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ /dev/null
@@ -1,285 +0,0 @@
-# This file defines pre-commit CI for libc++, libc++abi, and libunwind (on Github).
-#
-# We split the configurations in multiple stages with the intent of saving compute time
-# when a job fails early in the pipeline. This is why the jobs are marked as `continue-on-error: false`.
-# We try to run the CI configurations with the most signal in the first stage.
-#
-# Stages 1 & 2 are meant to be "smoke tests", and are meant to catch most build/test failures quickly and without using
-# too many resources.
-# Stage 3 is "everything else", and is meant to catch breakages on more niche or unique configurations.
-#
-# Therefore, we "fail-fast" for any failures during stages 1 & 2, meaning any job failing cancels all other running jobs,
-# under the assumption that if the "smoke tests" fail, then the other configurations will likely fail in the same way.
-# However, stage 3 does not fail fast, as it's more likely that any one job failing is a flake or a configuration-specific
-#
-name: Build and Test libc++
-on:
-  pull_request:
-    paths:
-      - 'libcxx/**'
-      - 'libcxxabi/**'
-      - 'libunwind/**'
-      - 'runtimes/**'
-      - 'cmake/**'
-      - '.github/workflows/libcxx-build-and-test.yaml'
-  schedule:
-    # Run nightly at 08:00 UTC (aka 00:00 Pacific, aka 03:00 Eastern)
-    - cron: '0 8 * * *'
-
-permissions:
-  contents: read # Default everything to read-only
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
-  cancel-in-progress: true
-
-jobs:
-  stage1:
-    if: github.repository_owner == 'llvm'
-    runs-on: llvm-premerge-libcxx-runners
-    continue-on-error: false
-    strategy:
-      fail-fast: false
-      matrix:
-        config: [
-          'frozen-cxx03-headers',
-          'generic-cxx03',
-          'generic-cxx26',
-          'generic-modules'
-        ]
-        cc: [  'clang-21' ]
-        cxx: [ 'clang++-21' ]
-        include:
-          - config: 'generic-gcc'
-            cc: 'gcc-15'
-            cxx: 'g++-15'
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: ${{ matrix.config }}.${{ matrix.cxx }}
-        run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
-        env:
-          CC: ${{ matrix.cc }}
-          CXX: ${{ matrix.cxx }}
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
-        if: always()
-        with:
-          name: ${{ matrix.config }}-${{ matrix.cxx }}-results
-          path: |
-            **/test-results.xml
-            **/*.abilist
-            **/CMakeConfigureLog.yaml
-            **/CMakeError.log
-            **/CMakeOutput.log
-            **/crash_diagnostics/*
-  stage2:
-    if: github.repository_owner == 'llvm'
-    runs-on: llvm-premerge-libcxx-runners
-    needs: [ stage1 ]
-    continue-on-error: false
-    strategy:
-      fail-fast: false
-      matrix:
-        config: [
-          'generic-cxx11',
-          'generic-cxx14',
-          'generic-cxx17',
-          'generic-cxx20',
-          'generic-cxx23'
-        ]
-        cc: [ 'clang-21' ]
-        cxx: [ 'clang++-21' ]
-        include:
-          - config: 'generic-gcc-cxx11'
-            cc: 'gcc-15'
-            cxx: 'g++-15'
-          - config: 'generic-cxx26'
-            cc: 'clang-20'
-            cxx: 'clang++-20'
-          - config: 'generic-cxx26'
-            cc: 'clang-19'
-            cxx: 'clang++-19'
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: ${{ matrix.config }}
-        run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
-        env:
-          CC: ${{ matrix.cc }}
-          CXX: ${{ matrix.cxx }}
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
-        if: always()  # Upload artifacts even if the build or test suite fails
-        with:
-          name: ${{ matrix.config }}-${{ matrix.cxx }}-results
-          path: |
-            **/test-results.xml
-            **/*.abilist
-            **/CMakeConfigureLog.yaml
-            **/CMakeError.log
-            **/CMakeOutput.log
-            **/crash_diagnostics/*
-  stage3:
-    if: github.repository_owner == 'llvm'
-    needs: [ stage2 ]
-    continue-on-error: false
-    strategy:
-      fail-fast: false
-      max-parallel: 8
-      matrix:
-        config: [
-          'generic-abi-unstable',
-          'generic-hardening-mode-debug',
-          'generic-hardening-mode-extensive',
-          'generic-hardening-mode-fast',
-          'generic-hardening-mode-fast-with-abi-breaks',
-          'generic-merged',
-          'generic-modules-cxx17-lsv',
-          'generic-no-exceptions',
-          'generic-no-experimental',
-          'generic-no-filesystem',
-          'generic-no-localization',
-          'generic-no-terminal',
-          'generic-no-random_device',
-          'generic-no-threads',
-          'generic-no-tzdb',
-          'generic-no-unicode',
-          'generic-no-wide-characters',
-          'generic-no-rtti',
-          'generic-optimized-speed',
-          'generic-static',
-          'bootstrapping-build'
-        ]
-        machine: [ 'llvm-premerge-libcxx-runners' ]
-        include:
-        - config: 'generic-cxx26'
-          machine: llvm-premerge-libcxx-runners
-        - config: 'generic-asan'
-          machine: llvm-premerge-libcxx-runners
-        - config: 'generic-tsan'
-          machine: llvm-premerge-libcxx-runners
-        - config: 'generic-ubsan'
-          machine: llvm-premerge-libcxx-runners
-        # Use a larger machine for MSAN to avoid timeout and memory allocation issues.
-        - config: 'generic-msan'
-          machine: llvm-premerge-libcxx-runners
-    runs-on: ${{ matrix.machine }}
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: ${{ matrix.config }}
-        run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
-        env:
-          CC: clang-21
-          CXX: clang++-21
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
-        if: always()
-        with:
-          name: ${{ matrix.config }}-results
-          path: |
-            **/test-results.xml
-            **/*.abilist
-            **/CMakeConfigureLog.yaml
-            **/CMakeError.log
-            **/CMakeOutput.log
-            **/crash_diagnostics/*
-
-  macos:
-    needs: [ stage2 ]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-        - config: generic-cxx03
-          os: macos-15
-        - config: generic-cxx23
-          os: macos-15
-        - config: generic-modules
-          os: macos-15
-        - config: apple-configuration
-          os: macos-15
-        # TODO: These jobs are intended to test back-deployment (building against ToT libc++ but running against an
-        #       older system-provided libc++.dylib). Doing this properly would require building the test suite on a
-        #       recent macOS using a recent Clang (hence recent Xcode), and then running the actual test suite on an
-        #       older mac. We could do that by e.g. sharing artifacts between the two jobs.
-        #
-        #       However, our Lit configuration currently doesn't provide a good way to do that in a batch, so our only
-        #       alternative is to actually build on the same host that we're going to run on. Sadly, that doesn't work
-        #       since older macOSes don't support newer Xcodes. For now, we run the "backdeployment" jobs on recent
-        #       macOS versions as a way to avoid rotting that configuration, but it doesn't provide a lot of additional
-        #       coverage.
-        - config: apple-system
-          os: macos-15
-        - config: apple-system-hardened
-          os: macos-15
-    runs-on: ${{ matrix.os }}
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0
-        with:
-          # https://github.com/actions/runner-images/blob/main/images/macos/macos-15-Readme.md
-          xcode-version: '16.3'
-      - uses: seanmiddleditch/gha-setup-ninja@3b1f8f94a2f8254bd26914c4ab9474d4f0015f67 # v6
-      - name: Build and test
-        run: |
-          python3 -m venv .venv
-          source .venv/bin/activate
-          python -m pip install psutil
-          bash libcxx/utils/ci/run-buildbot ${{ matrix.config }}
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
-        if: always()  # Upload artifacts even if the build or test suite fails
-        with:
-          name: macos-${{ matrix.config }}-results
-          path: |
-            **/test-results.xml
-            **/*.abilist
-            **/CMakeConfigureLog.yaml
-            **/CMakeError.log
-            **/CMakeOutput.log
-            **/crash_diagnostics/*
-
-  windows:
-    runs-on: windows-2022
-    needs: [ stage2 ]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-        - { config: clang-cl-dll, mingw: false }
-        - { config: clang-cl-static, mingw: false }
-        - { config: clang-cl-no-vcruntime, mingw: false }
-        - { config: clang-cl-debug, mingw: false }
-        - { config: clang-cl-static-crt, mingw: false }
-        - { config: mingw-dll, mingw: true }
-        - { config: mingw-static, mingw: true }
-        - { config: mingw-dll-i686, mingw: true }
-        - { config: mingw-incomplete-sysroot, mingw: true }
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Install dependencies
-        run: |
-          choco install -y ninja
-          pip install psutil
-      - name: Install a current LLVM
-        if: ${{ matrix.mingw != true }}
-        run: |
-          choco install -y llvm --version=19.1.7 --allow-downgrade
-      - name: Install llvm-mingw
-        if: ${{ matrix.mingw == true }}
-        run: |
-          curl -LO https://github.com/mstorsjo/llvm-mingw/releases/download/20250114/llvm-mingw-20250114-ucrt-x86_64.zip
-          powershell Expand-Archive llvm-mingw*.zip -DestinationPath .
-          del llvm-mingw*.zip
-          mv llvm-mingw* c:\llvm-mingw
-          echo "c:\llvm-mingw\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Simulate a from-scratch build of llvm-mingw
-        if: ${{ matrix.config == 'mingw-incomplete-sysroot' }}
-        run: |
-          rm -r c:\llvm-mingw\include\c++
-          rm -r c:\llvm-mingw\*-w64-mingw32\lib\libc++*
-          rm -r c:\llvm-mingw\*-w64-mingw32\lib\libunwind*
-      - name: Add Git Bash to the path
-        run: |
-          echo "c:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append
-      - name: Set up the MSVC dev environment
-        if: ${{ matrix.mingw != true }}
-        uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
-      - name: Build and test
-        run: |
-          bash libcxx/utils/ci/run-buildbot ${{ matrix.config }}
diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml
deleted file mode 100644
index 564a79341edb..000000000000
--- a/.github/workflows/libcxx-build-containers.yml
+++ /dev/null
@@ -1,70 +0,0 @@
-# This file defines an action that builds the various Docker images used to run
-# libc++ CI whenever modifications to those Docker files are pushed to `main`.
-#
-# The images are pushed to the LLVM package registry at https://github.com/orgs/llvm/packages
-# and tagged appropriately. The selection of which Docker image version is used by the libc++
-# CI nodes at any given point is controlled from the workflow files themselves.
-
-name: Build Docker images for libc++ CI
-
-permissions:
-  contents: read
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - 'libcxx/utils/ci/**'
-      - '.github/workflows/libcxx-build-containers.yml'
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'libcxx/utils/ci/**'
-      - '.github/workflows/libcxx-build-containers.yml'
-
-jobs:
-  build-and-push:
-    runs-on: ubuntu-24.04
-    if: github.repository_owner == 'llvm'
-    permissions:
-      packages: write
-
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-    - name: Build the Linux builder image
-      working-directory: libcxx/utils/ci
-      run: docker compose build actions-builder
-      env:
-        TAG: ${{ github.sha }}
-
-    # - name: Build the Android builder image
-    #   working-directory: libcxx/utils/ci
-    #   run: docker compose build android-buildkite-builder
-    #   env:
-    #     TAG: ${{ github.sha }}
-
-    - name: Log in to GitHub Container Registry
-      uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
-      with:
-        registry: ghcr.io
-        username: ${{ github.actor }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-
-    - name: Push the Linux builder image
-      if: github.event_name == 'push'
-      working-directory: libcxx/utils/ci
-      run: |
-        docker compose push actions-builder
-      env:
-        TAG: ${{ github.sha }}
-
-    # - name: Push the Android builder image
-    #   if: github.event_name == 'push'
-    #   working-directory: libcxx/utils/ci
-    #   run: |
-    #     docker compose push android-buildkite-builder
-    #   env:
-    #     TAG: ${{ github.sha }}
diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml
deleted file mode 100644
index 0226edd7aa17..000000000000
--- a/.github/workflows/libcxx-check-generated-files.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-name: "Check libc++ generated files"
-on:
-  pull_request:
-    paths:
-      - 'libcxx/**'
-
-permissions:
-  contents: read
-
-jobs:
-  check_generated_files:
-    runs-on: ubuntu-24.04
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Install dependencies
-        uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
-        with:
-          clangformat: 17.0.1
-          ninja: true
-
-      - name: Check generated files
-        run: libcxx/utils/ci/run-buildbot check-generated-output
diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml
deleted file mode 100644
index 5470662c9762..000000000000
--- a/.github/workflows/llvm-bugs.yml
+++ /dev/null
@@ -1,63 +0,0 @@
-name: LLVM Bugs notifier
-
-permissions:
-  contents: read
-  issues: read
-
-on:
-  issues:
-    types:
-      - opened
-
-jobs:
-  auto-subscribe:
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - uses: actions/setup-node@1d0ff469b7ec7b3cb9d8673fde0c81c44821de2a # v4.2.0
-        with:
-          node-version: 18
-          check-latest: true
-      - run: npm install mailgun.js form-data
-      - name: Send notification
-        uses: actions/github-script@d7906e4ad0b1822421a7e6a35d5ca353c962f410 # v6.4.1
-        env:
-          MAILGUN_API_KEY: ${{ secrets.LLVM_BUGS_KEY }}
-        with:
-          script: |
-            const Mailgun = require('mailgun.js');
-            const formData = require('form-data');
-
-            const mailgun = new Mailgun(formData);
-            const DOMAIN = 'email.llvm.org';
-
-            const mg = mailgun.client({ username: 'api', key: process.env.MAILGUN_API_KEY });
-
-            github.rest.issues.get({
-              issue_number: context.issue.number,
-              owner: context.repo.owner,
-              repo: context.repo.repo
-            })
-            .then((issue) => {
-              const payload = {
-                author : issue.data.user.login,
-                issue  : issue.data.number,
-                title  : issue.data.title,
-                url    : issue.data.html_url,
-                labels : issue.data.labels.map((label) => label.name),
-                assignee : issue.data.assignees.map((assignee) => assignee.login),
-                body   : issue.data.body
-              };
-
-              const data = {
-                from: 'LLVM Bugs <llvm-bugs@email.llvm.org>',
-                to: 'llvm-bugs@lists.llvm.org',
-                subject: `[Bug ${issue.data.number}] ${issue.data.title}`,
-                template: 'new-github-issue',
-                'o:tracking-clicks': 'no',
-                'h:X-Mailgun-Variables': JSON.stringify(payload)
-              };
-
-              return mg.messages.create(DOMAIN, data);
-            })
-            .then((msg) => console.log(msg));
diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml
deleted file mode 100644
index d40ed5babb45..000000000000
--- a/.github/workflows/llvm-project-tests.yml
+++ /dev/null
@@ -1,149 +0,0 @@
-name: LLVM Project Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-    inputs:
-      build_target:
-        required: false
-      projects:
-        required: false
-      extra_cmake_args:
-        required: false
-      os_list:
-        required: false
-        default: '["ubuntu-24.04", "windows-2019", "macOS-13"]'
-      python_version:
-        required: false
-        type: string
-        default: '3.11'
-  workflow_call:
-    inputs:
-      build_target:
-        required: false
-        type: string
-        default: "all"
-
-      projects:
-        required: true
-        type: string
-
-      extra_cmake_args:
-        required: false
-        type: string
-
-      os_list:
-        required: false
-        type: string
-        # Use windows-2019 due to:
-        # https://developercommunity.visualstudio.com/t/Prev-Issue---with-__assume-isnan-/1597317
-        default: '["ubuntu-24.04", "windows-2019", "macOS-13"]'
-
-      python_version:
-        required: false
-        type: string
-        default: '3.11'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  # If the group name here is the same as the group name in the workflow that includes
-  # this one, then the action will try to wait on itself and get stuck.
-  group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}-${{ inputs.python_version }}${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  lit-tests:
-    name: Lit Tests
-    runs-on: ${{ matrix.os }}
-    container:
-      image: ${{(startsWith(matrix.os, 'ubuntu') && 'ghcr.io/llvm/ci-ubuntu-24.04:latest') || null}}
-      volumes:
-        - /mnt/:/mnt/
-    strategy:
-      fail-fast: false
-      matrix:
-        os: ${{ fromJSON(inputs.os_list) }}
-    steps:
-      - name: Setup Windows
-        if: startsWith(matrix.os, 'windows')
-        uses: llvm/actions/setup-windows@main
-        with:
-          arch: amd64
-      # On Windows, starting with win19/20220814.1, cmake choose the 32-bit
-      # python3.10.6 libraries instead of the 64-bit libraries when building
-      # lldb.  Using this setup-python action to make 3.10 the default
-      # python fixes this.
-      - name: Setup Python
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-        with:
-          python-version: ${{ inputs.python_version }}
-      - name: Install Ninja
-        if: runner.os != 'Linux'
-        uses: llvm/actions/install-ninja@main
-      # actions/checkout deletes any existing files in the new git directory,
-      # so this needs to either run before ccache-action or it has to use
-      # clean: false.
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 250
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          # A full build of llvm, clang, lld, and lldb takes about 250MB
-          # of ccache space. There's not much reason to have more than this,
-          # because we usually won't need to save cache entries from older
-          # builds.  Also, there is an overall 10GB cache limit, and each
-          # run creates a new cache entry so we want to ensure that we have
-          # enough cache space for all the tests to run at once and still
-          # fit under the 10 GB limit.
-          # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174
-          max-size: 2G
-          key: ${{ matrix.os }}
-          variant: sccache
-      - name: Build and Test
-        env:
-          # Workaround for https://github.com/actions/virtual-environments/issues/5900.
-          # This should be a no-op for non-mac OSes
-          PKG_CONFIG_PATH: /usr/local/Homebrew/Library/Homebrew/os/mac/pkgconfig//12
-        shell: bash
-        id: build-llvm
-        run: |
-          if [ "${{ runner.os }}" == "Linux" ]; then
-            builddir="/mnt/build/"
-            sudo mkdir -p $builddir
-            sudo chown gha $builddir
-            extra_cmake_args="-DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang"
-          else
-            builddir="$(pwd)"/build
-          fi
-          if [ "${{ runner.os }}" == "macOS" ]; then
-            # Workaround test failure on some lld tests on MacOS
-            # https://github.com/llvm/llvm-project/issues/81967
-            extra_cmake_args="-DLLVM_DISABLE_ASSEMBLY_FILES=ON"
-          fi
-          echo "llvm-builddir=$builddir" >> "$GITHUB_OUTPUT"
-          cmake -G Ninja \
-                -B "$builddir" \
-                -S llvm \
-                -DLLVM_ENABLE_PROJECTS="${{ inputs.projects }}" \
-                -DCMAKE_BUILD_TYPE=Release \
-                -DLLVM_ENABLE_ASSERTIONS=ON \
-                -DLLDB_INCLUDE_TESTS=OFF \
-                -DLIBCLC_TARGETS_TO_BUILD="amdgcn--;amdgcn--amdhsa;r600--;nvptx--;nvptx64--;nvptx--nvidiacl;nvptx64--nvidiacl" \
-                -DCMAKE_C_COMPILER_LAUNCHER=sccache \
-                -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
-                $extra_cmake_args \
-                ${{ inputs.extra_cmake_args }}
-          ninja -C "$builddir" '${{ inputs.build_target }}'
-
-      - name: Build and Test libclc
-        if: "!startsWith(matrix.os, 'windows') && contains(inputs.projects, 'libclc')"
-        env:
-          LLVM_BUILDDIR: ${{ steps.build-llvm.outputs.llvm-builddir }}
-        run: |
-          # The libclc tests don't have a generated check target so all we can
-          # do is build it.
-          ninja -C "$LLVM_BUILDDIR"
diff --git a/.github/workflows/llvm-project-workflow-tests.yml b/.github/workflows/llvm-project-workflow-tests.yml
deleted file mode 100644
index a2539b279be0..000000000000
--- a/.github/workflows/llvm-project-workflow-tests.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-# This workflow will test the llvm-project-tests workflow in PRs
-# targetting the main branch.  Since this workflow doesn't normally
-# run on main PRs, we need some way to test it to ensure new updates
-# don't break it.
-
-name: LLVM Workflow Test
-
-permissions:
-  contents: read
-
-on:
-  pull_request:
-    branches:
-      - 'main'
-    paths:
-      - '.github/workflows/llvm-project-tests.yml'
-      - '.github/workflows/llvm-project-workflow-tests.yml'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  llvm-test:
-    if: github.repository_owner == 'llvm'
-    name: Build and Test
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-all
-      projects: clang;lld;libclc;lldb
diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
deleted file mode 100644
index a9bd8db462cf..000000000000
--- a/.github/workflows/llvm-tests.yml
+++ /dev/null
@@ -1,185 +0,0 @@
-name: LLVM Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - 'release/**'
-    paths:
-      - 'llvm/**'
-      - '.github/workflows/llvm-tests.yml'
-  pull_request:
-    branches:
-      - 'release/**'
-    paths:
-      - 'llvm/**'
-      - '.github/workflows/llvm-tests.yml'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  abi-dump-setup:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    outputs:
-      BASELINE_REF: ${{ steps.vars.outputs.BASELINE_REF }}
-      ABI_HEADERS: ${{ steps.vars.outputs.ABI_HEADERS }}
-      BASELINE_VERSION_MAJOR: ${{ steps.vars.outputs.BASELINE_VERSION_MAJOR }}
-      BASELINE_VERSION_MINOR: ${{ steps.vars.outputs.BASELINE_VERSION_MINOR }}
-      LLVM_VERSION_MAJOR: ${{ steps.version.outputs.major }}
-      LLVM_VERSION_MINOR: ${{ steps.version.outputs.minor }}
-      LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
-    steps:
-      - name: Checkout source
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 250
-
-      - name: Get LLVM version
-        id: version
-        uses: ./.github/workflows/get-llvm-version
-
-      - name: Setup Variables
-        id: vars
-        run: |
-          # C++ ABI:
-          # 18.1.0 we aren't doing ABI checks.
-          # 18.1.1 We want to check 18.1.0.
-          # C ABI:
-          # 18.1.0 We want to check 17.0.x
-          # 18.1.1 We want to check 18.1.0
-          echo "BASELINE_VERSION_MINOR=1" >> "$GITHUB_OUTPUT"
-          if [ ${{ steps.version.outputs.patch }} -eq 0 ]; then
-            {
-              echo "BASELINE_VERSION_MAJOR=$(( ${{ steps.version.outputs.major }} - 1))"
-              echo "ABI_HEADERS=llvm-c"
-            } >> "$GITHUB_OUTPUT"
-          else
-            {
-              echo "BASELINE_VERSION_MAJOR=${{ steps.version.outputs.major }}"
-              echo "ABI_HEADERS=."
-            } >> "$GITHUB_OUTPUT"
-          fi
-
-  abi-dump:
-    if: github.repository_owner == 'llvm'
-    needs: abi-dump-setup
-    runs-on: ubuntu-24.04
-    strategy:
-      matrix:
-        name:
-          - build-baseline
-          - build-latest
-        include:
-          - name: build-baseline
-            llvm_version_major: ${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}
-            ref: llvmorg-${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MAJOR }}.${{ needs.abi-dump-setup.outputs.BASELINE_VERSION_MINOR }}.0
-            repo: llvm/llvm-project
-          - name: build-latest
-            llvm_version_major: ${{ needs.abi-dump-setup.outputs.LLVM_VERSION_MAJOR }}
-            ref: ${{ github.sha }}
-            repo: ${{ github.repository }}
-    steps:
-      - name: Install Ninja
-        uses: llvm/actions/install-ninja@main
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install abi-dumper autoconf pkg-config
-      - name: Install universal-ctags
-        run: |
-          git clone https://github.com/universal-ctags/ctags.git
-          cd ctags
-          ./autogen.sh
-          ./configure
-          sudo make install
-      - name: Download source code
-        uses: llvm/actions/get-llvm-project-src@main
-        with:
-          ref: ${{ matrix.ref }}
-          repo: ${{ matrix.repo }}
-      - name: Configure
-        run: |
-          mkdir install
-          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="" -DLLVM_BUILD_LLVM_DYLIB=ON -DCMAKE_C_FLAGS_DEBUG="-g1 -Og" -DCMAKE_CXX_FLAGS_DEBUG="-g1 -Og" -DCMAKE_INSTALL_PREFIX="$(pwd)"/install llvm
-      - name: Build
-        # Need to run install-LLVM twice to ensure the symlink is installed (this is a bug).
-        run: |
-          ninja -C build install-LLVM
-          ninja -C build install-LLVM
-          ninja -C build install-llvm-headers
-      - name: Dump ABI
-        run: |
-          if [ "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" ]; then
-            nm ./install/lib/libLLVM.so | awk "/T _LLVM/ || /T LLVM/ { print $3 }" | sort -u | sed -e "s/^_//g" | cut -d ' ' -f 3 > llvm.symbols
-            # Even though the -symbols-list option doesn't seem to filter out the symbols, I believe it speeds up processing, so I'm leaving it in.
-            export EXTRA_ARGS="-symbols-list llvm.symbols"
-          else
-            touch llvm.symbols
-          fi
-          abi-dumper $EXTRA_ARGS -lver ${{ matrix.ref }} -skip-cxx -public-headers ./install/include/${{ needs.abi-dump-setup.outputs.ABI_HEADERS }} -o ${{ matrix.ref }}.abi ./install/lib/libLLVM.so
-          # Remove symbol versioning from dumps, so we can compare across major versions.
-          sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
-      - name: Upload ABI file
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: ${{ matrix.name }}
-          path: ${{ matrix.ref }}.abi
-
-      - name: Upload symbol list file
-        if: matrix.name == 'build-baseline'
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: symbol-list
-          path: llvm.symbols
-
-  abi-compare:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    needs:
-      - abi-dump-setup
-      - abi-dump
-    steps:
-      - name: Download baseline
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8
-        with:
-          name: build-baseline
-          path: build-baseline
-      - name: Download latest
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8
-        with:
-          name: build-latest
-          path: build-latest
-      - name: Download symbol list
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # 4.1.8
-        with:
-          name: symbol-list
-          path: symbol-list
-
-      - name: Install abi-compliance-checker
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install abi-compliance-checker
-      - name: Compare ABI
-        run: |
-          if [ -s symbol-list/llvm.symbols ]; then
-            # This option doesn't seem to work with the ABI dumper, so passing it here.
-            export EXTRA_ARGS="-symbols-list symbol-list/llvm.symbols"
-          fi
-          # FIXME: Reading of gzip'd abi files on the GitHub runners stop
-          # working some time in March of 2021, likely due to a change in the
-          # runner's environment.
-          abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
-      - name: Upload ABI Comparison
-        if: always()
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: compat-report-${{ github.sha }}
-          path: compat_reports/
diff --git a/.github/workflows/merged-prs.yml b/.github/workflows/merged-prs.yml
deleted file mode 100644
index c77173638980..000000000000
--- a/.github/workflows/merged-prs.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-name: "Add buildbot information to first PRs from new contributors"
-
-permissions:
-  contents: read
-
-on:
-  # It's safe to use pull_request_target here, because we aren't checking out
-  # code from the pull request branch.
-  # See https://securitylab.github.com/research/github-actions-preventing-pwn-requests/
-  pull_request_target:
-    types:
-      - closed
-
-jobs:
-  buildbot_comment:
-    runs-on: ubuntu-24.04
-    permissions:
-      pull-requests: write
-    if: >-
-      (github.repository == 'llvm/llvm-project') &&
-      (github.event.pull_request.merged == true)
-    steps:
-      - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: llvm/utils/git/
-          ref: main
-
-      - name: Setup Automation Script
-        working-directory: ./llvm/utils/git/
-        run: |
-          pip install --require-hashes -r requirements.txt
-
-      - name: Add Buildbot information comment
-        working-directory: ./llvm/utils/git/
-        run: |
-          python3 ./github-automation.py \
-            --token '${{ secrets.GITHUB_TOKEN }}' \
-            pr-buildbot-information \
-            --issue-number "${{ github.event.pull_request.number }}" \
-            --author "${{ github.event.pull_request.user.login }}"
diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml
deleted file mode 100644
index 8480a657cc71..000000000000
--- a/.github/workflows/new-issues.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Labeling new issues
-on:
-  issues:
-    types: ['opened']
-
-permissions:
-  contents: read
-
-jobs:
-  automate-issues-labels:
-    permissions:
-      issues: write
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - uses: llvm/actions/issue-labeler@main
-        with:
-          repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
-          configuration-path: .github/new-issues-labeler.yml
-          include-title: 1
-          include-body: 0
-          sync-labels: 0
-          enable-versioned-regex: 0
diff --git a/.github/workflows/new-prs.yml b/.github/workflows/new-prs.yml
deleted file mode 100644
index 935598e410db..000000000000
--- a/.github/workflows/new-prs.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-name: "Labelling new pull requests"
-
-permissions:
-  contents: read
-
-on:
-  # It's safe to use pull_request_target here, because we aren't checking out
-  # code from the pull request branch.
-  # See https://securitylab.github.com/research/github-actions-preventing-pwn-requests/
-  pull_request_target:
-    types:
-      - opened
-      - reopened
-      - ready_for_review
-      - synchronize
-
-jobs:
-  greeter:
-    runs-on: ubuntu-24.04
-    permissions:
-      pull-requests: write
-    # Only comment on PRs that have been opened for the first time, by someone
-    # new to LLVM or to GitHub as a whole. Ideally we'd look for FIRST_TIMER
-    # or FIRST_TIME_CONTRIBUTOR, but this does not appear to work. Instead check
-    # that we do not have any of the other author associations.
-    # See https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=opened#pull_request
-    # for all the possible values.
-    if: >-
-      (github.repository == 'llvm/llvm-project') &&
-      (github.event.action == 'opened') &&
-      (github.event.pull_request.author_association != 'COLLABORATOR') &&
-      (github.event.pull_request.author_association != 'CONTRIBUTOR') &&
-      (github.event.pull_request.author_association != 'MANNEQUIN') &&
-      (github.event.pull_request.author_association != 'MEMBER') &&
-      (github.event.pull_request.author_association != 'OWNER')
-    steps:
-      - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: llvm/utils/git/
-          ref: main
-
-      - name: Setup Automation Script
-        working-directory: ./llvm/utils/git/
-        run: |
-          pip install --require-hashes -r requirements.txt
-
-      - name: Greet Author
-        working-directory: ./llvm/utils/git/
-        run: |
-          python3 ./github-automation.py \
-            --token '${{ secrets.GITHUB_TOKEN }}' \
-            pr-greeter \
-            --issue-number "${{ github.event.pull_request.number }}"
-
-  automate-prs-labels:
-    # Greet first so that only the author gets that notification.
-    needs: greeter
-    runs-on: ubuntu-24.04
-    # Ignore PRs with more than 10 commits.  Pull requests with a lot of
-    # commits tend to be accidents usually when someone made a mistake while trying
-    # to rebase.  We want to ignore these pull requests to avoid excessive
-    # notifications.
-    # always() means that even if greeter is skipped, this job will run.
-    if: >
-      always() && github.repository == 'llvm/llvm-project' &&
-      github.event.pull_request.draft == false &&
-      github.event.pull_request.commits < 10
-    steps:
-      - uses: actions/labeler@ac9175f8a1f3625fd0d4fb234536d26811351594 # v4.3.0
-        with:
-          configuration-path: .github/new-prs-labeler.yml
-          # workaround for https://github.com/actions/labeler/issues/112
-          sync-labels: ''
-          repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
deleted file mode 100644
index 70bcaafbd0cf..000000000000
--- a/.github/workflows/pr-code-format.yml
+++ /dev/null
@@ -1,92 +0,0 @@
-name: "Check code formatting"
-
-permissions:
-  contents: read
-
-on:
-  pull_request:
-    branches:
-      - main
-      - 'users/**'
-
-jobs:
-  code_formatter:
-    runs-on: ubuntu-24.04
-    timeout-minutes: 30
-    concurrency:
-      group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
-      cancel-in-progress: true
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 2
-
-      - name: Get changed files
-        id: changed-files
-        uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
-        with:
-          separator: ","
-          skip_initial_fetch: true
-          base_sha: 'HEAD~1'
-          sha: 'HEAD'
-
-      # We need to pull the script from the main branch, so that we ensure
-      # we get the latest version of this script.
-      - name: Fetch code formatting utils
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: ${{ github.repository }}
-          ref: ${{ github.base_ref }}
-          sparse-checkout: |
-            llvm/utils/git/requirements_formatting.txt
-            llvm/utils/git/code-format-helper.py
-          sparse-checkout-cone-mode: false
-          path: code-format-tools
-
-      - name: "Listed files"
-        env:
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-        run: |
-          echo "Formatting files:"
-          echo "$CHANGED_FILES"
-
-      - name: Install clang-format
-        uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
-        with:
-          clangformat: 20.1.8
-
-      - name: Setup Python env
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-        with:
-          python-version: '3.11'
-          cache: 'pip'
-          cache-dependency-path: 'code-format-tools/llvm/utils/git/requirements_formatting.txt'
-
-      - name: Install python dependencies
-        run: pip install -r code-format-tools/llvm/utils/git/requirements_formatting.txt
-
-      - name: Run code formatter
-        env:
-          GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }}
-          START_REV: ${{ github.event.pull_request.base.sha }}
-          END_REV: ${{ github.event.pull_request.head.sha }}
-          CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
-        # Create an empty comments file so the pr-write job doesn't fail.
-        run: |
-          echo "[]" > comments &&
-          python ./code-format-tools/llvm/utils/git/code-format-helper.py \
-            --write-comment-to-file \
-            --token ${{ secrets.GITHUB_TOKEN }} \
-            --issue-number $GITHUB_PR_NUMBER \
-            --start-rev HEAD~1 \
-            --end-rev HEAD \
-            --changed-files "$CHANGED_FILES"
-
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-        if: always()
-        with:
-          name: workflow-args
-          path: |
-            comments
diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml
deleted file mode 100644
index 57425e04ec2f..000000000000
--- a/.github/workflows/pr-request-release-note.yml
+++ /dev/null
@@ -1,49 +0,0 @@
-name: PR Request Release Note
-
-permissions:
-  contents: read
-
-on:
-  pull_request:
-    types:
-      - closed
-
-jobs:
-  request-release-note:
-    if: >-
-      github.repository_owner == 'llvm' &&
-      startsWith(github.ref, 'refs/heads/release')
-
-    runs-on: ubuntu-24.04
-    steps:
-      # We need to pull the script from the main branch, so that we ensure
-      # we get the latest version of this script.
-      - name: Checkout Scripts
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          sparse-checkout: |
-            llvm/utils/git/requirements.txt
-            llvm/utils/git/github-automation.py
-          sparse-checkout-cone-mode: false
-
-      - name: Install Dependencies
-        run: |
-          pip install --require-hashes -r llvm/utils/git/requirements.txt
-
-      - name: Request Release Note
-        env:
-          # We need to use an llvmbot token here, because we are mentioning a user.
-          GITHUB_TOKEN: ${{ github.token }}
-        run: |
-          python3 llvm/utils/git/github-automation.py \
-            --repo "$GITHUB_REPOSITORY" \
-            --token "$GITHUB_TOKEN" \
-            request-release-note \
-            --pr-number ${{ github.event.pull_request.number}}
-
-      - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-        if: always()
-        with:
-          name: workflow-args
-          path: |
-            comments
diff --git a/.github/workflows/pr-subscriber.yml b/.github/workflows/pr-subscriber.yml
deleted file mode 100644
index f558da8a8fe0..000000000000
--- a/.github/workflows/pr-subscriber.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: PR Subscriber
-
-on:
-  pull_request_target:
-    types:
-      - labeled
-
-permissions:
-  contents: read
-
-jobs:
-  auto-subscribe:
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - name: Checkout Automation Script
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          sparse-checkout: llvm/utils/git/
-          ref: main
-
-      - name: Setup Automation Script
-        working-directory: ./llvm/utils/git/
-        run: |
-          pip install --require-hashes -r requirements.txt
-
-      - name: Update watchers
-        working-directory: ./llvm/utils/git/
-        run: |
-          python3 ./github-automation.py \
-            --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \
-            pr-subscriber \
-            --issue-number "${{ github.event.number }}" \
-            --label-name "${{ github.event.label.name }}"
diff --git a/.github/workflows/pre-checkin.yml b/.github/workflows/pre-checkin.yml
new file mode 100644
index 000000000000..05e48d4f9e9a
--- /dev/null
+++ b/.github/workflows/pre-checkin.yml
@@ -0,0 +1,403 @@
+name: pre-checkin
+
+on:
+  pull_request:
+  workflow_call:
+    inputs:
+      repo_name:
+        required: true
+        type: string
+      pr_branch:
+        required: true
+        type: string
+
+jobs:
+  build:
+    runs-on: self-hosted
+    concurrency:
+      group: "${{ github.workflow }}-${{ github.event.pull_request.number }}"
+      cancel-in-progress: true
+    permissions:
+      contents: write
+      statuses: write
+
+    env:
+      REPO_NAME: "${{ inputs.repo_name || github.repository }}"  # inputs.* is empty on plain pull_request runs, so this falls back; github.event_name is the caller's event and never 'workflow_call'
+      PR_BRANCH: "${{ inputs.pr_branch || github.head_ref }}"  # same fallback rule as REPO_NAME
+      BASE_BRANCH_NAME: "${{ github.event.pull_request.base.ref }}"
+      SRC_DIR: "${{ github.workspace }}/src"
+      BUILD_DIR: "${{ github.workspace }}/build"
+      INSTALL_DIR: "${{ github.workspace }}/install"
+      CCACHE_DIR: "/local/mnt/workspace/cpullvm_ccache/${{ github.event.pull_request.base.ref }}"
+      CCACHE_BASEDIR: "${{ github.workspace }}/pr-${{ github.event.number }}"
+      BUILD_MODE: "Release"
+      ASSERTION_MODE: "OFF"
+      ARM32_LINUX_TRIPLE: "arm-linux-gnueabi"
+      AARCH64_LINUX_TRIPLE: "aarch64-linux-gnu"
+      COMPILER_RT_ARM32_LINUX_BUILDDIR: "${{ github.workspace }}/build/compiler-rt/arm32/linux"
+      COMPILER_RT_AARCH64_LINUX_BUILDDIR: "${{ github.workspace }}/build/compiler-rt/aarch64/linux"
+      COMPILER_RT_ARM32_LINUX_FLAGS: "--target=arm-linux-gnueabi -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+      COMPILER_RT_AARCH64_LINUX_FLAGS: "--sysroot=/usr/aarch64-linux-gnu --target=aarch64-linux-gnu -mcpu=cortex-a53"
+      ARM32_BM_TRIPLE: "arm-none-eabi"
+      AARCH64_BM_TRIPLE: "aarch64-none-elf"
+      COMPILER_RT_ARM32_BM_BUILDDIR: "${{ github.workspace }}/build/compiler-rt/arm32/baremetal"
+      COMPILER_RT_AARCH64_BM_BUILDDIR: "${{ github.workspace }}/build/compiler-rt/aarch64/baremetal"
+      COMPILER_RT_ARM32_BM_FLAGS: "--target=arm-none-eabi -mcpu=cortex-a9 -ffunction-sections -fdata-sections -mfloat-abi=softfp -mfpu=neon -nostdlibinc"
+      COMPILER_RT_AARCH64_BM_FLAGS: "--target=aarch64-none-elf -mcpu=cortex-a53 -ffunction-sections -fdata-sections -nostdlibinc"
+
+    steps:
+      - name: Workspace Cleanup
+        shell: bash
+        run: |
+          echo "Clean workspace dir..."
+          rm -rf "${GITHUB_WORKSPACE:?}"/*  # :? aborts the step instead of expanding to "/*" if the variable is ever empty
+
+      - name: Create source root
+        shell: bash
+        run: mkdir -p "${SRC_DIR}"
+
+      - name: Checkout qualcomm/cpullvm-toolchain
+        uses: actions/checkout@v4
+        with:
+          repository: qualcomm/cpullvm-toolchain
+          ref: "${{ env.BASE_BRANCH_NAME }}"
+          path: "${{ env.SRC_DIR }}/llvm-top"
+          fetch-depth: 0
+
+      - name: Fetch musl-embedded
+        shell: bash
+        run: |
+          set -euo pipefail
+          git clone https://github.com/qualcomm/musl-embedded.git "${SRC_DIR}/musl-embedded"  # use the job-level env var in the shell, as the sibling fetch steps do
+
+      - name: Fetch qualcomm/eld
+        shell: bash
+        run: |
+          set -euo pipefail
+          mkdir -p "${SRC_DIR}/llvm-top/llvm/tools"
+          git clone https://github.com/qualcomm/eld.git "${SRC_DIR}/llvm-top/llvm/tools/eld" -b "release/21.x"
+          cd "${SRC_DIR}/llvm-top/llvm/tools/eld"
+          git checkout 65ea860802c41ef5c0becff9750a350495de27b0
+
+      - name: Fetch and update PR repository
+        shell: bash
+        run: |
+          set -euo pipefail
+          echo "PR repository: ${REPO_NAME}"
+          if [[ "${REPO_NAME}" == */cpullvm-toolchain ]]; then
+            cd "${SRC_DIR}/llvm-top"
+          elif [[ "${REPO_NAME}" == */musl-embedded ]]; then
+            cd "${SRC_DIR}/musl-embedded"
+          else
+            echo "Unsupported repository: ${REPO_NAME}"
+            exit 1
+          fi
+          git fetch origin
+          git reset --hard "origin/${PR_BRANCH}"
+
+      - name: Apply patches
+        shell: bash
+        run: |
+          python3 "${SRC_DIR}/llvm-top/qualcomm-software/embedded/tools/patchctl.py" apply -f "${SRC_DIR}/llvm-top/qualcomm-software/embedded/patchsets.yml"
+
+      - name: Run CMake (LLVM)
+        shell: bash
+        run: |
+          mkdir -p "${BUILD_DIR}/llvm"
+          cd "${BUILD_DIR}/llvm"
+          mkdir -p "${CCACHE_DIR}"
+          ccache --zero-stats
+          ccache --set-config "max_size=50GB" "compression=true"
+          ccache --show-config
+
+          cmake -G Ninja -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
+            -DLLVM_TARGETS_TO_BUILD="ARM;AArch64" \
+            -DLLVM_EXTERNAL_PROJECTS=eld \
+            -DLLVM_EXTERNAL_ELD_SOURCE_DIR="llvm-top/llvm/tools/eld" \
+            -DLLVM_DEFAULT_TARGET_TRIPLE="aarch64-unknown-linux-gnueabi" \
+            -DLLVM_TARGET_ARCH="arm-linux-gnueabi" \
+            -DLLVM_LIT_ARGS="-v" -DLLVM_BUILD_RUNTIME="OFF" \
+            -DLIBCLANG_BUILD_STATIC="ON" -DLLVM_POLLY_LINK_INTO_TOOLS="ON" \
+            -DCMAKE_C_COMPILER_LAUNCHER="ccache" \
+            -DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \
+            -DCMAKE_C_COMPILER="clang" -DCMAKE_CXX_COMPILER="clang++" \
+            -DCMAKE_CXX_FLAGS="-stdlib=libc++" \
+            -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+            -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+            -DLLVM_ENABLE_PROJECTS="llvm;clang;polly;lld;mlir" \
+            "${SRC_DIR}/llvm-top/llvm"
+          ninja
+          ninja install
+
+      - name: llvm.tests
+        shell: bash
+        run: |
+          cd "${BUILD_DIR}/llvm"
+          export HOME=/tmp/home-dir
+          mkdir -p "$HOME"
+          ninja -v check-llvm
+
+      - name: lld.tests
+        shell: bash
+        run: |
+          cd "${BUILD_DIR}/llvm"
+          ninja -v check-lld
+
+      - name: polly.tests
+        shell: bash
+        run: |
+          cd "${BUILD_DIR}/llvm"
+          ninja -v check-polly
+
+      - name: eld.tests
+        shell: bash
+        run: |
+          cd "${BUILD_DIR}/llvm"
+          ninja -v check-eld
+
+      - name: clang.tests
+        shell: bash
+        run: |
+          cd "${BUILD_DIR}/llvm"
+          ninja -v check-clang
+
+      - name: Compute clang RESOURCE_DIR
+        id: resource_dir
+        shell: bash
+        run: |
+          echo "value=$("${INSTALL_DIR}/bin/clang" -print-resource-dir)" >> "${GITHUB_OUTPUT}"
+
+      - name: Compiler-rt arm-linux
+        shell: bash
+        env:
+          RESOURCE_DIR: "${{ steps.resource_dir.outputs.value }}"
+        run: |
+          mkdir -p "${COMPILER_RT_ARM32_LINUX_BUILDDIR}"
+          echo -e "\nCOMPILER_RT for arm-linux:"
+          cd "${COMPILER_RT_ARM32_LINUX_BUILDDIR}"
+          cmake -G Ninja \
+            -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+            -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+            -DCMAKE_ASM_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+            -DCMAKE_C_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+            -DCMAKE_CXX_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+            -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+            -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+            -DCMAKE_C_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+            -DCMAKE_CXX_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+            -DCMAKE_ASM_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+            -DCMAKE_SYSTEM_NAME=Generic \
+            -DCOMPILER_RT_BUILD_BUILTINS=ON \
+            -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
+            -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON \
+            -DCOMPILER_RT_OS_DIR="linux" \
+            -DCOMPILER_RT_TEST_TARGET_TRIPLE="${ARM32_LINUX_TRIPLE}" \
+            -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+            -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+            -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+            -DCXX_SUPPORTS_UNWINDLIB_NONE_FLAG:BOOL=OFF \
+            "${SRC_DIR}/llvm-top/compiler-rt"
+          ninja
+          ninja install
+
+      - name: Compiler-rt arm-baremetal
+        shell: bash
+        env:
+          RESOURCE_DIR: "${{ steps.resource_dir.outputs.value }}"
+        run: |
+          mkdir -p "${COMPILER_RT_ARM32_BM_BUILDDIR}"
+          echo -e "\nCOMPILER_RT for arm-baremetal:"
+          cd "${COMPILER_RT_ARM32_BM_BUILDDIR}"
+          cmake -G Ninja \
+            -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+            -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+            -DCMAKE_ASM_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+            -DCMAKE_C_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+            -DCMAKE_CXX_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+            -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+            -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+            -DCMAKE_C_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+            -DCMAKE_CXX_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+            -DCMAKE_ASM_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+            -DCMAKE_SYSTEM_NAME=Generic \
+            -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+            -DCOMPILER_RT_BAREMETAL_BUILD=ON \
+            -DCOMPILER_RT_BUILD_BUILTINS=ON \
+            -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
+            -DCOMPILER_RT_BUILD_PROFILE=OFF \
+            -DCOMPILER_RT_BUILD_SANITIZERS=OFF \
+            -DCOMPILER_RT_BUILD_XRAY=OFF \
+            -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${ARM32_BM_TRIPLE}" \
+            -DCOMPILER_RT_OS_DIR="baremetal" \
+            -DCOMPILER_RT_TEST_TARGET_TRIPLE="${ARM32_BM_TRIPLE}" \
+            -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+            -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+            -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+            -DCXX_SUPPORTS_UNWINDLIB_NONE_FLAG:BOOL=OFF \
+            "${SRC_DIR}/llvm-top/compiler-rt"
+          ninja -t clean all
+          ninja -v
+          ninja install
+
+      - name: Compiler-rt aarch64-linux
+        shell: bash
+        env:
+          RESOURCE_DIR: "${{ steps.resource_dir.outputs.value }}"
+        run: |
+          mkdir -p "${COMPILER_RT_AARCH64_LINUX_BUILDDIR}"
+          cd "${COMPILER_RT_AARCH64_LINUX_BUILDDIR}"
+          cmake -G Ninja \
+            -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+            -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+            -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+            -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+            -DCMAKE_C_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+            -DCMAKE_CXX_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+            -DCMAKE_ASM_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+            -DCMAKE_SYSTEM_NAME=Generic \
+            -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${AARCH64_LINUX_TRIPLE}" \
+            -DCOMPILER_RT_OS_DIR="linux" \
+            -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+            -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+            -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+            "${SRC_DIR}/llvm-top/compiler-rt"
+          ninja
+          ninja install
+
+      - name: Compiler-rt aarch64-baremetal
+        shell: bash
+        env:
+          RESOURCE_DIR: "${{ steps.resource_dir.outputs.value }}"
+        run: |
+          mkdir -p "${COMPILER_RT_AARCH64_BM_BUILDDIR}"
+          cd "${COMPILER_RT_AARCH64_BM_BUILDDIR}"
+          cmake -G Ninja \
+            -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+            -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+            -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+            -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+            -DCMAKE_C_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+            -DCMAKE_CXX_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+            -DCMAKE_ASM_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+            -DCMAKE_SYSTEM_NAME=Generic \
+            -DCOMPILER_RT_BAREMETAL_BUILD=ON \
+            -DCOMPILER_RT_BUILD_BUILTINS=ON \
+            -DCOMPILER_RT_BUILD_LIBFUZZER=OFF \
+            -DCOMPILER_RT_BUILD_PROFILE=OFF \
+            -DCOMPILER_RT_BUILD_SANITIZERS=OFF \
+            -DCOMPILER_RT_BUILD_XRAY=OFF \
+            -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${AARCH64_BM_TRIPLE}" \
+            -DCOMPILER_RT_OS_DIR="baremetal" \
+            -DCOMPILER_RT_TEST_TARGET_TRIPLE="${AARCH64_BM_TRIPLE}" \
+            -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+            -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+            -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+            -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+            "${SRC_DIR}/llvm-top/compiler-rt"
+          ninja -t clean all
+          ninja
+          ninja install
+
+      - name: musl-embedded
+        shell: bash
+        run: |
+          export PATH="${INSTALL_DIR}/bin:${PATH}"
+          MUSL_BUILDDIR="${SRC_DIR}/musl-embedded"
+          source "${MUSL_BUILDDIR}/qualcomm-software/config/component_list.sh"
+          for lib in "${musl_components[@]}"; do
+            libName=$(echo "${lib}" | awk -F".sh," '{print $1}')
+            dirName=$(echo "${lib}" | awk -F"," '{print $2}')
+            mkdir -p "${MUSL_BUILDDIR}"
+            cd "${MUSL_BUILDDIR}"
+            make distclean
+            bash -x "${MUSL_BUILDDIR}/qualcomm-software/config/linux/arm/${libName}.sh" --prefix="${INSTALL_DIR}/${dirName}/libc"
+            make
+            make install
+          done
+
+      - name: c++ libs  # builds libcxx, libcxxabi and libunwind once per baremetal variant listed in the loop below
+        shell: bash
+        run: |
+          export PATH="${INSTALL_DIR}/bin:${PATH}"
+          declare -A Triples=(
+            ["aarch64-none-elf"]="aarch64-none-elf"
+            ["aarch64-pacret-b-key-bti-none-elf"]="aarch64-none-elf"
+            ["armv7-none-eabi"]="armv7-none-eabi"
+          )
+          # Variant-specific flags; they are appended to the common CMAKE_CFLAGS inside the loop.
+          declare -A CFLAGS=(
+            ["aarch64-none-elf"]="-mcpu=cortex-a53 -nostartfiles"
+            ["aarch64-pacret-b-key-bti-none-elf"]="-mcpu=cortex-a53 -nostartfiles -march=armv8.5-a -mbranch-protection=pac-ret+leaf+b-key+bti"
+            ["armv7-none-eabi"]="-mcpu=cortex-a9 -mthumb -specs=nosys.specs"
+          )
+
+          CFLAGS_RELEASE="-Os -DNDEBUG"
+
+          for VARIANT in "aarch64-none-elf" "aarch64-pacret-b-key-bti-none-elf" "armv7-none-eabi"; do
+            TRIPLE="${Triples[${VARIANT}]}"  # several variants may share one target triple
+            MUSL_INC="${INSTALL_DIR}/${TRIPLE}/libc/include"
+            CMAKE_CFLAGS="-target ${TRIPLE} -nostdinc -isystem ${MUSL_INC} -ccc-gcc-name ${TRIPLE}-g++ -fno-unroll-loops -fno-optimize-sibling-calls -ffunction-sections -fdata-sections -fno-exceptions -D_GNU_SOURCE ${CFLAGS[${VARIANT}]}"
+            # ${TRIPLE} is also passed as LIBUNWIND_TARGET_TRIPLE so libunwind is configured for the same triple as the compile flags.
+            mkdir -p "${BUILD_DIR}/${VARIANT}"
+            pushd "${BUILD_DIR}/${VARIANT}"
+            cmake -G Ninja \
+              -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/${VARIANT}" \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_C_COMPILER=clang \
+              -DCMAKE_CXX_COMPILER=clang++ \
+              -DHAVE_LIBCXXABI=True \
+              -DCMAKE_SYSTEM_NAME=Generic \
+              -DCMAKE_C_FLAGS_RELEASE="${CFLAGS_RELEASE}" \
+              -DCMAKE_CXX_FLAGS_RELEASE="${CFLAGS_RELEASE}" \
+              -DCMAKE_C_FLAGS="${CMAKE_CFLAGS}" \
+              -DCMAKE_CXX_FLAGS="${CMAKE_CFLAGS}" \
+              -DCMAKE_ASM_FLAGS="${CMAKE_CFLAGS}" \
+              -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY \
+              -DLIBCXX_ENABLE_SHARED=False \
+              -DLIBCXX_SHARED_OUTPUT_NAME="c++-shared" \
+              -DLIBCXX_ENABLE_EXCEPTIONS=False \
+              -DLIBCXX_HAS_MUSL_LIBC=True \
+              -DLIBCXX_ENABLE_ABI_LINKER_SCRIPT=False \
+              -DLIBCXX_ENABLE_THREADS=False \
+              -DLIBCXX_ENABLE_FILESYSTEM=False \
+              -DLIBCXX_ENABLE_RANDOM_DEVICE=False \
+              -DLIBCXX_ENABLE_LOCALIZATION=False \
+              -DLIBCXX_SUPPORTS_STD_EQ_CXX11_FLAG=ON \
+              -DLIBCXX_SUPPORTS_STD_EQ_CXX14_FLAG=ON \
+              -DLIBCXX_SUPPORTS_STD_EQ_CXX17_FLAG=ON \
+              -DLIBCXX_QUIC_BAREMETAL=ON \
+              -DLIBCXXABI_USE_LLVM_UNWINDER=True \
+              -DLIBCXXABI_BAREMETAL=True \
+              -DLIBCXXABI_ENABLE_SHARED=False \
+              -DLIBCXXABI_SHARED_OUTPUT_NAME="c++abi-shared" \
+              -DLIBCXXABI_ENABLE_WERROR=True \
+              -DLIBCXXABI_ENABLE_THREADS=False \
+              -DLIBCXXABI_ENABLE_ASSERTIONS=False \
+              -DLIBCXXABI_ENABLE_EXCEPTIONS=False \
+              -DLIBUNWIND_TARGET_TRIPLE="${TRIPLE}" \
+              -DLIBUNWIND_IS_BAREMETAL=True \
+              -DLIBUNWIND_ENABLE_SHARED=False \
+              -DLIBUNWIND_SHARED_OUTPUT_NAME="unwind-shared" \
+              -DUNIX=True \
+              -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
+              -S "${SRC_DIR}/llvm-top/runtimes"
+            ninja
+            ninja install
+            popd
+          done
+
+      - name: Tar the Toolchain
+        shell: bash
+        run: |
+          cd "${INSTALL_DIR}"
+          short_sha=$(echo "${GITHUB_SHA}" | cut -c1-7)
+          tar_file="PR${{ github.event.pull_request.number }}_${short_sha}.tgz"
+          tar -czvf "../build/${tar_file}" .
+
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
deleted file mode 100644
index f7a48304b82b..000000000000
--- a/.github/workflows/premerge.yaml
+++ /dev/null
@@ -1,171 +0,0 @@
-name: CI Checks
-
-permissions:
-  contents: read
-
-on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      # When a PR is closed, we still start this workflow, but then skip
-      # all the jobs, which makes it effectively a no-op.  The reason to
-      # do this is that it allows us to take advantage of concurrency groups
-      # to cancel in progress CI jobs whenever the PR is closed.
-      - closed
-  push:
-    branches:
-      - 'release/**'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
-  cancel-in-progress: true
-
-jobs:
-  premerge-checks-linux:
-    name: Build and Test Linux
-    if: >-
-        github.repository_owner == 'llvm' &&
-        (github.event_name != 'pull_request' || github.event.action != 'closed')
-    runs-on: llvm-premerge-linux-runners
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 2
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          max-size: "2000M"
-      - name: Build and Test
-        # Mark the job as a success even if the step fails so that people do
-        # not get notified while the new premerge pipeline is in an
-        # experimental state.
-        run: |
-          git config --global --add safe.directory '*'
-
-          source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py)
-
-          if [[ "${projects_to_build}" == "" ]]; then
-            echo "No projects to build"
-            exit 0
-          fi
-
-          echo "Building projects: ${projects_to_build}"
-          echo "Running project checks targets: ${project_check_targets}"
-          echo "Building runtimes: ${runtimes_to_build}"
-          echo "Running runtimes checks targets: ${runtimes_check_targets}"
-          echo "Running runtimes checks requiring reconfiguring targets: ${runtimes_check_targets_needs_reconfig}"
-
-          export CC=/opt/llvm/bin/clang
-          export CXX=/opt/llvm/bin/clang++
-
-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
-      - name: Upload Artifacts
-        if: '!cancelled()'
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: Premerge Artifacts (Linux)
-          path: artifacts/
-          retention-days: 5
-          include-hidden-files: 'true'
-
-  premerge-checks-windows:
-    name: Build and Test Windows
-    if: >-
-        github.repository_owner == 'llvm' &&
-        (github.event_name != 'pull_request' || github.event.action != 'closed')
-    runs-on: llvm-premerge-windows-runners
-    defaults:
-      run:
-        shell: bash
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 2
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          variant: "sccache"
-          max-size: "2000M"
-      - name: Compute Projects
-        id: vars
-        run: |
-          source <(git diff --name-only HEAD~1...HEAD | python .ci/compute_projects.py)
-
-          if [[ "${projects_to_build}" == "" ]]; then
-            echo "No projects to build"
-          fi
-
-          echo "Building projects: ${projects_to_build}"
-          echo "Running project checks targets: ${project_check_targets}"
-
-          echo "windows-projects=${projects_to_build}" >> $GITHUB_OUTPUT
-          echo "windows-check-targets=${project_check_targets}" >> $GITHUB_OUTPUT
-      - name: Build and Test
-        # Mark the job as a success even if the step fails so that people do
-        # not get notified while the new premerge pipeline is in an
-        # experimental state.
-        if: ${{ steps.vars.outputs.windows-projects != '' }}
-        shell: cmd
-        run: |
-          call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64
-          bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}"
-      - name: Upload Artifacts
-        if: '!cancelled()'
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: Premerge Artifacts (Windows)
-          path: artifacts/
-          retention-days: 5
-          include-hidden-files: 'true'
-
-  premerge-check-macos:
-    name: MacOS Premerge Checks
-    runs-on: macos-14
-    if: >-
-      github.repository_owner == 'llvm' &&
-      (startswith(github.ref_name, 'release/') ||
-       startswith(github.base_ref, 'release/')) &&
-      (github.event_name != 'pull_request' || github.event.action != 'closed')
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 2
-      - name: Setup ccache
-        uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17
-        with:
-          max-size: "2000M"
-      - name: Install Ninja
-        uses: llvm/actions/install-ninja@main
-      - name: Build and Test
-        run: |
-          source <(git diff --name-only HEAD~2..HEAD | python3 .ci/compute_projects.py)
-
-          if [[ "${projects_to_build}" == "" ]]; then
-            echo "No projects to build"
-            exit 0
-          fi
-
-          echo "Building projects: ${projects_to_build}"
-          echo "Running project checks targets: ${project_check_targets}"
-
-          # -DLLVM_DISABLE_ASSEMBLY_FILES=ON is for
-          # https://github.com/llvm/llvm-project/issues/81967
-          # Disable sharding in lit so that the LIT_XFAIL environment var works.
-          cmake -G Ninja \
-                -B build \
-                -S llvm \
-                -DLLVM_ENABLE_PROJECTS="${projects_to_build}" \
-                -DLLVM_DISABLE_ASSEMBLY_FILES=ON \
-                -DCMAKE_BUILD_TYPE=Release \
-                -DLLDB_INCLUDE_TESTS=OFF \
-                -DLLVM_ENABLE_ASSERTIONS=ON \
-                -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-                -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
-
-          # The libcxx tests fail, so we are skipping the runtime targets.
-          ninja -C build ${project_check_targets}
diff --git a/.github/workflows/qcom-preflight-checks.yml b/.github/workflows/qcom-preflight-checks.yml
new file mode 100644
index 000000000000..e2a3bb8c2dc1
--- /dev/null
+++ b/.github/workflows/qcom-preflight-checks.yml
@@ -0,0 +1,26 @@
+name: Qualcomm Preflight Checks
+on:
+  # FIXME: revisit these branch targets once it is clearer what our workflow
+  # needs to look like.
+  pull_request_target:
+    branches: ["release/qualcomm-software/21.x"]
+  push:
+    branches: ["release/qualcomm-software/21.x"]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  security-events: write
+
+jobs:
+  qcom-preflight-checks:
+    uses: qualcomm/qcom-reusable-workflows/.github/workflows/qcom-preflight-checks-reusable-workflow.yml@v1.1.4
+    with:
+      # Preflight checkers; the trailing comments note each input's default.
+      repolinter: false                  # default: true
+      semgrep: true                      # default: true
+      copyright-license-detector: true   # default: true
+      pr-check-emails: true              # default: true
+      dependency-review: true            # default: true
+    secrets:
+      SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }}
diff --git a/.github/workflows/release-asset-audit.py b/.github/workflows/release-asset-audit.py
deleted file mode 100644
index 23b901a476dc..000000000000
--- a/.github/workflows/release-asset-audit.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import github
-import re
-import sys
-
-_SPECIAL_CASE_BINARIES = {
-    "keith": {"clang+llvm-18.1.8-arm64-apple-macos11.tar.xz"},
-}
-
-
-def _is_valid(uploader_name, valid_uploaders, asset_name):
-    if uploader_name in valid_uploaders:
-        return True
-
-    if uploader_name in _SPECIAL_CASE_BINARIES:
-        return asset_name in _SPECIAL_CASE_BINARIES[uploader_name]
-
-    return False
-
-
-def _get_uploaders(release_version):
-    # Until llvm 18, assets were uploaded by community members, the release managers
-    # and the GitHub Actions bot.
-    if release_version <= 18:
-        return set(
-            [
-                "DimitryAndric",
-                "stefanp-synopsys",
-                "lei137",
-                "omjavaid",
-                "nicolerabjohn",
-                "amy-kwan",
-                "mandlebug",
-                "zmodem",
-                "androm3da",
-                "tru",
-                "rovka",
-                "rorth",
-                "quinnlp",
-                "kamaub",
-                "abrisco",
-                "jakeegan",
-                "maryammo",
-                "tstellar",
-                "github-actions[bot]",
-            ]
-        )
-    # llvm 19 and beyond, only the release managers, bot and a much smaller
-    # number of community members.
-    elif release_version >= 19:
-        return set(
-            [
-                "zmodem",
-                "omjavaid",
-                "tru",
-                "tstellar",
-                "github-actions[bot]",
-            ]
-        )
-
-
-def _get_major_release_version(release_title):
-    # All release titles are of the form "LLVM X.Y.Z(-rcN)".
-    match = re.match("LLVM ([0-9]+)\.", release_title)
-    if match is None:
-        _write_comment_and_exit_with_error(
-            f'Could not parse release version from release title "{release_title}".'
-        )
-    else:
-        return int(match.groups()[0])
-
-
-def _write_comment_and_exit_with_error(comment):
-    with open("comment", "w") as file:
-        file.write(comment)
-    sys.exit(1)
-
-
-def main():
-    token = sys.argv[1]
-
-    gh = github.Github(login_or_token=token)
-    repo = gh.get_repo("llvm/llvm-project")
-
-    for release in repo.get_releases():
-        print("Release:", release.title)
-        uploaders = _get_uploaders(_get_major_release_version(release.title))
-        for asset in release.get_assets():
-            created_at = asset.created_at
-            updated_at = (
-                "" if asset.created_at == asset.updated_at else asset.updated_at
-            )
-            print(
-                f"{asset.name} : {asset.uploader.login} [{created_at} {updated_at}] ( {asset.download_count} )"
-            )
-            if not _is_valid(asset.uploader.login, uploaders, asset.name):
-                _write_comment_and_exit_with_error(
-                    f"@{asset.uploader.login} is not a valid uploader."
-                )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
deleted file mode 100644
index 8112d8a14081..000000000000
--- a/.github/workflows/release-asset-audit.yml
+++ /dev/null
@@ -1,54 +0,0 @@
-name: Release Asset Audit
-
-on:
-  workflow_dispatch:
-  release:
-  schedule:
-    # * is a special character in YAML so you have to quote this string
-    # Run once an hour
-    - cron:  '5 * * * *'
-
-  pull_request:
-    paths:
-      - ".github/workflows/release-asset-audit.py"
-      - ".github/workflows/release-asset-audit.yml"
-
-permissions:
-  contents: read # Default everything to read-only
-
-jobs:
-  audit:
-    name: "Release Asset Audit"
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    steps:
-      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 #v4.1.6
-      - name: "Run Audit Script"
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-        run: |
-          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-          python3 ./.github/workflows/release-asset-audit.py $GITHUB_TOKEN
-      - name: "File Issue"
-        if: >-
-          github.event_name != 'pull_request' &&
-          failure()
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
-        with:
-          github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}
-          script: |
-            var fs = require('fs');
-            var body = ''
-            if (fs.existsSync('./comment')) {
-              body = fs.readFileSync('./comment') + "\n\n";
-            }
-            body = body + `\n\nhttps://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`
-
-            const issue = await github.rest.issues.create({
-               owner: context.repo.owner,
-               repo: context.repo.repo,
-               title: "Release Asset Audit Failed",
-               labels: ['infrastructure'],
-               body: body
-            });
-            console.log(issue);
diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml
deleted file mode 100644
index 0b52a08202f1..000000000000
--- a/.github/workflows/release-binaries-all.yml
+++ /dev/null
@@ -1,105 +0,0 @@
-name: Release Binaries All
-
-permissions:
-  contents: read # Default everything to read-only
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload binaries to the release page'
-        required: true
-        default: false
-        type: boolean
-
-  workflow_call:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload binaries to the release page'
-        required: true
-        default: false
-        type: boolean
-    secrets:
-      RELEASE_TASKS_USER_TOKEN:
-        description: "Secret used to check user permissions."
-        required: false
-
-  pull_request:
-    types:
-      - opened
-      - synchronize
-      - reopened
-      # When a PR is closed, we still start this workflow, but then skip
-      # all the jobs, which makes it effectively a no-op.  The reason to
-      # do this is that it allows us to take advantage of concurrency groups
-      # to cancel in progress CI jobs whenever the PR is closed.
-      - closed
-    paths:
-      - '.github/workflows/release-binaries-all.yml'
-      - '.github/workflows/release-binaries.yml'
-      - '.github/workflows/release-binaries-setup-stage/*'
-      - '.github/workflows/release-binaries-save-stage/*'
-      - 'clang/cmake/caches/Release.cmake'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || 'dispatch' }}
-  cancel-in-progress: True
-
-jobs:
-  setup-variables:
-    if: >-
-      (github.event_name != 'pull_request' || github.event.action != 'closed')
-    runs-on: ubuntu-24.04
-    outputs:
-      release-version: ${{ steps.vars.outputs.release-version }}
-      upload: ${{ steps.vars.outputs.upload }}
-    steps:
-      - shell: bash
-        id: vars
-        run: |
-          upload="${{ inputs.upload }}"
-          release_version="${{ inputs.release-version }}"
-          if [ "${{ github.event_name }}" = "pull_request" ]; then
-            upload="false"
-            release_version=""
-          fi
-          echo "release-version=$release_version" >> "$GITHUB_OUTPUT"
-          echo "upload=$upload" >> "$GITHUB_OUTPUT"
-
-  release-binaries-all:
-    name: Build Release Binaries
-    needs:
-      - setup-variables
-    permissions:
-      contents: write # For release uploads
-      id-token: write     # For artifact attestations
-      attestations: write # For artifact attestations
-    strategy:
-      fail-fast: false
-      matrix:
-        # We use ubuntu-22.04 rather than the latest version to make the built
-        # binaries more portable (eg functional aginast older glibc).
-        runs-on:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-          - macos-13
-          - macos-14
-
-    uses: ./.github/workflows/release-binaries.yml
-    with:
-      release-version: "${{ needs.setup-variables.outputs.release-version }}"
-      upload: ${{ needs.setup-variables.outputs.upload == 'true'}}
-      runs-on: "${{ matrix.runs-on }}"
-    secrets:
-      # This will be empty for pull_request events, but that's fine, because
-      # the release-binaries workflow does not use this secret for the
-      # pull_request event.
-      RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml
deleted file mode 100644
index f08088c7bc56..000000000000
--- a/.github/workflows/release-binaries-save-stage/action.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-name: Save Stage
-description: >-
-  Upload the source and binary directories from a build stage so that they
-  can be re-used in the next stage.  This action is used to the release
-  binaries workflow into multiple stages to avoid the 6 hour timeout on
-  the GitHub hosted runners.
-inputs:
-  build-prefix:
-    description: "Directory containing the build directory."
-    required: true
-    type: 'string'
-
-permissions:
-  contents: read
-
-runs:
-  using: "composite"
-  steps:
-    # We need to create an archive of the build directory, because it has too
-    # many files to upload.
-    - name: Package Build and Source Directories
-      shell: bash
-      run: |
-        # Remove .git/config to avoid leaking GITHUB_TOKEN stored there.
-        # See https://unit42.paloaltonetworks.com/github-repo-artifacts-leak-tokens/
-        rm -Rf .git/config
-        # Windows does not support symlinks, so we need to dereference them.
-        tar --exclude build/ ${{ (runner.os == 'Windows' && '-h') || '' }} -c . | zstd -T0 -c > ../llvm-project.tar.zst
-        mv ../llvm-project.tar.zst .
-        tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst
-
-    - name: Upload Stage 1 Source
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      with:
-        name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source
-        path: llvm-project.tar.zst
-        retention-days: 2
-
-    - name: Upload Stage 1 Build Dir
-      uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      with:
-        name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build
-        path: build.tar.zst
-        retention-days: 2
diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml
deleted file mode 100644
index f5e5db27e659..000000000000
--- a/.github/workflows/release-binaries-setup-stage/action.yml
+++ /dev/null
@@ -1,59 +0,0 @@
-name: Setup Stage
-description: >-
-  Setup the next stage of the release binaries workflow.  This sets up the
-  environment correctly for a new stage of the release binaries workflow
-  and also restores the source and build directory from the previous stage.
-
-inputs:
-  previous-artifact:
-    description: >-
-      A unique descriptor for the artifact from the previous stage.  This will
-      be used to construct the final artifact pattern, which is:
-      $RUNNER_OS-$RUNNER_ARCH-$PREVIOUS_ARTIFACT-*
-    required: false
-    type: 'string'
-
-outputs:
-  build-prefix:
-    description: "Directory containing the build directory."
-    value: ${{ steps.build-prefix.outputs.build-prefix }}
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install Ninja
-      uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main
-   
-    - name: Setup Windows
-      if: startsWith(runner.os, 'Windows')
-      uses: llvm/actions/setup-windows@main
-      with:
-        arch: amd64
-
-    - name: Set Build Prefix
-      id: build-prefix
-      shell: bash
-      run: |
-        build_prefix=`pwd`
-        if [ "${{ runner.os }}" = "Linux" ]; then
-          sudo chown $USER:$USER /mnt/
-          build_prefix=/mnt/
-        fi
-        echo "build-prefix=$build_prefix" >> $GITHUB_OUTPUT
-
-    - name: Download Previous Stage Artifact
-      if: ${{ inputs.previous-artifact }}
-      id: download
-      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
-      with:
-        pattern: ${{ runner.os }}-${{ runner.arch }}-${{ inputs.previous-artifact }}-*
-        merge-multiple: true
-
-    - name: Unpack Artifact
-      if: ${{ steps.download.outputs.download-path }}
-      shell: bash
-      run: |
-        tar --zstd -xf llvm-project.tar.zst
-        rm llvm-project.tar.zst
-        tar --zstd -C ${{ steps.build-prefix.outputs.build-prefix}} -xf build.tar.zst
-        rm build.tar.zst
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
deleted file mode 100644
index c113b42dc8ed..000000000000
--- a/.github/workflows/release-binaries.yml
+++ /dev/null
@@ -1,357 +0,0 @@
-name: Release Binaries
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: false
-        type: string
-      upload:
-        description: 'Upload binaries to the release page'
-        required: true
-        default: false
-        type: boolean
-      runs-on:
-        description: "Runner to use for the build"
-        required: true
-        type: choice
-        # We use ubuntu-22.04 rather than the latest version to make the built
-        # binaries more portable (eg functional aginast older glibc).
-        options:
-          - ubuntu-22.04
-          - ubuntu-22.04-arm
-          - macos-13
-          - macos-14
-
-  workflow_call:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: false
-        type: string
-      upload:
-        description: 'Upload binaries to the release page'
-        required: true
-        default: false
-        type: boolean
-      runs-on:
-        description: "Runner to use for the build"
-        required: true
-        type: string
-    secrets:
-      RELEASE_TASKS_USER_TOKEN:
-        description: "Secret used to check user permissions."
-        required: false
-
-
-permissions:
-  contents: read # Default everything to read-only
-
-jobs:
-  prepare:
-    name: Prepare to build binaries
-    runs-on: ${{ inputs.runs-on }}
-    if: github.repository_owner == 'llvm'
-    outputs:
-      release-version: ${{ steps.vars.outputs.release-version }}
-      ref: ${{ steps.vars.outputs.ref }}
-      upload: ${{ steps.vars.outputs.upload }}
-      target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }}
-      ccache: ${{ steps.vars.outputs.ccache }}
-      build-flang: ${{ steps.vars.outputs.build-flang }}
-      release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }}
-      release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }}
-      build-runs-on: ${{ steps.vars.outputs.build-runs-on }}
-      test-runs-on: ${{ steps.vars.outputs.build-runs-on }}
-
-    steps:
-    # It's good practice to use setup-python, but this is also required on macos-14
-    # due to https://github.com/actions/runner-images/issues/10385
-    - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f
-      with:
-        python-version: '3.12'
-
-    - name: Checkout LLVM
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
-    - name: Install Dependencies
-      shell: bash
-      run: |
-        pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-
-    - name: Check Permissions
-      if: github.event_name != 'pull_request'
-      env:
-        GITHUB_TOKEN: ${{ github.token }}
-        USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-      shell: bash
-      run: |
-        ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user "$GITHUB_ACTOR" --user-token "$USER_TOKEN" check-permissions
-
-    - name: Collect Variables
-      id: vars
-      shell: bash
-      # In order for the test-release.sh script to run correctly, the LLVM
-      # source needs to be at the following location relative to the build dir:
-      # | X.Y.Z-rcN | ./rcN/llvm-project
-      # | X.Y.Z     | ./final/llvm-project
-      #
-      # We also need to set divergent flags based on the release version:
-      # | X.Y.Z-rcN | -rc N -test-asserts
-      # | X.Y.Z     | -final
-      run: |
-        trimmed=$(echo ${{ inputs.release-version }} | xargs)
-        if [ -n "$trimmed" ]; then
-          release_version="$trimmed"
-          ref="llvmorg-$release_version"
-        else
-          release_version="${{ (github.event_name == 'pull_request' && format('PR{0}', github.event.pull_request.number)) || 'CI'}}-$GITHUB_SHA"
-          ref="$GITHUB_SHA"
-        fi
-        if [ -n "${{ inputs.upload }}" ]; then
-          upload="${{ inputs.upload }}"
-        else
-          upload="false"
-        fi
-        echo "release-version=$release_version">> $GITHUB_OUTPUT
-        echo "ref=$ref" >> $GITHUB_OUTPUT
-        echo "upload=$upload" >> $GITHUB_OUTPUT
-
-        release_binary_basename="LLVM-$release_version-$RUNNER_OS-$RUNNER_ARCH"
-        echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT
-        echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT
-
-        target="$RUNNER_OS-$RUNNER_ARCH"
-        # The hendrikmuhs/ccache-action action does not support installing sccache
-        # on arm64 Linux.
-        if [ "$target" = "Linux-ARM64" ]; then
-          echo ccache=ccache >> $GITHUB_OUTPUT
-        else
-          echo ccache=sccache >> $GITHUB_OUTPUT
-        fi
-
-        # The macOS builds try to cross compile some libraries so we need to
-        # add extra CMake args to disable them.
-        # See https://github.com/llvm/llvm-project/issues/99767
-        if [ "$RUNNER_OS" = "macOS" ]; then
-          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_COMPILER_RT_ENABLE_IOS=OFF"
-          if [ "$RUNNER_ARCH" = "ARM64" ]; then
-            arches=arm64
-          else
-            arches=x86_64
-            # Disable Flang builds on macOS x86_64.  The FortranLower library takes
-            # 2-3 hours to build on macOS, much slower than on Linux.
-            # The long build time causes the release build to time out on x86_64,
-            # so we need to disable flang there.
-            target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_PROJECTS='clang;lld;lldb;clang-tools-extra;polly;mlir'"
-          fi
-          target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches"
-        fi
-
-        build_flang="true"
-
-        if [ "$RUNNER_OS" = "Windows" ]; then
-          # The build times out on Windows, so we need to disable LTO.
-          target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF"
-        fi
-
-        echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT
-        echo "build-flang=$build_flang" >> $GITHUB_OUTPUT
-        case "${{ inputs.runs-on }}" in
-          ubuntu-22.04*)
-            build_runs_on="depot-${{ inputs.runs-on }}-16"
-            test_runs_on=$build_runs_on
-            ;;
-          macos-13)
-            if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
-              build_runs_on="${{ inputs.runs-on }}"
-            else
-              build_runs_on="macos-13-large"
-            fi
-            test_runs_on="${{ inputs.runs-on }}"
-            ;;
-          macos-14)
-            if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
-              build_runs_on="${{ inputs.runs-on }}"
-            else
-              build_runs_on="depot-macos-14"
-            fi
-            test_runs_on="${{ inputs.runs-on }}"
-            ;;
-          *)
-            test_runs_on="${{ inputs.runs-on }}"
-            build_runs_on=$test_runs_on
-            ;;
-        esac
-        echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT
-        echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT
-
-  build-release-package:
-    name: "Build Release Package"
-    needs: prepare
-    if: github.repository_owner == 'llvm'
-    runs-on: ${{ needs.prepare.outputs.build-runs-on }}
-    steps:
-
-    - name: Checkout Actions
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
-        sparse-checkout: |
-          .github/workflows/
-        sparse-checkout-cone-mode: false
-        # Check out outside of working directory so the source checkout doesn't
-        # remove it.
-        path: workflows
-
-    # actions/checkout does not support paths outside of the GITHUB_WORKSPACE.
-    # Also, anything that we put inside of GITHUB_WORKSPACE will be overwritten
-    # by future actions/checkout steps.  Therefore, in order to checkout the
-    # latest actions from main, we need to first checkout out the actions inside of
-    # GITHUB_WORKSPACE (see previous step), then use actions/checkout to checkout
-    # the code being built and the move the actions from main back into GITHUB_WORKSPACE,
-    # becasue the uses on composite actions only reads workflows from inside GITHUB_WORKSPACE.
-    - shell: bash
-      run: mv workflows  ../workflows-main
-
-    - name: Checkout LLVM
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        ref: ${{ needs.prepare.outputs.ref }}
-
-    - name: Copy main workflows
-      shell: bash
-      run: |
-        mv ../workflows-main .
-
-    - name: Setup Stage
-      id: setup-stage
-      uses: ./workflows-main/.github/workflows/release-binaries-setup-stage
-
-    - name: Configure
-      id: build
-      shell: bash
-      env:
-        CCACHE_BIN: ${{ needs.prepare.outputs.ccache }}
-      run: |
-        # There were some issues on the ARM64 MacOS runners with trying to build x86 object,
-        # so we need to set some extra cmake flags to disable this.
-        cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \
-            ${{ needs.prepare.outputs.target-cmake-flags }} \
-            -C clang/cmake/caches/Release.cmake \
-            -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \
-            -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}"
-
-    - name: Build
-      shell: bash
-      run: |
-        ninja -v -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-package
-        release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'`
-        mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} .
-    
-    - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0
-      with:
-        name: ${{ runner.os }}-${{ runner.arch }}-release-binary
-        # Due to path differences on Windows when running in bash vs running on node,
-        # we need to search for files in the current workspace.
-        path: |
-          ${{ needs.prepare.outputs.release-binary-filename }}
-
-    # Clean up some build files to reduce size of artifact.
-    - name: Clean Up Build Directory
-      shell: bash
-      run: |
-        find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname ${{ needs.prepare.outputs.release-binary-filename }} -delete
-        find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname _CPack_Packages -prune -exec rm -r {} +
-    
-    - name: Save Stage
-      uses: ./workflows-main/.github/workflows/release-binaries-save-stage
-      with:
-        build-prefix: ${{ steps.setup-stage.outputs.build-prefix }}
-
-  upload-release-binaries:
-    name: "Upload Release Binaries"
-    needs:
-      - prepare
-      - build-release-package
-    if: >-
-      github.event_name != 'pull_request' &&
-      needs.prepare.outputs.upload == 'true'
-    runs-on: ubuntu-24.04
-    permissions:
-      contents: write # For release uploads
-      id-token: write     # For artifact attestations
-      attestations: write # For artifact attestations
-
-    steps:
-    - name: Checkout Release Scripts
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        sparse-checkout: |
-          llvm/utils/release/github-upload-release.py
-          llvm/utils/git/requirements.txt
-        sparse-checkout-cone-mode: false
-
-    - name: 'Download artifact'
-      uses: actions/download-artifact@6b208ae046db98c579e8a3aa621ab581ff575935 # v4.1.1
-      with:
-        pattern: '*-release-binary'
-        merge-multiple: true
-
-    - name: Attest Build Provenance
-      id: provenance
-      uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
-      with:
-        subject-path: ${{ needs.prepare.outputs.release-binary-filename }}
-
-    - name: Rename attestation file
-      run:
-        mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
-
-    - name: Upload Build Provenance
-      uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
-      with:
-        name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation
-        path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl
-
-    - name: Install Python Requirements
-      run: |
-        pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-
-    - name: Upload Release
-      shell: bash
-      run: |
-        ./llvm/utils/release/github-upload-release.py \
-        --token ${{ github.token }} \
-        --release ${{ needs.prepare.outputs.release-version }} \
-        upload \
-        --files ${{ needs.prepare.outputs.release-binary-filename }}*
-
-  test-release:
-    name: "Test Release"
-    needs:
-      - prepare
-      - build-release-package
-    if: >-
-      github.repository_owner == 'llvm'
-    runs-on: ${{ needs.prepare.outputs.test-runs-on }}
-    steps:
-    - name: Checkout Actions
-      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      with:
-        ref: ${{ (github.event_name == 'pull_request' && github.sha) || 'main' }}
-        sparse-checkout: |
-          .github/workflows/
-        sparse-checkout-cone-mode: false
-        path: workflows
-    - name: Setup Stage
-      id: setup-stage
-      uses: ./workflows/.github/workflows/release-binaries-setup-stage
-      with:
-        previous-artifact: build-release-package
-
-    - name: Run Tests
-      shell: bash
-      run: |
-        ninja -C ${{ steps.setup-stage.outputs.build-prefix }}/build stage2-check-all
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
deleted file mode 100644
index 5a0aa063d32a..000000000000
--- a/.github/workflows/release-documentation.yml
+++ /dev/null
@@ -1,91 +0,0 @@
-name: Release Documentation
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload documentation'
-        required: false
-        type: boolean
-
-  workflow_call:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload documentation'
-        required: false
-        type: boolean
-
-jobs:
-  release-documentation:
-    name: Build and Upload Release Documentation
-    runs-on: ubuntu-24.04
-    env:
-      upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
-      - name: Setup Python env
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-        with:
-          cache: 'pip'
-          cache-dependency-path: './llvm/docs/requirements.txt'
-
-      - name: Install Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-              graphviz \
-              python3-github \
-              ninja-build \
-              texlive-font-utils
-          pip3 install --user -r ./llvm/docs/requirements.txt
-
-      - name: Build Documentation
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-        run: |
-          ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen
-
-      - name: Create Release Notes Artifact
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0
-        with:
-          name: release-notes
-          path: docs-build/html-export/
-
-      - name: Clone www-releases
-        if: env.upload
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          repository: ${{ github.repository_owner }}/www-releases
-          ref: main
-          fetch-depth: 0
-          path: www-releases
-          persist-credentials: false
-
-      - name: Upload Release Notes
-        if: env.upload
-        env:
-          GH_TOKEN: ${{ secrets.WWW_RELEASES_TOKEN }}
-        run: |
-          mkdir -p www-releases/${{ inputs.release-version }}
-          mv ./docs-build/html-export/* www-releases/${{ inputs.release-version }}
-          cd www-releases
-          git checkout -b ${{ inputs.release-version }}
-          git add ${{ inputs.release-version }}
-          git config user.email "llvmbot@llvm.org"
-          git config user.name "llvmbot"
-          git commit -a -m "Add ${{ inputs.release-version }} documentation"
-          git push --force  "https://$GH_TOKEN@github.com/llvmbot/www-releases.git" HEAD:refs/heads/${{ inputs.release-version }}
-          gh pr create -f -B main -H ${{ inputs.release-version }} -R llvmbot/www-releases
diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml
deleted file mode 100644
index d47c4337c07b..000000000000
--- a/.github/workflows/release-doxygen.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-name: Release Doxygen
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload documentation'
-        required: false
-        type: boolean
-
-  workflow_call:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-      upload:
-        description: 'Upload documentation'
-        required: false
-        type: boolean
-    secrets:
-      RELEASE_TASKS_USER_TOKEN:
-        description: "Secret used to check user permissions."
-        required: false
-
-jobs:
-  release-doxygen:
-    name: Build and Upload Release Doxygen
-    runs-on: ubuntu-24.04
-    permissions:
-      contents: write
-    env:
-      upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
-      - name: Setup Python env
-        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
-        with:
-          cache: 'pip'
-          cache-dependency-path: './llvm/docs/requirements.txt'
-
-      - name: Install Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y \
-              doxygen \
-              graphviz \
-              python3-github \
-              ninja-build \
-              texlive-font-utils
-          pip3 install --user -r ./llvm/docs/requirements.txt
-
-      - name: Build Doxygen
-        run: |
-          ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-sphinx
-
-      - name: Upload Doxygen
-        if: env.upload
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-          USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-        run: |
-          ./llvm/utils/release/github-upload-release.py --token "$GITHUB_TOKEN" --release "${{ inputs.release-version }}" --user "${{ github.actor }}" --user-token "$USER_TOKEN" upload --files ./*doxygen*.tar.xz
diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml
deleted file mode 100644
index 9adeffb74d52..000000000000
--- a/.github/workflows/release-lit.yml
+++ /dev/null
@@ -1,79 +0,0 @@
-name: Release Lit
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-
-  workflow_call:
-    inputs:
-      release-version:
-        description: 'Release Version'
-        required: true
-        type: string
-    secrets:
-      RELEASE_TASKS_USER_TOKEN:
-        description: "Secret used to check user permissions."
-        required: false
-
-jobs:
-  release-lit:
-    name: Release Lit
-    runs-on: ubuntu-24.04
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          ref: "llvmorg-${{ inputs.release-version }}"
-
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y python3-setuptools python3-psutil python3-github
-
-      - name: Check Permissions
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-          USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-        run: |
-          ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
-
-      - name: Setup Cpp
-        uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1
-        with:
-          compiler: llvm-16.0.6
-          cmake: true
-          ninja: true
-
-      - name: Test lit
-        run: |
-          mkdir build && cd build
-          export FILECHECK_OPTS='-dump-input-filter=all -vv -color'
-          cmake ../llvm -DCMAKE_BUILD_TYPE=Release -G Ninja
-          ninja -v -j $(nproc) check-lit
-
-      - name: Package lit
-        run: |
-          cd llvm/utils/lit
-          # Remove 'dev' suffix from lit version.
-          sed -i 's/ + "dev"//g' lit/__init__.py
-          python3 setup.py sdist bdist_wheel
-
-      - name: Upload lit to test.pypi.org
-        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
-        with:
-          password: ${{ secrets.LLVM_LIT_TEST_PYPI_API_TOKEN }}
-          repository-url: https://test.pypi.org/legacy/
-          packages-dir: llvm/utils/lit/dist/
-
-      - name: Upload lit to pypi.org
-        uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
-        with:
-          password: ${{ secrets.LLVM_LIT_PYPI_API_TOKEN }}
-          packages-dir: llvm/utils/lit/dist/
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
deleted file mode 100644
index 99438918b56f..000000000000
--- a/.github/workflows/release-sources.yml
+++ /dev/null
@@ -1,108 +0,0 @@
-name: Release Sources
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-    inputs:
-      release-version:
-        description: Release Version
-        required: true
-        type: string
-  workflow_call:
-    inputs:
-      release-version:
-        description: Release Version
-        required: true
-        type: string
-    secrets:
-      RELEASE_TASKS_USER_TOKEN:
-        description: "Secret used to check user permissions."
-        required: false
-  # Run on pull_requests for testing purposes.
-  pull_request:
-    paths:
-      - '.github/workflows/release-sources.yml'
-    types:
-      - opened
-      - synchronize
-      - reopened
-      # When a PR is closed, we still start this workflow, but then skip
-      # all the jobs, which makes it effectively a no-op.  The reason to
-      # do this is that it allows us to take advantage of concurrency groups
-      # to cancel in progress CI jobs whenever the PR is closed.
-      - closed
-
-concurrency:
-  group: ${{ github.workflow }}-${{ inputs.release-version || github.event.pull_request.number }}
-  cancel-in-progress: True
-
-jobs:
-  inputs:
-    name: Collect Job Inputs
-    if: >-
-      github.repository_owner == 'llvm' &&
-      github.event.action != 'closed'
-    outputs:
-      ref: ${{ steps.inputs.outputs.ref }}
-      export-args: ${{ steps.inputs.outputs.export-args }}
-    runs-on: ubuntu-24.04
-    steps:
-      - id: inputs
-        run: |
-          ref=${{ (inputs.release-version && format('llvmorg-{0}', inputs.release-version)) || github.sha }}
-          if [ -n "${{ inputs.release-version }}" ]; then
-            export_args="-release ${{ inputs.release-version }} -final"
-          else
-            export_args="-git-ref ${{ github.sha }}"
-          fi
-          echo "ref=$ref" >> $GITHUB_OUTPUT
-          echo "export-args=$export_args" >> $GITHUB_OUTPUT
-
-  release-sources:
-    name: Package Release Sources
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    needs:
-      - inputs
-    permissions:
-      id-token: write
-      attestations: write
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          ref: ${{ needs.inputs.outputs.ref }}
-          fetch-tags: true
-      - name: Install Dependencies
-        run: |
-          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-
-      - name: Check Permissions
-        if: github.event_name != 'pull_request'
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-          USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-        run: |
-          ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions
-      - name: Create Tarballs
-        run: |
-          ./llvm/utils/release/export.sh ${{ needs.inputs.outputs.export-args }}
-      - name: Attest Build Provenance
-        if: github.event_name != 'pull_request'
-        id: provenance
-        uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0
-        with:
-          subject-path: "*.xz"
-      - if: github.event_name != 'pull_request'
-        run: |
-          mv ${{ steps.provenance.outputs.bundle-path }} .
-      - name: Create Tarball Artifacts
-        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3
-        with:
-          path: |
-            *.xz
-            attestation.jsonl
-
-
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
deleted file mode 100644
index d55098345d89..000000000000
--- a/.github/workflows/release-tasks.yml
+++ /dev/null
@@ -1,113 +0,0 @@
-name: Release Task
-
-permissions:
-  contents: read
-
-on:
-  push:
-    tags:
-      # The regex support here is limited, so just match everything that starts with llvmorg- and filter later.
-      - 'llvmorg-*'
-
-jobs:
-  validate-tag:
-    name: Validate Tag
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    outputs:
-      release-version: ${{ steps.validate-tag.outputs.release-version }}
-    steps:
-      - name: Validate Tag
-        id: validate-tag
-        run: |
-          echo "${{ github.ref_name }}" | grep -e '^llvmorg-[0-9]\+\.[0-9]\+\.[0-9]\+\(-rc[0-9]\+\)\?$'
-          release_version=$(echo "${{ github.ref_name }}" | sed 's/llvmorg-//g')
-          echo "release-version=$release_version" >> "$GITHUB_OUTPUT"
-
-  release-create:
-    name: Create a New Release
-    runs-on: ubuntu-24.04
-    permissions:
-      contents: write # For creating the release.
-    needs: validate-tag
-
-    steps:
-      - name: Install Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install python3-github
-
-      - name: Checkout LLVM
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
-      - name: Create Release
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-          USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-        run: |
-          ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --release ${{ needs.validate-tag.outputs.release-version }} --user ${{ github.actor }} --user-token "$USER_TOKEN" create
-  release-documentation:
-    name: Build and Upload Release Documentation
-    needs:
-      - validate-tag
-    uses: ./.github/workflows/release-documentation.yml
-    with:
-      release-version: ${{ needs.validate-tag.outputs.release-version }}
-      upload: true
-
-  release-doxygen:
-    name: Build and Upload Release Doxygen
-    permissions:
-      contents: write
-    needs:
-      - validate-tag
-      - release-create
-    uses: ./.github/workflows/release-doxygen.yml
-    with:
-      release-version: ${{ needs.validate-tag.outputs.release-version }}
-      upload: true
-    # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use.
-    secrets:
-      RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-
-  release-lit:
-    name: Release Lit
-    needs: validate-tag
-    uses: ./.github/workflows/release-lit.yml
-    with:
-      release-version: ${{ needs.validate-tag.outputs.release-version }}
-    # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use.
-    secrets:
-      RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-
-  release-binaries:
-    name: Build Release Binaries
-    permissions:
-      contents: write
-      id-token: write
-      attestations: write
-    needs:
-      - validate-tag
-      - release-create
-    uses: ./.github/workflows/release-binaries-all.yml
-    with:
-      release-version: ${{ needs.validate-tag.outputs.release-version }}
-      upload: true
-    # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use.
-    secrets:
-      RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
-
-  release-sources:
-    name: Package Release Sources
-    permissions:
-      contents: read
-      id-token: write
-      attestations: write
-    needs:
-      - validate-tag
-    uses: ./.github/workflows/release-sources.yml
-    with:
-      release-version: ${{ needs.validate-tag.outputs.release-version }}
-    # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use.
-    secrets:
-      RELEASE_TASKS_USER_TOKEN: ${{ secrets.RELEASE_TASKS_USER_TOKEN }}
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
deleted file mode 100644
index 6cc80fb316c6..000000000000
--- a/.github/workflows/scorecard.yml
+++ /dev/null
@@ -1,62 +0,0 @@
-# This workflow uses actions that are not certified by GitHub. They are provided
-# by a third-party and are governed by separate terms of service, privacy
-# policy, and support documentation.
-
-# Check current LLVM-Project results here: https://securityscorecards.dev/viewer/?uri=github.com/llvm/llvm-project
-
-name: Scorecard supply-chain security
-on:
-  # For Branch-Protection check. Only the default branch is supported. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
-  branch_protection_rule:
-  # To guarantee Maintained check is occasionally updated. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
-  schedule:
-    - cron: '38 20 * * *'
-
-# Declare default permissions as read only.
-permissions:
-  contents: read
-
-jobs:
-  analysis:
-    name: Scorecard analysis
-    runs-on: ubuntu-24.04
-    if: github.repository == 'llvm/llvm-project'
-    permissions:
-      # Needed to upload the results to code-scanning dashboard.
-      security-events: write
-      # Needed to publish results and get a badge (see publish_results below).
-      id-token: write      
-
-    steps:
-      - name: "Checkout code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          persist-credentials: false
-
-      - name: "Run analysis"
-        uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
-        with:
-          results_file: results.sarif
-          results_format: sarif
-
-          #   - Publish results to OpenSSF REST API for easy access by consumers
-          #   - Allows the repository to include the Scorecard badge.
-          #   - See https://github.com/ossf/scorecard-action#publishing-results.      
-          publish_results: true
-
-      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
-      # format to the repository Actions tab.
-      - name: "Upload artifact"
-        uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
-        with:
-          name: SARIF file
-          path: results.sarif
-          retention-days: 5
-
-      # Upload the results to GitHub's code scanning dashboard.
-      - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@80f993039571a6de66594ecaa432875a6942e8e0 # v2.20.6
-        with:
-          sarif_file: results.sarif
diff --git a/.github/workflows/set-release-binary-outputs.sh b/.github/workflows/set-release-binary-outputs.sh
deleted file mode 100644
index 14d0798364e9..000000000000
--- a/.github/workflows/set-release-binary-outputs.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-# Usage: set-release-binary-outputs.sh <github_user> <tag> <upload>
-
-set -e
-
-if [ -z "$GITHUB_OUTPUT" ]; then
-  export GITHUB_OUTPUT=`mktemp`
-  echo "Warning: Environment variable GITHUB_OUTPUT is not set."
-  echo "Writing output variables to $GITHUB_OUTPUT"
-fi
-
-tag=$1
-upload=$2
-
-if echo $tag | grep -e '^[0-9a-f]\+$'; then
-  # This is a plain commit.
-  # TODO: Don't hardcode this.
-  release_version="18"
-  upload='false'
-  ref="$tag"
-
-else
-
-  pattern='^llvmorg-[0-9]\+\.[0-9]\+\.[0-9]\+\(-rc[0-9]\+\)\?$'
-  echo "$tag" | grep -e $pattern
-  if [ $? != 0 ]; then
-    echo "ERROR: Tag '$tag' doesn't match pattern: $pattern"
-    exit 1
-  fi
-  release_version=`echo "$tag" | sed 's/llvmorg-//g'`
-  release=`echo "$release_version" | sed 's/-.*//g'`
-fi
-echo "release-version=$release_version" >> $GITHUB_OUTPUT
-echo "upload=$upload" >> $GITHUB_OUTPUT
-echo "ref=$tag" >> $GITHUB_OUTPUT
diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml
deleted file mode 100644
index f15ca1cb64ba..000000000000
--- a/.github/workflows/spirv-tests.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: SPIR-V Tests
-
-permissions:
-  contents: read
-
-on:
-  workflow_dispatch:
-  pull_request:
-    paths:
-      - 'llvm/lib/Target/SPIRV/**'
-      - 'llvm/test/CodeGen/SPIRV/**'
-      - '.github/workflows/spirv-tests.yml'
-
-concurrency:
-  # Skip intermediate builds: always.
-  # Cancel intermediate builds: only if it is a pull request build.
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
-
-jobs:
-  check_spirv:
-    if: github.repository_owner == 'llvm'
-    name: Test SPIR-V
-    uses: ./.github/workflows/llvm-project-tests.yml
-    with:
-      build_target: check-llvm-codegen-spirv
-      projects:
-      extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="SPIRV" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON'
-      os_list: '["ubuntu-24.04"]'
diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml
deleted file mode 100644
index 9d8fb59a67c0..000000000000
--- a/.github/workflows/unprivileged-download-artifact/action.yml
+++ /dev/null
@@ -1,81 +0,0 @@
-name: Unprivileged Download Artifact
-description: >-
-  Download artifacts from another workflow run without using an access token.
-inputs:
-  run-id:
-    description: >-
-      The run-id for the workflow run that you want to download the artifact
-      from.  If ommitted it will download the most recently created artifact
-      from the repo with the artifact-name.
-    required: false
-  artifact-name:
-    desciption: The name of the artifact to download.
-    required: true
-
-
-outputs:
-  filename:
-    description: >-
-      The filename of the downloaded artifact or the empty string if the
-      artifact was not found.
-    value: ${{ steps.download-artifact.outputs.filename }}
-  artifact-id:
-    description: "The id of the artifact being downloaded."
-    value: ${{ steps.artifact-url.outputs.id }}
-
-
-runs:
-  using: "composite"
-  steps:
-    - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
-      id: artifact-url
-      with:
-        script: |
-          var response;
-          if (!"${{ inputs.run-id }}") {
-            response = await github.rest.actions.listArtifactsForRepo({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              name: "${{ inputs.artifact-name }}"
-            })
-          } else {
-            response = await github.rest.actions.listWorkflowRunArtifacts({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              run_id: "${{ inputs.run-id }}",
-              name: "${{ inputs.artifact-name }}"
-            })
-          }
-
-          console.log(response)
-
-          for (artifact of response.data.artifacts) {
-            console.log(artifact);
-          }
-
-          if (response.data.artifacts.length == 0) {
-            console.log("Could not find artifact ${{ inputs.artifact-name }} for workflow run ${{ inputs.run-id }}")
-            return;
-          }
-
-          const url_response = await github.rest.actions.downloadArtifact({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            artifact_id: response.data.artifacts[0].id,
-            archive_format: "zip"
-          })
-
-          core.setOutput("url", url_response.url);
-          core.setOutput("id", response.data.artifacts[0].id);
-
-    - shell: bash
-      if: steps.artifact-url.outputs.url != ''
-      id: download-artifact
-      run: |
-        curl -L -o ${{ inputs.artifact-name }}.zip "${{ steps.artifact-url.outputs.url }}"
-        echo "filename=${{ inputs.artifact-name }}.zip" >> $GITHUB_OUTPUT
-
-    - shell: bash
-      if: steps.download-artifact.outputs.filename != ''
-      run: |
-        unzip ${{ steps.download-artifact.outputs.filename }}
diff --git a/.github/workflows/version-check.py b/.github/workflows/version-check.py
deleted file mode 100755
index f75fd5030088..000000000000
--- a/.github/workflows/version-check.py
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/python3
-
-from git import Repo
-import re
-import sys
-
-
-def get_version_from_tag(tag):
-    m = re.match("llvmorg-([0-9]+)\.([0-9]+)\.([0-9]+)(-rc[0-9]+)?$", tag)
-    if m:
-        if m.lastindex == 4:
-            # We have an rc tag.
-            return m.group(1, 2, 3)
-        # We have a final release tag.
-        return (m.group(1), m.group(2), str(int(m.group(3)) + 1))
-
-    m = re.match("llvmorg-([0-9]+)-init", tag)
-    if m:
-        return (m.group(1), "1", "0")
-
-    raise Exception(f"error: Tag is not valid: {tag}")
-
-
-version = sys.argv[1]
-
-repo = Repo()
-
-tag = repo.git.describe(tags=True, abbrev=0)
-expected_version = ".".join(get_version_from_tag(tag))
-
-if version != expected_version:
-    print("error: Expected version", expected_version, "but found version", version)
-    sys.exit(1)
-
-print("Versions match:", version, expected_version)
-sys.exit(0)
diff --git a/.github/workflows/version-check.yml b/.github/workflows/version-check.yml
deleted file mode 100644
index a0a598094376..000000000000
--- a/.github/workflows/version-check.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-name: LLVM Project Version Check
-
-on:
-  push:
-    branches:
-      - 'release/**'
-  pull_request:
-    branches:
-      - 'release/**'
-
-permissions:
-  contents: read
-
-jobs:
-  version_check:
-    if: github.repository_owner == 'llvm'
-    runs-on: ubuntu-24.04
-    steps:
-      - name: Fetch LLVM sources
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: Install dependencies
-        run: |
-          pip install --require-hashes -r ./llvm/utils/git/requirements.txt
-
-      - name: Version Check
-        run: |
-          version=$(grep -o 'LLVM_VERSION_\(MAJOR\|MINOR\|PATCH\) [0-9]\+' cmake/Modules/LLVMVersion.cmake  | cut -d ' ' -f 2 | tr "\n" "." | sed 's/.$//g')
-          .github/workflows/version-check.py "$version"
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 9e00dbc9dc90..c5488c2378a1 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -315,7 +315,7 @@ struct Fragment {
     /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
     /// AngledHeaders), system headers use <> and non-system headers use "".
     /// These can match any suffix of the header file in question.
-    /// Matching is performed against the header text, not its absolute path
+    /// Matching is performed against the absolute path of the header
     /// within the project.
     std::vector<Located<std::string>> QuotedHeaders;
     /// List of regexes for headers that should always be included with a
@@ -323,7 +323,7 @@ struct Fragment {
     /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
     /// AngledHeaders), system headers use <> and non-system headers use "".
     /// These can match any suffix of the header file in question.
-    /// Matching is performed against the header text, not its absolute path
+    /// Matching is performed against the absolute path of the header
     /// within the project.
     std::vector<Located<std::string>> AngledHeaders;
   };
diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp
index 87fd261b906e..b9d67cc6a160 100644
--- a/clang-tools-extra/clangd/Headers.cpp
+++ b/clang-tools-extra/clangd/Headers.cpp
@@ -304,16 +304,17 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
   // FIXME: should we allow (some limited number of) "../header.h"?
   if (llvm::sys::path::is_absolute(Suggested))
     return std::nullopt;
+  auto HeaderPath = llvm::sys::path::convert_to_slash(InsertedHeader.File);
   bool IsAngled = false;
   for (auto &Filter : AngledHeaders) {
-    if (Filter(Suggested)) {
+    if (Filter(HeaderPath)) {
       IsAngled = true;
       break;
     }
   }
   bool IsQuoted = false;
   for (auto &Filter : QuotedHeaders) {
-    if (Filter(Suggested)) {
+    if (Filter(HeaderPath)) {
       IsQuoted = true;
       break;
     }
@@ -324,7 +325,7 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
     if (IsAngled && IsQuoted) {
       elog("Header '{0}' matches both quoted and angled regexes, default will "
            "be used.",
-           Suggested);
+           HeaderPath);
     }
     IsAngled = IsAngledByDefault;
   }
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index b7c64c7a0674..3c107504e625 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -938,7 +938,7 @@ TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) {
   {
     Config C;
     C.Style.AngledHeaders.push_back(
-        [](auto header) { return header == "bar.h"; });
+        [](auto header) { return header.contains("bar.h"); });
     WithContextValue WithCfg(Config::Key, std::move(C));
     Results = completions(TU, Test.point(), {Sym});
     EXPECT_THAT(Results.Completions,
@@ -947,7 +947,7 @@ TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) {
   {
     Config C;
     C.Style.QuotedHeaders.push_back(
-        [](auto header) { return header == "bar.h"; });
+        [](auto header) { return header.contains("bar.h"); });
     WithContextValue WithCfg(Config::Key, std::move(C));
     Results = completions(TU, Test.point(), {Sym});
     EXPECT_THAT(Results.Completions,
diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
index 751383e3b465..440582e14239 100644
--- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
@@ -344,6 +344,17 @@ TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketed) {
   EXPECT_EQ(calculate(BarHeader), "<sub/bar.h>");
 }
 
+TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketedFilterByFullPath) {
+  // The filter receives the full path of the header, so it is able to filter by
+  // the parent directory, even if it is part of the include search path.
+  AngledHeaders.push_back([](auto Path) {
+    llvm::Regex Pattern("sub/.*");
+    return Pattern.match(Path);
+  });
+  std::string BarHeader = testPath("sub/bar.h");
+  EXPECT_EQ(calculate(BarHeader), "<bar.h>");
+}
+
 TEST_F(HeadersTest, ShortenedIncludeNotInSearchPath) {
   std::string BarHeader =
       llvm::sys::path::convert_to_slash(testPath("sub-2/bar.h"));
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index cf97ab708247..b7aa55a2e20d 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -46,14 +46,18 @@ Major New Features
 Improvements to clangd
 ----------------------
 
-Inlay hints
-^^^^^^^^^^^
+Language feature support
+^^^^^^^^^^^^^^^^^^^^^^^^
 
-Diagnostics
-^^^^^^^^^^^
+- Performance improvements and bugfixes to C++20 Modules support
+- Improved support for C++23 "deducing this"
+- Improvements to Objective-C++ support
 
-Semantic Highlighting
-^^^^^^^^^^^^^^^^^^^^^
+New Language Server Protocol features
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Added support for ``textDocument/rangesFormatting``
+- Added support for ``positionEncoding``
 
 Compile flags
 ^^^^^^^^^^^^^
@@ -64,24 +68,58 @@ Compile flags
 Hover
 ^^^^^
 
+- Fixed a bug that would sometimes prevent documentation comments of standard library functions
+  from being shown
+
 Code completion
 ^^^^^^^^^^^^^^^
 
-Code actions
-^^^^^^^^^^^^
-
-Signature help
-^^^^^^^^^^^^^^
+- Added `HeaderInsertion` config option to control whether code completion inserts a missing
+  header needed for the symbol being completed. This is equivalent to the `--header-insertion`
+  command-line option.
+- Added a `CodePatterns` config option to control whether code completion should offer code
+  patterns as completions in addition to symbols.
 
 Cross-references
 ^^^^^^^^^^^^^^^^
 
-Objective-C
+- References to symbols are now collected in array designators
+- Find-references now works for operators new and delete
+- Improvements to code navigation in templated code
+
+Call hierarchy
+^^^^^^^^^^^^^^
+
+- Call hierarchy now works with the remote index
+- Fixed a bug where call hierarchy could sometimes return bogus results
+
+Inlay hints
 ^^^^^^^^^^^
 
+- Parameter hint forwarding now works for variadic forwarding functions declared in header files
+- Improved presentation of block-end hints
+
+Code actions
+^^^^^^^^^^^^
+
+- Improved the rename refactor's name collision checking logic
+
+Clang-tidy integration
+^^^^^^^^^^^^^^^^^^^^^^
+
+- Disabled the cppcoreguidelines-macro-to-enum checker which is incompatible with clangd
+
+Include-cleaner integration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Clangd now respects the `AngledHeaders` and `QuotedHeaders` config options for headers
+  inserted to resolve include-cleaner diagnostics
+
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Various crash fixes and other stability improvements
+
 Improvements to clang-doc
 -------------------------
 
diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
index 2888e2522675..057b92c14704 100644
--- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
+++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
@@ -136,7 +136,7 @@ struct Header {
   }
   StringRef verbatim() const { return std::get<Verbatim>(Storage); }
 
-  /// For phiscal files, either absolute path or path relative to the execution
+  /// For physical files, either absolute path or path relative to the execution
   /// root. Otherwise just the spelling without surrounding quotes/brackets.
   llvm::StringRef resolvedPath() const;
 
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index f4c309f1b35c..1bb73599970c 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -536,6 +536,7 @@ if( CLANG_INCLUDE_TESTS )
     clang_unit_site_config=${CMAKE_CURRENT_BINARY_DIR}/test/Unit/lit.site.cfg
   )
   add_subdirectory(test)
+  add_subdirectory(bindings/python/tests)
 
   if(CLANG_BUILT_STANDALONE)
     umbrella_lit_testsuite_end(check-all)
diff --git a/clang/bindings/python/tests/CMakeLists.txt b/clang/bindings/python/tests/CMakeLists.txt
new file mode 100644
index 000000000000..a0ddabc21bb4
--- /dev/null
+++ b/clang/bindings/python/tests/CMakeLists.txt
@@ -0,0 +1,66 @@
+# Test target to run Python test suite from main build.
+
+# Avoid configurations including '-include' from interfering with
+# our tests by setting CLANG_NO_DEFAULT_CONFIG.
+add_custom_target(check-clang-python
+    COMMAND ${CMAKE_COMMAND} -E env
+            CLANG_NO_DEFAULT_CONFIG=1
+            CLANG_LIBRARY_PATH=$<TARGET_FILE_DIR:libclang>
+            "${Python3_EXECUTABLE}" -m unittest discover
+    DEPENDS libclang
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(RUN_PYTHON_TESTS TRUE)
+set_target_properties(check-clang-python PROPERTIES FOLDER "Clang/Tests")
+
+# Tests require libclang.so which is only built with LLVM_ENABLE_PIC=ON
+if(NOT LLVM_ENABLE_PIC)
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Do not try to run if libclang was built with sanitizers because
+# the sanitizer library will likely be loaded too late to perform
+# interception and will then fail.
+# We could use LD_PRELOAD/DYLD_INSERT_LIBRARIES but this isn't
+# portable so it's easier just to not run the tests when building
+# with ASan.
+if(NOT LLVM_USE_SANITIZER STREQUAL "")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Tests fail on Windows, and need someone knowledgeable to fix.
+# It's not clear whether it's a test or a valid binding problem.
+if(WIN32)
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# The Python FFI interface is broken on AIX: https://bugs.python.org/issue38628.
+if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# AArch64, Hexagon, and Sparc have known test failures that need to be
+# addressed.
+# SystemZ has broken Python/FFI interface:
+# https://reviews.llvm.org/D52840#1265716
+if(${LLVM_NATIVE_ARCH} MATCHES "^(AArch64|Hexagon|Sparc|SystemZ)$")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Tests will fail if cross-compiling for a different target, as tests will try
+# to use the host Python3_EXECUTABLE and make FFI calls to functions in target
+# libraries.
+if(CMAKE_CROSSCOMPILING)
+  # FIXME: Consider a solution that allows better control over these tests in
+  # a crosscompiling scenario. e.g. registering them with lit to allow them to
+  # be explicitly skipped via appropriate LIT_ARGS, or adding a mechanism to
+  # allow specifying a python interpreter compiled for the target that could
+  # be executed using qemu-user.
+  message(WARNING "check-clang-python not added to check-all as these tests fail in a cross-build setup")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+if(RUN_PYTHON_TESTS)
+    set_property(GLOBAL APPEND PROPERTY
+                 LLVM_ALL_ADDITIONAL_TEST_TARGETS check-clang-python)
+endif()
diff --git a/clang/test/bindings/python/tests/__init__.py b/clang/bindings/python/tests/__init__.py
similarity index 100%
rename from clang/test/bindings/python/tests/__init__.py
rename to clang/bindings/python/tests/__init__.py
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/a.inc b/clang/bindings/python/tests/cindex/INPUTS/a.inc
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/a.inc
rename to clang/bindings/python/tests/cindex/INPUTS/a.inc
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/b.inc b/clang/bindings/python/tests/cindex/INPUTS/b.inc
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/b.inc
rename to clang/bindings/python/tests/cindex/INPUTS/b.inc
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/compile_commands.json b/clang/bindings/python/tests/cindex/INPUTS/compile_commands.json
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/compile_commands.json
rename to clang/bindings/python/tests/cindex/INPUTS/compile_commands.json
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header1.h b/clang/bindings/python/tests/cindex/INPUTS/header1.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header1.h
rename to clang/bindings/python/tests/cindex/INPUTS/header1.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header2.h b/clang/bindings/python/tests/cindex/INPUTS/header2.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header2.h
rename to clang/bindings/python/tests/cindex/INPUTS/header2.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header3.h b/clang/bindings/python/tests/cindex/INPUTS/header3.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header3.h
rename to clang/bindings/python/tests/cindex/INPUTS/header3.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/hello.cpp b/clang/bindings/python/tests/cindex/INPUTS/hello.cpp
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/hello.cpp
rename to clang/bindings/python/tests/cindex/INPUTS/hello.cpp
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/include.cpp b/clang/bindings/python/tests/cindex/INPUTS/include.cpp
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/include.cpp
rename to clang/bindings/python/tests/cindex/INPUTS/include.cpp
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/parse_arguments.c b/clang/bindings/python/tests/cindex/INPUTS/parse_arguments.c
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/parse_arguments.c
rename to clang/bindings/python/tests/cindex/INPUTS/parse_arguments.c
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/testfile.c b/clang/bindings/python/tests/cindex/INPUTS/testfile.c
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/testfile.c
rename to clang/bindings/python/tests/cindex/INPUTS/testfile.c
diff --git a/clang/test/bindings/python/tests/cindex/__init__.py b/clang/bindings/python/tests/cindex/__init__.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/__init__.py
rename to clang/bindings/python/tests/cindex/__init__.py
diff --git a/clang/test/bindings/python/tests/cindex/test_access_specifiers.py b/clang/bindings/python/tests/cindex/test_access_specifiers.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_access_specifiers.py
rename to clang/bindings/python/tests/cindex/test_access_specifiers.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cdb.py b/clang/bindings/python/tests/cindex/test_cdb.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cdb.py
rename to clang/bindings/python/tests/cindex/test_cdb.py
diff --git a/clang/test/bindings/python/tests/cindex/test_code_completion.py b/clang/bindings/python/tests/cindex/test_code_completion.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_code_completion.py
rename to clang/bindings/python/tests/cindex/test_code_completion.py
diff --git a/clang/test/bindings/python/tests/cindex/test_comment.py b/clang/bindings/python/tests/cindex/test_comment.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_comment.py
rename to clang/bindings/python/tests/cindex/test_comment.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cursor.py
rename to clang/bindings/python/tests/cindex/test_cursor.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cursor_kind.py b/clang/bindings/python/tests/cindex/test_cursor_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cursor_kind.py
rename to clang/bindings/python/tests/cindex/test_cursor_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_diagnostics.py b/clang/bindings/python/tests/cindex/test_diagnostics.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_diagnostics.py
rename to clang/bindings/python/tests/cindex/test_diagnostics.py
diff --git a/clang/test/bindings/python/tests/cindex/test_enums.py b/clang/bindings/python/tests/cindex/test_enums.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_enums.py
rename to clang/bindings/python/tests/cindex/test_enums.py
diff --git a/clang/test/bindings/python/tests/cindex/test_exception_specification_kind.py b/clang/bindings/python/tests/cindex/test_exception_specification_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_exception_specification_kind.py
rename to clang/bindings/python/tests/cindex/test_exception_specification_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_file.py b/clang/bindings/python/tests/cindex/test_file.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_file.py
rename to clang/bindings/python/tests/cindex/test_file.py
diff --git a/clang/test/bindings/python/tests/cindex/test_index.py b/clang/bindings/python/tests/cindex/test_index.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_index.py
rename to clang/bindings/python/tests/cindex/test_index.py
diff --git a/clang/test/bindings/python/tests/cindex/test_lib.py b/clang/bindings/python/tests/cindex/test_lib.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_lib.py
rename to clang/bindings/python/tests/cindex/test_lib.py
diff --git a/clang/test/bindings/python/tests/cindex/test_linkage.py b/clang/bindings/python/tests/cindex/test_linkage.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_linkage.py
rename to clang/bindings/python/tests/cindex/test_linkage.py
diff --git a/clang/test/bindings/python/tests/cindex/test_location.py b/clang/bindings/python/tests/cindex/test_location.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_location.py
rename to clang/bindings/python/tests/cindex/test_location.py
diff --git a/clang/test/bindings/python/tests/cindex/test_rewrite.py b/clang/bindings/python/tests/cindex/test_rewrite.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_rewrite.py
rename to clang/bindings/python/tests/cindex/test_rewrite.py
diff --git a/clang/test/bindings/python/tests/cindex/test_source_range.py b/clang/bindings/python/tests/cindex/test_source_range.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_source_range.py
rename to clang/bindings/python/tests/cindex/test_source_range.py
diff --git a/clang/test/bindings/python/tests/cindex/test_tls_kind.py b/clang/bindings/python/tests/cindex/test_tls_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_tls_kind.py
rename to clang/bindings/python/tests/cindex/test_tls_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_token_kind.py b/clang/bindings/python/tests/cindex/test_token_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_token_kind.py
rename to clang/bindings/python/tests/cindex/test_token_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_tokens.py b/clang/bindings/python/tests/cindex/test_tokens.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_tokens.py
rename to clang/bindings/python/tests/cindex/test_tokens.py
diff --git a/clang/test/bindings/python/tests/cindex/test_translation_unit.py b/clang/bindings/python/tests/cindex/test_translation_unit.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_translation_unit.py
rename to clang/bindings/python/tests/cindex/test_translation_unit.py
diff --git a/clang/test/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_type.py
rename to clang/bindings/python/tests/cindex/test_type.py
diff --git a/clang/test/bindings/python/tests/cindex/util.py b/clang/bindings/python/tests/cindex/util.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/util.py
rename to clang/bindings/python/tests/cindex/util.py
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 0e21ef0244f7..cdeaf3b2509c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6390,6 +6390,14 @@ the configuration (without a prefix: ``Auto``).
        IF (...)                        vs.    IF(...)
          <conditional-body>                     <conditional-body>
 
+  * ``bool AfterNot`` If ``true``, put a space between alternative operator ``not`` and the
+    opening parenthesis.
+
+    .. code-block:: c++
+
+       true:                                  false:
+       return not (a || b);            vs.    return not(a || b);
+
   * ``bool AfterOverloadedOperator`` If ``true``, put a space between operator overloading and opening
     parentheses.
 
diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst
index 913291c95444..96eb498bc48b 100644
--- a/clang/docs/PointerAuthentication.rst
+++ b/clang/docs/PointerAuthentication.rst
@@ -47,16 +47,16 @@ This document serves four purposes:
 - It documents several language extensions that are useful on targets using
   pointer authentication.
 
-- It will eventually present a theory of operation for the security mitigation,
-  describing the basic requirements for correctness, various weaknesses in the
-  mechanism, and ways in which programmers can strengthen its protections
-  (including recommendations for language implementors).
+- It presents a theory of operation for the security mitigation, describing the
+  basic requirements for correctness, various weaknesses in the mechanism, and
+  ways in which programmers can strengthen its protections (including
+  recommendations for language implementors).
 
-- It will eventually document the language ABIs currently used for C, C++,
-  Objective-C, and Swift on arm64e, although these are not yet stable on any
-  target.
+- It documents the stable ABI of the C, C++, and Objective-C languages on arm64e
+  platforms.
 
-Basic Concepts
+
+Basic concepts
 --------------
 
 The simple address of an object or function is a **raw pointer**.  A raw
@@ -125,7 +125,7 @@ independently for I and D keys.)
   interfaces or as primitives in a compiler IR because they expose raw
   pointers.  Raw pointers require special attention in the language
   implementation to avoid the accidental creation of exploitable code
-  sequences.
+  sequences; see the section on `Attackable code sequences`_.
 
 The following details are all implementation-defined:
 
@@ -167,10 +167,15 @@ a cryptographic signature, other implementations may be possible.  See
     signing key, and stores it in the high bits as the signature. ``auth``
     removes the signature, computes the same hash, and compares the result with
     the stored signature.  ``strip`` removes the signature without
-    authenticating it.  While ``aut*`` instructions do not themselves trap on
-    failure in Armv8.3 PAuth, they do with the later optional FPAC extension.
-    An implementation can also choose to emulate this trapping behavior by
-    emitting additional instructions around ``aut*``.
+    authenticating it.  The ``aut`` instructions in the baseline Armv8.3 PAuth
+    feature do not guarantee to trap on authentication failure; instead, they
+    simply corrupt the pointer so that later uses will likely trap. Unless the
+    "later use" follows immediately and cannot be recovered from (e.g. with a
+    signal handler), this does not provide adequate protection against
+    `authentication oracles`_, so implementations must emit additional
+    instructions to force an immediate trap. This is unnecessary if the
+    processor provides the optional ``FPAC`` extension, which guarantees an
+    immediate trap.
 
   - ``sign_generic`` corresponds to the ``pacga`` instruction, which takes two
     64-bit values and produces a 64-bit cryptographic hash. Implementations of
@@ -234,7 +239,7 @@ implementation-defined.
 
 .. _Signing schemas:
 
-Signing Schemas
+Signing schemas
 ~~~~~~~~~~~~~~~
 
 Correct use of pointer authentication requires the signing code and the
@@ -255,33 +260,172 @@ signing schema breaks down even more simply:
 It is important that the signing schema be independently derived at all signing
 and authentication sites.  Preferably, the schema should be hard-coded
 everywhere it is needed, but at the very least, it must not be derived by
-inspecting information stored along with the pointer.
+inspecting information stored along with the pointer.  See the section on
+`Attacks on pointer authentication`_ for more information.
 
-Language Features
------------------
 
-There is currently one main pointer authentication language feature:
+Language features
+-----------------
 
-- The language provides the ``<ptrauth.h>`` intrinsic interface for manually
-  signing and authenticating pointers in code.  These can be used in
+There are three levels of the pointer authentication language feature:
+
+- The language implementation automatically signs and authenticates function
+  pointers (and certain data pointers) across a variety of standard situations,
+  including return addresses, function pointers, and C++ virtual functions. The
+  intent is for all pointers to code in program memory to be signed in some way
+  and for all branches to code in program text to authenticate those
+  signatures. In addition to the code pointers themselves, we also use pointer
+  authentication to protect data values that directly or indirectly influence
+  control flow or program integrity, or can provide attackers with some other
+  powerful program compromise.
+
+- The language also provides extensions to override the default rules used by
+  the language implementation.  For example, the ``__ptrauth`` type qualifier
+  can be used to change how pointers or pointer sized integers are signed when
+  they are stored in a particular variable or field; this provides much stronger
+  protection than is guaranteed by the default rules for C function and data
+  pointers.
+
+- Finally, the language provides the ``<ptrauth.h>`` intrinsic interface for
+  manually signing and authenticating pointers in code.  These can be used in
   circumstances where very specific behavior is required.
 
+Language implementation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For the most part, pointer authentication is an unobserved detail of the
+implementation of the programming language.  Any element of the language
+implementation that would perform an indirect branch to a pointer is implicitly
+altered so that the pointer is signed when first constructed and authenticated
+when the branch is performed.  This includes:
+
+- indirect-call features in the programming language, such as C function
+  pointers, C++ virtual functions, C++ member function pointers, the "blocks"
+  C extension, and so on;
+
+- returning from a function, no matter how it is called; and
+
+- indirect calls introduced by the implementation, such as branches through the
+  global offset table (GOT) used to implement direct calls to functions defined
+  outside of the current shared object.
+
+For more information about this, see the `Language ABI`_ section.
+
+However, some aspects of the implementation are observable by the programmer or
+otherwise require special notice.
+
+C data pointers
+^^^^^^^^^^^^^^^
+
+The current implementation in Clang does not sign pointers to ordinary data by
+default. For a partial explanation of the reasoning behind this, see the
+`Theory of Operation`_ section.
+
+A specific data pointer which is more security-sensitive than most can be
+signed using the `__ptrauth qualifier`_ or using the ``<ptrauth.h>``
+intrinsics.
+
+C function pointers
+^^^^^^^^^^^^^^^^^^^
+
+The C standard imposes restrictions on the representation and semantics of
+function pointer types which make it difficult to achieve satisfactory
+signature diversity in the default language rules.  See `Attacks on pointer
+authentication`_ for more information about signature diversity.  Programmers
+should strongly consider using the ``__ptrauth`` qualifier to improve the
+protections for important function pointers, such as the components of
+a hand-rolled "v-table"; see the section on the `__ptrauth qualifier`_ for
+details.
+
+The value of a pointer to a C function includes a signature, even when the
+value is cast to a non-function-pointer type like ``void*`` or ``intptr_t``. On
+implementations that use high bits to store the signature, this means that
+relational comparisons and hashes will vary according to the exact signature
+value, which is likely to change between executions of a program.  In some
+implementations, it may also vary based on the exact function pointer type.
+
+Null pointers
+^^^^^^^^^^^^^
+
+In principle, an implementation could derive the signed null pointer value
+simply by applying the standard signing algorithm to the raw null pointer
+value. However, for likely signing algorithms, this would mean that the signed
+null pointer value would no longer be statically known, which would have many
+negative consequences.  For one, it would become substantially more expensive
+to emit null pointer values or to perform null-pointer checks.  For another,
+the pervasive (even if technically unportable) assumption that null pointers
+are bitwise zero would be invalidated, making it substantially more difficult
+to adopt pointer authentication, as well as weakening common optimizations for
+zero-initialized memory such as the use of ``.bss`` sections.  Therefore it is
+beneficial to treat null pointers specially by giving them their usual
+representation.  On AArch64, this requires additional code when working with
+possibly-null pointers, such as when copying a pointer field that has been
+signed with address diversity.
+
+While this representation of nulls is the safest option for the general case,
+there are some situations in which a null pointer may have important semantic
+or security impact. For that purpose Clang has the concept of a pointer
+authentication schema that signs and authenticates null values.
+
+Return addresses
+^^^^^^^^^^^^^^^^
+
+The current implementation in Clang implicitly signs the return addresses in
+function calls.  While the value of the return address is technically an
+implementation detail of a function, there are some important libraries and
+development tools which rely on manually walking the chain of stack frames.
+These tools must be updated to correctly account for pointer authentication,
+either by stripping signatures (if security is not important for the tool, e.g.
+if it is capturing a stack trace during a crash) or properly authenticating
+them.  More information about how these values are signed is available in the
+`Language ABI`_ section.
+
+C++ virtual functions
+^^^^^^^^^^^^^^^^^^^^^
+
+The current implementation in Clang signs virtual function pointers with
+a discriminator derived from the full signature of the overridden method,
+including the method name and parameter types.  It is possible to write C++
+code that relies on v-table layout remaining constant despite changes to
+a method signature; for example, a parameter might be a ``typedef`` that
+resolves to a different type based on a build setting.  Such code violates
+C++'s One Definition Rule (ODR), but that violation is not normally detected;
+however, pointer authentication will detect it.
 
-Language Extensions
+Language extensions
 ~~~~~~~~~~~~~~~~~~~
 
-Feature Testing
+Feature testing
 ^^^^^^^^^^^^^^^
 
 Whether the current target uses pointer authentication can be tested for with
 a number of different tests.
 
-- ``__has_feature(ptrauth_intrinsics)`` is true if ``<ptrauth.h>`` provides its
-  normal interface.  This may be true even on targets where pointer
-  authentication is not enabled by default.
+- The ``__PTRAUTH__`` macro is defined if ``<ptrauth.h>`` provides its normal
+  interface. This implies support for the pointer authentication intrinsics
+  and the ``__ptrauth`` qualifier.
 
-__ptrauth Qualifier
-^^^^^^^^^^^^^^^^^^^
+- ``__has_feature(ptrauth_returns)`` is true if the target uses pointer
+  authentication to protect return addresses.
+
+- ``__has_feature(ptrauth_calls)`` is true if the target uses pointer
+  authentication to protect indirect branches.  On arm64e this implies
+  ``__has_feature(ptrauth_returns)``, ``__has_feature(ptrauth_intrinsics)``,
+  and the ``__PTRAUTH__`` macro.
+
+- For backwards compatibility purposes ``__has_feature(ptrauth_intrinsics)``
+  and ``__has_feature(ptrauth_qualifier)`` are available on arm64e targets.
+  These features are synonymous with each other, and are equivalent to testing
+  for the ``__PTRAUTH__`` macro definition. Use of these features should be
+  restricted to cases where backwards compatibility is required, and should be
+  paired with ``defined(__PTRAUTH__)``.
+
+
+Clang provides several other tests only for historical purposes; for current
+purposes they are all equivalent to ``ptrauth_calls``.
+
+``__ptrauth`` qualifier
+^^^^^^^^^^^^^^^^^^^^^^^
 
 ``__ptrauth(key, address, discriminator)`` is an extended type
 qualifier which causes so-qualified objects to hold pointers or pointer sized
@@ -293,6 +437,11 @@ type, either to a function or to an object, or a pointer sized integer.  It
 currently cannot be an Objective-C pointer type, a C++ reference type, or a
 block pointer type; these restrictions may be lifted in the future.
 
+The current implementation in Clang is known to not provide adequate safety
+guarantees against the creation of `signing oracles`_ when assigning data
+pointers to ``__ptrauth``-qualified gl-values.  See the section on `safe
+derivation`_ for more information.
+
 The qualifier's operands are as follows:
 
 - ``key`` - an expression evaluating to a key value from ``<ptrauth.h>``; must
@@ -327,6 +476,57 @@ a discriminator determined as follows:
   is ``ptrauth_blend_discriminator(&x, discriminator)``; see
   `ptrauth_blend_discriminator`_.
 
+Non-triviality from address diversity
++++++++++++++++++++++++++++++++++++++
+
+Address diversity must impose additional restrictions in order to allow the
+implementation to correctly copy values.  In C++, a type qualified with address
+diversity is treated like a class type with non-trivial copy/move constructors
+and assignment operators, with the usual effect on containing classes and
+unions.  C does not have a standard concept of non-triviality, and so we must
+describe the basic rules here, with the intention of imitating the emergent
+rules of C++:
+
+- A type may be **non-trivial to copy**.
+
+- A type may also be **illegal to copy**. Types that are illegal to copy are
+  always non-trivial to copy.
+
+- A type may also be **address-sensitive**. This includes types that use self
+  referencing pointers, data protected by address diversified pointer
+  authentication, or other similar concepts.
+
+- A type qualified with a ``__ptrauth`` qualifier or implicit authentication
+  schema that requires address diversity is non-trivial to copy and
+  address-sensitive.
+
+- An array type is illegal to copy, non-trivial to copy, or address-sensitive
+  if its element type is illegal to copy, non-trivial to copy, or
+  address-sensitive, respectively.
+
+- A struct type is illegal to copy, non-trivial to copy, or address-sensitive
+  if it has a field whose type is illegal to copy, non-trivial to copy, or
+  address-sensitive, respectively.
+
+- A union type is both illegal and non-trivial to copy if it has a field whose
+  type is non-trivial or illegal to copy.
+
+- A union type is address-sensitive if it has a field whose type is
+  address-sensitive.
+
+- A program is ill-formed if it uses a type that is illegal to copy as
+  a function parameter, argument, or return type.
+
+- A program is ill-formed if an expression requires a type to be copied that is
+  illegal to copy.
+
+- Otherwise, copying a type that is non-trivial to copy correctly copies its
+  subobjects.
+
+- Types that are address-sensitive must always be passed and returned
+  indirectly. Thus, changing the address-sensitivity of a type may be
+  ABI-breaking even if its size and alignment do not change.
+
 ``<ptrauth.h>``
 ~~~~~~~~~~~~~~~
 
@@ -433,7 +633,7 @@ Produce a signed pointer for the given raw pointer without applying any
 authentication or extra treatment.  This operation is not required to have the
 same behavior on a null pointer that the language implementation would.
 
-This is a treacherous operation that can easily result in signing oracles.
+This is a treacherous operation that can easily result in `signing oracles`_.
 Programs should use it seldom and carefully.
 
 ``ptrauth_auth_and_resign``
@@ -454,7 +654,29 @@ a null pointer that the language implementation would.
 The code sequence produced for this operation must not be directly attackable.
 However, if the discriminator values are not constant integers, their
 computations may still be attackable.  In the future, Clang should be enhanced
-to guaranteed non-attackability if these expressions are safely-derived.
+to guarantee non-attackability if these expressions are
+:ref:`safely-derived<Safe derivation>`.
+
+``ptrauth_auth_function``
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_function(pointer, key, discriminator)
+
+Authenticate that ``pointer`` is signed with ``key`` and ``discriminator`` and
+re-sign it to the standard schema for a function pointer of its type.
+
+``pointer`` must have function pointer type.  The result will have the same
+type as ``pointer``.  This operation is not required to have the same behavior
+on a null pointer that the language implementation would.
+
+This operation makes the same attackability guarantees as
+``ptrauth_auth_and_resign``.
+
+If this operation appears syntactically as the function operand of a call,
+Clang guarantees that the call will directly authenticate the function value
+using the given schema rather than re-signing to the standard schema.
 
 ``ptrauth_auth_data``
 ^^^^^^^^^^^^^^^^^^^^^
@@ -500,12 +722,921 @@ type.  Implementations are not required to make all bits of the result equally
 significant; in particular, some implementations are known to not leave
 meaningful data in the low bits.
 
+Standard ``__ptrauth`` qualifiers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``<ptrauth.h>`` additionally provides several macros which expand to
+``__ptrauth`` qualifiers for common ABI situations.
+
+For convenience, these macros expand to nothing when pointer authentication is
+disabled.
+
+These macros can be found in the header; some details of these macros may be
+unstable or implementation-specific.
+
+
+Theory of operation
+-------------------
+
+The threat model of pointer authentication is as follows:
+
+- The attacker has the ability to read and write to a certain range of
+  addresses, possibly the entire address space.  However, they are constrained
+  by the normal rules of the process: for example, they cannot write to memory
+  that is mapped read-only, and if they access unmapped memory it will trigger
+  a trap.
+
+- The attacker has no ability to add arbitrary executable code to the program.
+  For example, the program does not include malicious code to begin with, and
+  the attacker cannot alter existing instructions, load a malicious shared
+  library, or remap writable pages as executable.  If the attacker wants to get
+  the process to perform a specific sequence of actions, they must somehow
+  subvert the normal control flow of the process.
+
+In both of the above paragraphs, it is merely assumed that the attacker's
+*current* capabilities are restricted; that is, their current exploit does not
+directly give them the power to do these things.  The attacker's immediate goal
+may well be to leverage their exploit to gain these capabilities, e.g. to load
+a malicious dynamic library into the process, even though the process does not
+directly contain code to do so.
+
+Note that any bug that fits the above threat model can be immediately exploited
+as a denial-of-service attack by simply performing an illegal access and
+crashing the program.  Pointer authentication cannot protect against this.
+While denial-of-service attacks are unfortunate, they are also unquestionably
+the best possible result of a bug this severe. Therefore, pointer authentication
+enthusiastically embraces the idea of halting the program on a pointer
+authentication failure rather than continuing in a possibly-compromised state.
+
+Pointer authentication is a form of control-flow integrity (CFI) enforcement.
+The basic security hypothesis behind CFI enforcement is that many bugs can only
+be usefully exploited (other than as a denial-of-service) by leveraging them to
+subvert the control flow of the program.  If this is true, then by inhibiting or
+limiting that subversion, it may be possible to largely mitigate the security
+consequences of those bugs by rendering them impractical (or, ideally,
+impossible) to exploit.
+
+Every indirect branch in a program has a purpose.  Using human intelligence, a
+programmer can describe where a particular branch *should* go according to this
+purpose: a ``return`` in ``printf`` should return to the call site, a particular
+call in ``qsort`` should call the comparator that was passed in as an argument,
+and so on.  But for CFI to enforce that every branch in a program goes where it
+*should* in this sense would require CFI to perfectly enforce every semantic
+rule of the program's abstract machine; that is, it would require making the
+programming environment perfectly sound.  That is out of scope.  Instead, the
+goal of CFI is merely to catch attempts to make a branch go somewhere that it
+obviously *shouldn't* for its purpose: for example, to stop a call from
+branching into the middle of a function rather than its beginning.  As the
+information available to CFI gets better about the purpose of the branch, CFI
+can enforce tighter and tighter restrictions on where the branch is permitted to
+go.  Still, ultimately CFI cannot make the program sound.  This may help explain
+why pointer authentication makes some of the choices it does: for example, to
+sign and authenticate mostly code pointers rather than every pointer in the
+program.  Preventing attackers from redirecting branches is both particularly
+important and particularly approachable as a goal.  Detecting corruption more
+broadly is infeasible with these techniques, and the attempt would have far
+higher cost.
+
+Attacks on pointer authentication
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pointer authentication works as follows.  Every indirect branch in a program has
+a purpose.  For every purpose, the implementation chooses a
+:ref:`signing schema<Signing schemas>`.  At some place where a pointer is known
+to be correct for its purpose, it is signed according to the purpose's schema.
+At every place where the pointer is needed for its purpose, it is authenticated
+according to the purpose's schema.  If that authentication fails, the program is
+halted.
+
+There are a variety of ways to attack this.
+
+Attacks of interest to programmers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These attacks arise from weaknesses in the default protections offered by
+pointer authentication.  They can be addressed by using attributes or intrinsics
+to opt in to stronger protection.
+
+Substitution attacks
+++++++++++++++++++++
+
+An attacker can simply overwrite a pointer intended for one purpose with a
+pointer intended for another purpose if both purposes use the same signing
+schema and that schema does not use address diversity.
+
+The most common source of this weakness is when code relies on using the default
+language rules for C function pointers.  The current implementation uses the
+exact same signing schema for all C function pointers, even for functions of
+substantially different type.  While efforts are ongoing to improve constant
+diversity for C function pointers of different type, there are necessary limits
+to this.  The C standard requires function pointers to be copyable with
+``memcpy``, which means that function pointers can never use address diversity.
+Furthermore, even if a function pointer can only be replaced with another
+function of the exact same type, that can still be useful to an attacker, as in
+the following example of a hand-rolled "v-table":
+
+.. code-block:: c
+
+  struct ObjectOperations {
+    void (*retain)(Object *);
+    void (*release)(Object *);
+    void (*deallocate)(Object *);
+    void (*logStatus)(Object *);
+  };
+
+The weakness in this design is that, lacking any context-specific
+discriminator, it allows an attacker to substitute any of these fields with
+any other function pointer signed with the default schema. Similarly the lack of
+address diversity allows an attacker to replace the functions in one type's
+"v-table" with those of another. This can be mitigated by overriding the default
+authentication schema with a more specific signing schema for each purpose.  For
+instance, in this example, the ``__ptrauth`` qualifier can be used with a
+different constant discriminator for each field.  Since there's no particular
+reason it's important for this v-table to be copyable with ``memcpy``, the
+functions can also be signed with address diversity:
+
+.. code-block:: c
+
+  #if defined(__PTRAUTH__)
+  #define objectOperation(discriminator) \
+    __ptrauth(ptrauth_key_function_pointer, 1, discriminator)
+  #else
+  #define objectOperation(discriminator)
+  #endif
+
+  struct ObjectOperations {
+    void (*objectOperation(0xf017) retain)(Object *);
+    void (*objectOperation(0x2639) release)(Object *);
+    void (*objectOperation(0x8bb0) deallocate)(Object *);
+    void (*objectOperation(0xc5d4) logStatus)(Object *);
+  };
+
+This weakness can also sometimes be mitigated by simply keeping the signed
+pointer in constant memory, but this is less effective than using better signing
+diversity.
+
+.. _Access path attacks:
+
+Access path attacks
++++++++++++++++++++
+
+If a signed pointer is often accessed indirectly (that is, by first loading the
+address of the object where the signed pointer is stored), an attacker can
+affect uses of it by overwriting the intermediate pointer in the access path.
+
+The most common scenario exhibiting this weakness is an object with a pointer to
+a "v-table" (a structure holding many function pointers). An attacker does not
+need to replace a signed function pointer in the v-table if they can instead
+simply replace the v-table pointer in the object with their own pointer ---
+perhaps to memory where they've constructed their own v-table, or to existing
+memory that coincidentally happens to contain a signed pointer at the right
+offset that's been signed with the right signing schema.
+
+This attack arises because data pointers are not signed by default. It works
+even if the signed pointer uses address diversity: address diversity merely
+means that each pointer is signed with its own storage address,
+which (by design) is invariant to changes in the accessing pointer.
+
+Using sufficiently diverse signing schemas within the v-table can provide
+reasonably strong mitigation against this weakness.  Always use address and type
+diversity in v-tables to prevent attackers from assembling their own v-table.
+Avoid re-using constant discriminators to prevent attackers from replacing a
+v-table pointer with a pointer to totally unrelated memory that just happens to
+contain a similarly-signed pointer, or reused memory containing a different
+type.
+
+Further mitigation can be attained by signing pointers to v-tables. Any
+signature at all should prevent attackers from forging v-table pointers; they
+will need to somehow harvest an existing signed pointer from elsewhere in
+memory.  Using a meaningful constant discriminator will force this to be
+harvested from an object with similar structure (e.g. a different implementation
+of the same interface).  Using address diversity will prevent such harvesting
+entirely.  However, care must be taken when sourcing the v-table pointer
+originally; do not blindly sign a pointer that is not
+:ref:`safely derived<Safe derivation>`.
+
+.. _Signing oracles:
+
+Signing oracles
++++++++++++++++
+
+A signing oracle is a bit of code which can be exploited by an attacker to sign
+an arbitrary pointer in a way that can later be recovered.  Such oracles can be
+used by attackers to forge signatures matching the oracle's signing schema,
+which is likely to cause a total compromise of pointer authentication's
+effectiveness.
+
+This attack only affects ordinary programmers if they are using certain
+treacherous patterns of code.  Currently this includes:
+
+- all uses of the ``ptrauth_sign_unauthenticated`` intrinsic and
+- assigning values to ``__ptrauth``-qualified l-values.
+
+Care must be taken in these situations to ensure that the pointer being signed
+has been :ref:`safely derived<Safe derivation>` or is otherwise not possible to
+attack.  (In some cases, this may be challenging without compiler support.)
+
+A diagnostic will be added in the future for implicitly dangerous patterns of
+code, such as assigning a non-safely-derived value to a
+``__ptrauth``-qualified l-value.
+
+.. _Authentication oracles:
+
+Authentication oracles
+++++++++++++++++++++++
+
+An authentication oracle is a bit of code which can be exploited by an attacker
+to leak whether a signed pointer is validly signed without halting the program
+if it isn't.  Such oracles can be used to forge signatures matching the oracle's
+signing schema if the attacker can repeatedly invoke the oracle for different
+candidate signed pointers. This is likely to cause a total compromise of pointer
+authentication's effectiveness.
+
+There should be no way for an ordinary programmer to create an authentication
+oracle using the current set of operations. However, implementation flaws in the
+past have occasionally given rise to authentication oracles due to a failure to
+immediately trap on authentication failure.
+
+The likelihood of creating an authentication oracle is why there is currently no
+intrinsic which queries whether a signed pointer is validly signed.
+
+
+Attacks of interest to implementors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These attacks are not inherent to the model; they arise from mistakes in either
+implementing or using the ``sign`` and ``auth`` operations. Avoiding these
+mistakes requires careful work throughout the system.
+
+Failure to trap on authentication failure
++++++++++++++++++++++++++++++++++++++++++
+
+Any failure to halt the program on an authentication failure is likely to be
+exploitable by attackers to create an
+:ref:`authentication oracle<Authentication oracles>`.
+
+There are several different ways to introduce this problem:
+
+- The implementation might try to halt the program in some way that can be
+  intercepted.
+
+  For example, the Armv8.3 ``aut`` instructions do not directly trap on
+  authentication failure on processors that lack the ``FPAC`` extension.
+  Instead, they corrupt their results to be invalid pointers, with the idea that
+  subsequent uses of those pointers will trigger traps as bad memory accesses.
+  However, most kernels do not immediately halt programs that trap due to bad
+  memory accesses; instead, they notify the process to give it an opportunity to
+  recover. If this happens with an ``auth`` failure, the attacker may be able to
+  exploit the recovery path in a way that creates an oracle. Kernels must
+  provide a way for a process to trap unrecoverably, and this should cover all
+  ``FPAC`` traps. Compilers must ensure that ``auth`` failures trigger an
+  unrecoverable trap, ideally by taking advantage of ``FPAC``, but if necessary
+  by emitting extra instructions.
+
+- A compiler might use an intermediate representation (IR) for ``sign`` and
+  ``auth`` operations that cannot make adequate correctness guarantees.
+
+  For example, suppose that an IR uses ARMv8.3-like semantics for ``auth``: the
+  operation merely corrupts its result on failure instead of promising to trap.
+  A frontend might emit patterns of IR that always follow an ``auth`` with a
+  memory access, thinking that this ensures correctness. But if the IR can be
+  transformed to insert code between the ``auth`` and the access, or if the
+  ``auth`` can be speculated, then this potentially creates an oracle.  It is
+  better for ``auth`` to semantically guarantee to trap, potentially requiring
+  an explicit check in the generated code. An ARMv8.3-like target can avoid this
+  explicit check in the common case by recognizing the pattern of an ``auth``
+  followed immediately by an access.
+
+Attackable code sequences
++++++++++++++++++++++++++
+
+If code that is part of a pointer authentication operation is interleaved with
+code that may itself be vulnerable to attacks, an attacker may be able to use
+this to create a :ref:`signing<Signing oracles>` or
+:ref:`authentication<Authentication oracles>` oracle.
+
+For example, suppose that the compiler is generating a call to a function and
+passing two arguments: a signed constant pointer and a value derived from a
+call.  In ARMv8.3, this code might look like so:
+
+.. code-block:: asm
+
+  adr x19, _callback         ; compute &_callback
+  paciza x19                 ; sign it with a constant discriminator of 0
+  blr _argGenerator          ; call _argGenerator() (returns in x0)
+  mov x1, x0                 ; move call result to second arg register
+  mov x0, x19                ; move signed &_callback to first arg register
+  blr _function              ; call _function
+
+This code is correct, as would be a sequencing that does *both* the ``adr`` and
+the ``paciza`` after the call to ``_argGenerator``.  But a sequence that
+computes the address of ``_callback`` but leaves it as a raw pointer in a
+register during the call to ``_argGenerator`` would be vulnerable:
+
+.. code-block:: asm
+
+  adr x19, _callback         ; compute &_callback
+  blr _argGenerator          ; call _argGenerator() (returns in x0)
+  mov x1, x0                 ; move call result to second arg register
+  paciza x19                 ; sign &_callback
+  mov x0, x19                ; move signed &_callback to first arg register
+  blr _function              ; call _function
+
+If ``_argGenerator`` spills ``x19`` (a callee-save register), and if the
+attacker can perform a write during this call, then the attacker can overwrite
+the spill slot with an arbitrary pointer that will eventually be unconditionally
+signed after the function returns.  This would be a signing oracle.
+
+The implementation can avoid this by obeying two basic rules:
+
+- The compiler's intermediate representations (IR) should not provide operations
+  that expose intermediate raw pointers.  This may require providing extra
+  operations that perform useful combinations of operations.
+
+  For example, there should be an "atomic" auth-and-resign operation that should
+  be used instead of emitting an ``auth`` operation whose result is fed into a
+  ``sign``.
+
+  Similarly, if a pointer should be authenticated as part of doing a memory
+  access or a call, then the access or call should be decorated with enough
+  information to perform the authentication; there should not be a separate
+  ``auth`` whose result is used as the pointer operand for the access or call.
+  (In LLVM IR, we do this for calls, but not yet for loads or stores.)
+
+  "Operations" includes things like materializing a signed value to a known
+  function or global variable.  The compiler must be able to recognize and emit
+  this as a unified operation, rather than potentially splitting it up as in
+  the example above.
+
+- The compiler backend should not be too aggressive about scheduling
+  instructions that are part of a pointer authentication operation. This may
+  require custom code-generation of these operations in some cases.
+
+Register clobbering
++++++++++++++++++++
+
+As a refinement of the section on `Attackable code sequences`_, if the attacker
+has the ability to modify arbitrary *register* state at arbitrary points in the
+program, then special care must be taken.
+
+For example, ARMv8.3 might materialize a signed function pointer like so:
+
+.. code-block:: asm
+
+  adr x0, _callback         ; compute &_callback
+  paciza x0                 ; sign it with a constant discriminator of 0
+
+If an attacker has the ability to overwrite ``x0`` between these two
+instructions, this code sequence is vulnerable to becoming a signing oracle.
+
+For the most part, this sort of attack is not possible: it is a basic element of
+the design of modern computation that register state is private and inviolable.
+However, in systems that support asynchronous interrupts, this property requires
+the cooperation of the interrupt-handling code. If that code saves register
+state to memory, and that memory can be overwritten by an attacker, then
+essentially the attacker can overwrite arbitrary register state at an arbitrary
+point.  This could be a concern if the threat model includes attacks on the
+kernel or if the program uses user-space preemptive multitasking.
+
+(Readers might object that an attacker cannot rely on asynchronous interrupts
+triggering at an exact instruction boundary.  In fact, researchers have had some
+success in doing exactly that.  Even ignoring that, though, we should aim to
+protect against lucky attackers just as much as good ones.)
+
+To protect against this, saved register state must be at least partially signed
+(using something like `ptrauth_sign_generic_data`_).  This is required for
+correctness anyway because saved thread states include security-critical
+registers such as SP, FP, PC, and LR (where applicable).  Ideally, this
+signature would cover all the registers, but since saving and restoring
+registers can be very performance-sensitive, that may not be acceptable. It is
+sufficient to set aside a small number of scratch registers that will be
+guaranteed to be preserved correctly; the compiler can then be careful to only
+store critical values like intermediate raw pointers in those registers.
+
+``setjmp`` and ``longjmp`` should sign and authenticate the core registers (SP,
+FP, PC, and LR), but they do not need to worry about intermediate values because
+``setjmp`` can only be called synchronously, and the compiler should never
+schedule pointer-authentication operations interleaved with arbitrary calls.
+
+.. _Relative addresses:
+
+Attacks on relative addressing
+++++++++++++++++++++++++++++++
+
+Relative addressing is a technique used to compress and reduce the load-time
+cost of infrequently-used global data.  The pointer authentication system is
+unlikely to support signing or authenticating a relative address, and in most
+cases it would defeat the point to do so: it would take additional storage
+space, and applying the signature would take extra work at load time.
+
+Relative addressing is not precluded by the use of pointer authentication, but
+it does take extra considerations to make it secure:
+
+- Relative addresses must only be stored in read-only memory.  A writable
+  relative address can be overwritten to point nearly anywhere, making it
+  inherently insecure; this danger can only be compensated for with techniques
+  for protecting arbitrary data like `ptrauth_sign_generic_data`_.
+
+- Relative addresses must only be accessed through signed pointers with adequate
+  diversity.  If an attacker can use `access path attacks`_ to replace the
+  pointer through which the relative address is accessed, they can easily cause
+  the relative address to point wherever they want.
+
+Signature forging
++++++++++++++++++
+
+If an attacker can exactly reproduce the behavior of the signing algorithm, and
+they know all the correct inputs to it, then they can perfectly forge a
+signature on an arbitrary pointer.
+
+There are three components to avoiding this mistake:
+
+- The abstract signing algorithm should be good: it should not have glaring
+  flaws which would allow attackers to predict its result with better than
+  random accuracy without knowing all the inputs (like the key values).
+
+- The key values should be kept secret.  If at all possible, they should never
+  be stored in accessible memory, or perhaps only stored encrypted.
+
+- Contexts that are meant to be independently protected should use different
+  key values.  For example, the kernel should not use the same keys as user
+  processes.  Different user processes should also use different keys from each
+  other as much as possible, although this may pose its own technical
+  challenges.
+
+Remapping
++++++++++
+
+If an attacker can change the memory protections on certain pages of the
+program's memory, that can substantially weaken the protections afforded by
+pointer authentication.
+
+- If an attacker can inject their own executable code, they can also certainly
+  inject code that can be used as a :ref:`signing oracle<Signing Oracles>`.
+  The same is true if they can write to the instruction stream.
+
+- If an attacker can remap read-only program data sections to be writable, then
+  :ref:`relative addresses<Relative addresses>` in global data become insecure.
+
+- On platforms that use them, if an attacker can remap the memory containing
+  the `global offset tables`_ as writable, then any unsigned pointers in those
+  tables are insecure.
+
+Remapping memory in this way often requires the attacker to have already
+substantively subverted the control flow of the process.  Nonetheless, if the
+operating system has a mechanism for mapping pages in a way that cannot be
+remapped, this should be used wherever possible.
+
+.. _Safe Derivation:
+
+Safe derivation
+~~~~~~~~~~~~~~~
+
+Whether a data pointer is stored, even briefly, as a raw pointer can affect the
+security-correctness of a program.  (Function pointers are never implicitly
+stored as raw pointers; raw pointers to functions can only be produced with the
+``<ptrauth.h>`` intrinsics.)  Repeated re-signing can also impact performance.
+Clang makes a modest set of guarantees in this area:
+
+- An expression of pointer type is said to be **safely derived** if:
+
+  - it takes the address of a global variable or function, or
+
+  - it is a load from a gl-value of ``__ptrauth``-qualified type, or
+
+  - it is a load from read-only memory that has been initialized from a safely
+    derived source, such as the `data const` section of a binary or library.
+
+- If a value that is safely derived is assigned to a ``__ptrauth``-qualified
+  object, including by initialization, then the value will be directly signed as
+  appropriate for the target qualifier and will not be stored as a raw pointer.
+
+- If the function expression of a call is a gl-value of ``__ptrauth``-qualified
+  type, then the call will be authenticated directly according to the source
+  qualifier and will not be resigned to the default rule for a function pointer
+  of its type.
+
+These guarantees are known to be inadequate for data pointer security. In
+particular, Clang should be enhanced to make the following guarantees:
+
+- A pointer should additionally be considered safely derived if it is:
+
+  - the address of a gl-value that is safely derived,
+
+  - the result of pointer arithmetic on a pointer that is safely derived (with
+    some restrictions on the integer operand),
+
+  - the result of a comma operator where the second operand is safely derived,
+
+  - the result of a conditional operator where the selected operand is safely
+    derived, or
+
+  - the result of loading from a safely derived gl-value.
+
+- A gl-value should be considered safely derived if it is:
+
+  - a dereference of a safely derived pointer,
+
+  - a member access into a safely derived gl-value, or
+
+  - a reference to a variable.
+
+- An access to a safely derived gl-value should be guaranteed to not allow
+  replacement of any of the safely-derived component values at any point in the
+  access.  "Access" should include loading a function pointer.
+
+- Assignments should include pointer-arithmetic operators like ``+=``.
+
+Making these guarantees will require further work, including significant new
+support in LLVM IR.
+
+Furthermore, Clang should implement a warning when assigning a data pointer that
+is not safely derived to a ``__ptrauth``-qualified gl-value.
+
+
+Language ABI
+------------
+
+This section describes the pointer-authentication ABI currently implemented in
+Clang for the Apple arm64e target.  As other targets adopt pointer
+authentication, this section should be generalized to express their ABIs as
+well.
+
+Key assignments
+~~~~~~~~~~~~~~~
+
+ARMv8.3 provides four abstract signing keys: ``IA``, ``IB``, ``DA``, and ``DB``.
+The architecture designates ``IA`` and ``IB`` for signing code pointers and
+``DA`` and ``DB`` for signing data pointers; this is reinforced by two
+properties:
+
+- The ISA provides instructions that perform combined auth+call and auth+load
+  operations; these instructions can only use the ``I`` keys and ``D`` keys,
+  respectively.
+
+- AArch64's TBI feature can be separately enabled for code pointers (controlling
+  whether indirect-branch instructions ignore those bits) and data pointers
+  (controlling whether memory-access instructions ignore those bits). If TBI is
+  enabled for a kind of pointer, the sign and auth operations preserve the TBI
+  bits when signing with an associated key (at the cost of shrinking the number
+  of available signing bits by 8).
+
+arm64e then further subdivides the keys as follows:
+
+- The ``A`` keys are used for primarily "global" purposes like signing v-tables
+  and function pointers.  These keys are sometimes called *process-independent*
+  or *cross-process* because on existing OSes they are not changed when changing
+  processes, although this is not a platform guarantee.
+
+- The ``B`` keys are used for primarily "local" purposes like signing return
+  addresses.  These keys are sometimes called *process-specific* because they
+  are typically different between processes. However, they are in fact shared
+  across processes in one situation: systems which provide ``fork`` cannot
+  change these keys in the child process; they can only be changed during
+  ``exec``.
+
+Implementation-defined algorithms and quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The cryptographic hash algorithm used to compute signatures in ARMv8.3 is a
+private detail of the hardware implementation.
+
+arm64e restricts constant discriminators (used in ``__ptrauth`` and
+``ptrauth_blend_discriminator``) to the range from 0 to 65535, inclusive.  A 0
+discriminator generally signifies that no blending is required; see the
+documentation for ``ptrauth_blend_discriminator``.  This range is somewhat
+narrow but has two advantages:
+
+- The AArch64 ISA allows an arbitrary 16-bit immediate to be written over the
+  top 16 bits of a register in a single instruction:
+
+  .. code-block:: asm
+
+    movk xN, #0x4849, LSL 48
+
+  This is ideal for the discriminator blending operation because it adds minimal
+  code-size overhead and avoids overwriting any interesting bits from the
+  pointer.  Blending in a wider constant discriminator would either clobber
+  interesting bits (e.g. if it was loaded with ``movk xN, #0x4c4f, LSL 32``) or
+  require significantly more code (e.g. if the discriminator was loaded with a
+  ``mov+bfi`` sequence).
+
+- It is possible to pack a 16-bit discriminator into loader metadata with
+  minimal compromises, whereas a wider discriminator would require extra
+  metadata storage and therefore significantly impact load times.
+
+The string hash used by ``ptrauth_string_discriminator`` is a 64-bit SipHash-2-4
+using the constant seed ``b5d4c9eb79104a796fec8b1b428781d4`` (big-endian), with
+the result reduced by modulo to the range of non-zero discriminators (i.e.
+``(rawHash % 65535) + 1``).
+
+Return addresses
+~~~~~~~~~~~~~~~~
+
+The kernel must ensure that attackers cannot replace LR due to an asynchronous
+exception; see `Register clobbering`_.  If this is done by generally protecting
+LR, then functions which don't spill LR to the stack can avoid signing it
+entirely.  Otherwise, the return address must be signed; on arm64e it is signed
+with the ``IB`` key using the stack pointer on entry as the discriminator.
+
+Protecting return addresses is of such particular importance that the ``IB`` key
+is almost entirely reserved for this purpose.
+
+Global offset tables
+~~~~~~~~~~~~~~~~~~~~
+
+The global offset table (GOT) is not part of the language ABI, but it is a
+common implementation technique for dynamic linking which deserves special
+discussion here.
+
+Whenever possible, signed pointers should be materialized directly in code
+rather than via the GOT, e.g. using an ``adrp+add+pac`` sequence on ARMv8.3.
+This decreases the amount of work necessary at load time to initialize the GOT,
+but more importantly, it defines away the potential for several attacks:
+
+- Attackers cannot change instructions, so there is no way to cause this code
+  sequence to materialize a different pointer, whereas an access via the GOT
+  always has *at minimum* a probabilistic chance to be the target of successful
+  `substitution attacks`_.
+
+- The GOT is a dense pool of fixed pointers at a fixed offset relative to code;
+  attackers can search this pool for useful pointers that can be used in
+  `substitution attacks`_, whereas pointers that are only materialized directly
+  are not so easily available.
+
+- Similarly, attackers can use `access path attacks`_ to replace a pointer to a
+  signed pointer with a pointer to the GOT if the signing schema used within the
+  GOT happens to be the same as the original pointer.  This kind of collision
+  becomes much less likely to be useful the fewer pointers are in the GOT in the
+  first place.
+
+If this can be done for a symbol, then the compiler need only ensure that it
+materializes the signed pointer using registers that are safe against
+`register clobbering`_.
+
+However, many symbols can only be accessed via the GOT, e.g. because they
+resolve to definitions outside of the current image.  In this case, care must
+be taken to ensure that using the GOT does not introduce weaknesses.
+
+- If the entire GOT can be mapped read-only after loading, then no signing is
+  required within the GOT.  In fact, not signing pointers in the GOT is
+  preferable in this case because it makes the GOT useless for the harvesting
+  and access-path attacks above.  Storing raw pointers in this way is usually
+  extremely unsafe, but for the special case of an immutable GOT entry it's fine
+  because the GOT is always accessed via an address that is directly
+  materialized in code and thus provably unattackable.  (But see `Remapping`_.)
+
+- Otherwise, GOT entries which are used for producing a signed pointer constant
+  must be signed.  The signing schema used in the GOT need not match the target
+  signing schema for the signed constant.  To counteract the threats of
+  substitution attacks, it's best if GOT entries can be signed with address
+  diversity.  Using a good constant discriminator as well (perhaps derived from
+  the symbol name) can make it less useful to use a pointer to the GOT as the
+  replacement in an :ref:`access path attack<Access path attacks>`.
+
+In either case, the compiler must ensure that materializing the address of a GOT
+entry as part of producing a signed pointer constant is not vulnerable to
+`register clobbering`_.  If the linker also generates code for this, e.g. for
+call stubs, this generated code must take the same precautions.
+
+Dynamic symbol lookup
+~~~~~~~~~~~~~~~~~~~~~
+
+On platforms that support dynamically loading or resolving symbols it is
+necessary for them to define the pointer authentication semantics of the APIs
+provided to perform such lookups. While the platform may choose to return
+unsigned pointers from such functions and rely on the caller performing the
+initial signing, doing so creates the opportunity for caller side errors that
+create :ref:`signing oracles<Signing Oracles>`.
+
+On arm64e the `dlsym` function is used to resolve a symbol at runtime. If the
+resolved symbol is a function or other code pointer the returned pointer is
+signed using the default function signing schema described in
+:ref:`C function pointers<C function abi>`. If the resolved symbol is not a code pointer it is
+returned as an unsigned pointer.
+
+.. _C function abi:
+
+C function pointers
+~~~~~~~~~~~~~~~~~~~
+
+On arm64e, C function pointers are currently signed with the ``IA`` key without
+address diversity and with a constant discriminator of 0.
+
+The C and C++ standards do not permit C function pointers to be signed with
+address diversity by default: in C++ terms, function pointer types are required
+to be trivially copyable, which means they must be copyable with ``memcpy``.
+
+The use of a uniform constant discriminator greatly simplifies the adoption of
+arm64e, but it is a significant weakness in the mitigation because it allows any
+C function pointer to be replaced with another. Clang supports
+`-fptrauth-function-pointer-type-discrimination`, which enables a variant ABI
+that uses type discrimination for function pointers. When generating the type
+based discriminator for a function type all primitive integer types are
+considered equivalent due to the prevalence of mismatching integer parameter
+types in real world code. Type discrimination of function pointers is
+ABI-incompatible with the standard arm64e ABI, but it can be used in constrained
+contexts such as embedded systems or in code that does not require function
+pointer interoperation with the standard ABI (e.g. because it does not pass
+function pointers back and forth, or only does so through
+``__ptrauth``-qualified l-values).
+
+C++ virtual tables
+~~~~~~~~~~~~~~~~~~
+
+By default the pointer to a C++ virtual table is currently signed with the
+``DA`` key, address diversity, and a constant discriminator equal to the string
+hash (see `ptrauth_string_discriminator`_) of the mangled v-table identifier
+of the primary base class for the v-table. To support existing code or ABI
+constraints it is possible to use the `ptrauth_vtable_pointer` attribute to
+override the schema used for the v-table pointer of the base type of a
+polymorphic class hierarchy. This attribute permits the configuration of the
+key, address diversity mode, and any extra constant discriminator to be used.
+
+Virtual functions in a C++ virtual table are signed with the ``IA`` key, address
+diversity, and a constant discriminator equal to the string hash (see
+`ptrauth_string_discriminator`_) of the mangled name of the function which
+originally gave rise to the v-table slot.
+
+C++ dynamic_cast
+~~~~~~~~~~~~~~~~
+
+C++'s ``dynamic_cast`` presents a difficulty relative to other polymorphic
+languages that have a
+`top type <https://en.wikipedia.org/wiki/Any_type>`_, as the use of declaration
+diversity for v-table pointers results in distinct signing schemas for each
+isolated type hierarchy. As a result it is not possible for the Itanium ABI
+defined ``__dynamic_cast`` entry point to directly authenticate the v-table
+pointer of the provided object.
+
+The current implementation uses a forced authentication of the subject object's
+v-table prior to invoking ``__dynamic_cast`` to partially verify that the
+object's vtable is valid. The ``__dynamic_cast`` implementation currently relies
+on this caller side check to limit the substitutability of the v-table pointer
+with an incorrect or invalid v-table. The subsequent implementation of the
+dynamic cast algorithm is built on pointer auth protected ``type_info`` objects.
+
+In future a richer solution may be developed to support vending the correct
+authentication schema directly to the ``dynamic_cast`` implementation.
+
+C++ std::type_info v-table pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The v-table pointer of the ``std::type_info`` type is signed with the ``DA`` key
+and no additional diversity.
+
+C++ member function pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A member function pointer is signed with the ``IA`` key, no address diversity,
+and a constant discriminator equal to the string hash
+(see `ptrauth_string_discriminator`_) of the member pointer type.  Address
+diversity is not permitted by C++ for member function pointers because they must
+be trivially-copyable types.
+
+The Itanium C++ ABI specifies that member function pointers to virtual functions
+simply store an offset to the correct v-table slot.  This ABI cannot be used
+securely with pointer authentication because there is no safe place to store the
+constant discriminator for the target v-table slot: if it's stored with the
+offset, an attacker can simply overwrite it with the right discriminator for the
+offset.  Even if the programmer never uses pointers to virtual functions, the
+existence of this code path makes all member function pointer dereferences
+insecure.
+
+arm64e changes this ABI so that virtual function pointers are stored using
+dispatch thunks with vague linkage.  Because arm64e supports interoperation with
+``arm64`` code when pointer authentication is disabled, an arm64e member
+function pointer dereference still recognizes the virtual-function
+representation but uses a bogus discriminator on that path that should always
+trap if pointer authentication is enabled dynamically.
+
+The use of dispatch thunks means that ``==`` on member function pointers is no
+longer reliable for virtual functions, but this is acceptable because the
+standard makes no guarantees about it in the first place.
+
+The use of dispatch thunks also is required to support declaration specific
+authentication schemas for v-table pointers.
+
+C++ mangling
+~~~~~~~~~~~~
+
+When the ``__ptrauth`` qualifier appears in a C++ mangled name,
+it is mangled as a vendor qualifier with the signature
+``U9__ptrauthILj<key>ELb<addressDiscriminated>ELj<extraDiscriminator>EE``.
+
+e.g. ``int * __ptrauth(1, 0, 1234)`` will be mangled as
+``U9__ptrauthILj1ELb0ELj1234EE``.
+
+If the vtable pointer authentication scheme of a polymorphic class is overridden
+we mangle the override information with the vendor qualifier
+``__vtptrauth(int key, bool addressDiscriminated, unsigned extraDiscriminator)``,
+where the extra discriminator is the explicit value the specified discrimination
+mode evaluates to.
+
+Blocks
+~~~~~~
+
+Block pointers are data pointers which must interoperate with the ObjC `id` type
+and therefore cannot be signed themselves. As blocks conform to the ObjC `id`
+type, they contain an ``isa`` pointer signed as described
+:ref:`below<Objc isa and super>`.
+
+The invocation pointer in a block is signed with the ``IA`` key using address
+diversity and a constant discriminator of 0.  Using a uniform discriminator is
+seen as a weakness to be potentially improved, but this is tricky due to the
+subtype polymorphism directly permitted for blocks.
+
+Block descriptors and ``__block`` variables can contain pointers to functions
+that can be used to copy or destroy the object.  These functions are signed with
+the ``IA`` key, address diversity, and a constant discriminator of 0.  The
+structure of block descriptors is under consideration for improvement.
+
+Objective-C runtime
+~~~~~~~~~~~~~~~~~~~
+
+In addition to the compile time ABI design, the Objective-C runtime provides
+additional protection to methods and other metadata that have been loaded into
+the Objective-C method cache; this protection is private to the runtime.
+
+Objective-C methods
+~~~~~~~~~~~~~~~~~~~
+
+Objective-C method lists sign methods with the ``IA`` key using address
+diversity and a constant discriminator of 0.  Using a uniform constant
+discriminator is believed to be acceptable because these tables are only
+accessed internally to the Objective-C runtime.
+
+Objective-C class method list pointer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The method list pointer in Objective-C classes is signed with the ``DA`` key
+using address diversity, and a constant discriminator of 0xC310.
+
+Objective-C class read-only data pointer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The read-only data pointer in Objective-C classes is signed with the ``DA`` key
+using address diversity, and a constant discriminator of 0x61F8.
+
+.. _Objc isa and super:
+
+Objective-C ``isa`` and ``super`` pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An Objective-C object's ``isa`` and ``super`` pointers are both signed with
+the ``DA`` key using address diversity and constant discriminators of 0x6AE1
+and 0x25DA respectively.
+
+Objective-C ``SEL`` pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, the type of an Objective-C instance variable of type ``SEL``, when
+the qualifiers do not include an explicit ``__ptrauth`` qualifier, is adjusted
+to be qualified with ``__ptrauth(ptrauth_key_asdb, 1, 0x57C2)``.
+
+This provides a measure of implicit at-rest protection to Objective-C classes
+that store selectors, as in the common target-action design pattern. This
+prevents attackers from overriding the selector to invoke an arbitrary different
+method, which is a major attack vector in Objective-C. Since ``SEL`` values are
+not normally passed around as signed pointers, there is a
+:ref:`signing oracle<Signing Oracles>` associated with the initialization of the
+ivar, but the use of address and constant diversity limits the risks.
+
+The implicit qualifier means that the type of the ivar does not match its
+declaration, which can cause type errors if the address of the ivar is taken:
+
+.. code-block:: ObjC
+
+  @interface A : NSObject {
+    SEL _s;
+  }
+  @end
+
+  void f(SEL *);
+
+  @implementation A
+  -(void)g
+  {
+     f(&_s);
+  }
+  @end
+
+To fix such a mismatch, use the schema macro from `<ptrauth.h>`:
+
+.. code-block:: ObjC
+
+  #include <ptrauth.h>
+
+  void f(SEL __ptrauth_objc_sel*);
 
+or less safely, and introducing the possibility of a
+:ref:`signing or authentication oracle<Signing oracles>`, an unauthenticated
+temporary may be used as intermediate storage.
 
-Alternative Implementations
+Alternative implementations
 ---------------------------
 
-Signature Storage
+Signature storage
 ~~~~~~~~~~~~~~~~~
 
 It is not critical for the security of pointer authentication that the
@@ -536,7 +1667,7 @@ Storing the signature in the high bits, as Armv8.3 does, has several trade-offs:
   return signed pointers.  This means that clients of these APIs will not
   require insecure code in order to correctly receive a function pointer.
 
-Hashing vs. Encrypting Pointers
+Hashing vs. encrypting pointers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Armv8.3 implements ``sign`` by computing a cryptographic hash and storing that
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bd7a4b20242f..9400be296e7c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -55,7 +55,7 @@ C/C++ Language Potentially Breaking Changes
   case for old-style offsetof idioms like ``((int)(&(((struct S *)0)->field)))``, to
   ensure they are not caught by these optimizations.  It is also possible to use
   ``-fwrapv-pointer`` or   ``-fno-delete-null-pointer-checks`` to make pointer arithmetic
-  on null pointers well-defined. (#GH130734, #GH130742, #GH130952)
+  on null pointers well-defined. (#GH130734, #GH130952)
 
 C++ Specific Potentially Breaking Changes
 -----------------------------------------
@@ -123,6 +123,8 @@ C++ Language Changes
   a perfect match (all conversion sequences are identity conversions) template candidates are not instantiated.
   Diagnostics that would have resulted from the instantiation of these template candidates are no longer
   produced. This aligns Clang closer to the behavior of GCC, and fixes (#GH62096), (#GH74581), and (#GH74581).
+- Implemented `P2719R5 Type-aware allocation and deallocation functions <https://wg21.link/P2719>`_
+  as an extension in all C++ language modes.
 
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -359,6 +361,12 @@ Non-comprehensive list of changes in this release
   ARC-managed pointers and other pointer types. The prior behavior was overly
   strict and inconsistent with the ARC specification.
 
+- Use of ``__has_feature`` to detect the ``ptrauth_qualifier`` and ``ptrauth_intrinsics``
+  features has been deprecated, and is restricted to the arm64e target only. The
+  correct method to check for these features is to test for the ``__PTRAUTH__``
+  macro.
+
+
 New Compiler Flags
 ------------------
 
@@ -376,6 +384,13 @@ New Compiler Flags
 
 - New options ``-g[no-]key-instructions`` added, disabled by default. Reduces jumpiness of debug stepping for optimized code in some debuggers (not LLDB at this time). Not recommended for use without optimizations. DWARF only. Note both the positive and negative flags imply ``-g``.
 
+- New options ``-fthinlto-distributor=`` and ``-Xthinlto-distributor=`` added for Integrated Distributed ThinLTO (DTLTO). DTLTO enables the distribution of backend ThinLTO compilations via external distribution systems, such as Incredibuild, during the traditional link step. (#GH147265, `ThinLTODocs <https://clang.llvm.org/docs/ThinLTO.html#integrated-distributed-thinlto-dtlto>`_).
+
+- A new flag - `-static-libclosure` was introduced to support statically linking
+  the runtime for the Blocks extension on Windows. This flag currently only
+  changes the code generation, and even then, only on Windows. This does not
+  impact the linker behaviour like the other `-static-*` flags.
+
 Deprecated Compiler Flags
 -------------------------
 
@@ -674,7 +689,7 @@ Improvements to Clang's diagnostics
   #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308,
   #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490,
   #GH36703, #GH32903, #GH23312, #GH69874.
-  
+
 - Clang no longer emits a spurious -Wdangling-gsl warning in C++23 when
   iterating over an element of a temporary container in a range-based
   for loop.(#GH109793, #GH145164)
@@ -800,6 +815,8 @@ Bug Fixes in This Version
   declaration statements. Clang now emits a warning for these patterns. (#GH141659)
 - Fixed false positives for redeclaration errors of using enum in
   nested scopes. (#GH147495)
+- Fixed a crash in `clang-scan-deps` when a module with the same name is found
+  in different locations (#GH134404, #GH146976).
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -949,7 +966,6 @@ Bug Fixes to C++ Support
 - Fixed an access checking bug when initializing non-aggregates in default arguments (#GH62444), (#GH83608)
 - Fixed a pack substitution bug in deducing class template partial specializations. (#GH53609)
 - Fixed a crash when constant evaluating some explicit object member assignment operators. (#GH142835)
-- Fixed an access checking bug when substituting into concepts (#GH115838)
 - Fix a bug where private access specifier of overloaded function not respected. (#GH107629)
 - Correctly handles calling an explicit object member function template overload set
   through its address (``(&Foo::bar<baz>)()``).
@@ -967,6 +983,7 @@ Bug Fixes to C++ Support
 - Fix a crash with NTTP when instantiating local class.
 - Fixed a crash involving list-initialization of an empty class with a
   non-empty initializer list. (#GH147949)
+- Fixed constant evaluation of equality comparisons of constexpr-unknown references. (#GH147663)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -993,6 +1010,19 @@ Miscellaneous Clang Crashes Fixed
 OpenACC Specific Changes
 ------------------------
 
+- OpenACC support, enabled via `-fopenacc` has reached a level of completeness
+  to finally be at least notionally usable. Currently, the OpenACC 3.4
+  specification has been completely implemented for Sema and AST creation, so
+  nodes will show up in the AST after having been properly checked. Lowering is
+  currently a work in progress, with compute, loop, and combined constructs
+  partially implemented, plus a handful of data and executable constructs
+  implemented. Lowering will only work in Clang-IR mode (so only with a compiler
+  built with Clang-IR enabled, and with `-fclangir` used on the command line).
+  However, note that the Clang-IR implementation status is also quite partial,
+  so frequent 'not yet implemented' diagnostics should be expected.  Also, the
+  ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
+  code generation is possible for OpenACC.
+
 Target Specific Changes
 -----------------------
 
@@ -1071,6 +1101,8 @@ Windows Support
   extensions are enabled. This allows for properly processing ``intsafe.h`` in
   the Windows SDK.
 
+- Clang now supports `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
@@ -1119,9 +1151,19 @@ CUDA/HIP Language Changes
 CUDA Support
 ^^^^^^^^^^^^
 
+PowerPC Support
+^^^^^^^^^^^^^^^
+
+* Add `__dmr1024` type for Dense Math Facility.
+* Add prototype for Dense Math Facility integer calculation builtins.
+
 AIX Support
 ^^^^^^^^^^^
 
+* Fixed `-print-runtime-dir` to fall back to the target subdirectory (rather than OS subdirectory) if the runtime path is not found.
+* Fixed `-print-runtime-dir` to find the correct runtime path if the triple has "unknown" as the environment component.
+* Changed AIX targets to use the per-target runtime directories for compiler runtimes (i.e. `lib/clang/20/lib/aix` became `lib/clang/21/lib/powerpc-ibm-aix` and `clang/21/lib/powerpc64-ibm-aix`).
+
 NetBSD Support
 ^^^^^^^^^^^^^^
 
@@ -1192,53 +1234,112 @@ Code Completion
 
 Static Analyzer
 ---------------
-- Fixed a crash when C++20 parenthesized initializer lists are used. This issue
-  was causing a crash in clang-tidy. (#GH136041)
 
 New features
 ^^^^^^^^^^^^
 
-- A new flag - `-static-libclosure` was introduced to support statically linking
-  the runtime for the Blocks extension on Windows. This flag currently only
-  changes the code generation, and even then, only on Windows. This does not
-  impact the linker behaviour like the other `-static-*` flags.
-- OpenACC support, enabled via `-fopenacc` has reached a level of completeness
-  to finally be at least notionally usable. Currently, the OpenACC 3.4
-  specification has been completely implemented for Sema and AST creation, so
-  nodes will show up in the AST after having been properly checked. Lowering is
-  currently a work in progress, with compute, loop, and combined constructs
-  partially implemented, plus a handful of data and executable constructs
-  implemented. Lowering will only work in Clang-IR mode (so only with a compiler
-  built with Clang-IR enabled, and with `-fclangir` used on the command line).
-  However, note that the Clang-IR implementation status is also quite partial,
-  so frequent 'not yet implemented' diagnostics should be expected.  Also, the
-  ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
-  code generation is possible for OpenACC.
-- Implemented `P2719R5 Type-aware allocation and deallocation functions <https://wg21.link/P2719>`_
-  as an extension in all C++ language modes.
+- Added support for the ``[[clang::assume(cond)]]`` attribute, treating it as
+  ``__builtin_assume(cond)`` for better static analysis. (#GH129234)
+
+- Introduced per-entry-point statistics to provide more detailed analysis metrics.
+  Documentation: :doc:`analyzer/developer-docs/Statistics` (#GH131175)
+
+- Added time-trace scopes for high-level analyzer steps to improve performance
+  debugging. Documentation: :doc:`analyzer/developer-docs/PerformanceInvestigation`
+  (#GH125508, #GH125884)
 
+- Enhanced the ``check::BlockEntrance`` checker callback to provide more granular
+  control over block-level analysis.
+  `Documentation (check::BlockEntrance)
+  <https://clang.llvm.org/doxygen/CheckerDocumentation_8cpp_source.html>`_
+  (#GH140924)
+
+- Added a new checker ``core.FixedAddressDereference`` to detect dereferences
+  of fixed addresses, which can be useful for finding hard-coded memory
+  accesses. (#GH127191, #GH132404)
 
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
-- Fixed a crash in ``UnixAPIMisuseChecker`` and ``MallocChecker`` when analyzing
+- Fixed a crash when C++20 parenthesized initializer lists are used.
+  This fix also resolves a crash in the well-known lambda overloaded pattern.
+  (#GH136041, #GH135665)
+
+- Dropped an unjustified assertion, that was triggered in ``BugReporterVisitors.cpp``
+  for variable initialization detection. (#GH125044)
+
+- Fixed a crash in ``unix.API`` and ``unix.Malloc`` when analyzing
   code with non-standard ``getline`` or ``getdelim`` function signatures. (#GH144884)
 
+- Fixed crashes involving ``__builtin_bit_cast``. (#GH139188)
+
+- ``__datasizeof`` (C++) and ``_Countof`` (C) no longer cause a failed assertion
+  when given an operand of VLA type. (#GH151711)
+
+- Fixed a crash in ``alpha.core.CastSize``. (#GH134387)
+
+- Some ``cplusplus.PlacementNew`` false positives were fixed. (#GH150161)
+
 Improvements
 ^^^^^^^^^^^^
 
+- Added option to assume at least one iteration in loops to reduce false positives.
+  (#GH125494)
+
 - The checker option ``optin.cplusplus.VirtualCall:PureOnly`` was removed,
-  because it had been deprecated since 2019 and it is completely useless (it
-  was kept only for compatibility with pre-2019 versions, setting it to true is
-  equivalent to completely disabling the checker).
+  because it had been deprecated since 2019. (#GH131823)
+
+- Enhanced the ``core.StackAddressEscape`` to detect more cases of stack address
+  escapes, including return values for child stack frames. (#GH126620, #GH126986)
+
+- Improved the ``unix.BlockInCriticalSection`` to recognize ``O_NONBLOCK``
+  streams and suppress reports in those cases. (#GH127049)
+
+- Better support for lambda-converted function pointers in analysis. (#GH144906)
+
+- Improved modeling of ``getcwd`` function in ``unix.StdCLibraryFunctions`` checker.
+  (#GH141076)
+
+- Enhanced the ``optin.core.EnumCastOutOfRange`` checker to ignore ``[[clang::flag_enum]]``
+  enums. (#GH141232)
+
+- Improved handling of structured bindings captured by lambdas. (#GH132579, #GH91835)
+
+- Fixed unnamed bitfield handling in ``optin.cplusplus.UninitializedObject``. (#GH132427, #GH132001)
+
+- Enhanced iterator checker modeling for ``insert`` operations. (#GH132596)
+
+- Improved ``format`` attribute handling in ``optin.taint.GenericTaint``. (#GH132765)
+
+- Added support for ``consteval`` in ``ConditionBRVisitor::VisitTerminator``.
+  (#GH146859, #GH139130)
+
+- C standard streams are no longer invalidated by all C library function calls.
+  (#GH147766)
+
+- Enhanced store management with region-store-binding-limit to improve performance.
+  See `region-store-max-binding-fanout
+  <https://clang.llvm.org/docs/analyzer/user-docs/Options.html#region-store-max-binding-fanout>`_
+  config option. Overriding these options is discouraged unless you know what you are doing.
+  (#GH127602)
+
+- Updated undefined assignment checker (``core.uninitialized.Assign``) diagnostics
+  to avoid using the term ``garbage``. (#GH126596)
+
+- Fixed false memory leak reports involving placement new. (#GH144341)
+
+- Avoided unnecessary super region invalidation in ``unix.cstring.*`` checkers.
+  (#GH146212, #GH143807)
+
+- Enhanced handling of tainted division-by-zero error paths in the
+  ``optin.taint.TaintedDiv`` checker. (#GH144491)
 
 Moved checkers
 ^^^^^^^^^^^^^^
 
-- After lots of improvements, the checker ``alpha.security.ArrayBoundV2`` is
+- After lots of improvements, the checker ``alpha.security.ArrayBoundV2`` was
   renamed to ``security.ArrayBound``. As this checker is stable now, the old
-  checker ``alpha.security.ArrayBound`` (which was searching for the same kind
-  of bugs with an different, simpler and less accurate algorithm) is removed.
+  checker ``alpha.security.ArrayBound`` was removed.
 
 .. _release-notes-sanitizers:
 
diff --git a/clang/docs/ThinLTO.rst b/clang/docs/ThinLTO.rst
index c04254767891..8cb3e0b2b0d1 100644
--- a/clang/docs/ThinLTO.rst
+++ b/clang/docs/ThinLTO.rst
@@ -240,6 +240,53 @@ The ``BOOTSTRAP_LLVM_ENABLE_LTO=Thin`` will enable ThinLTO for stage 2 and
 stage 3 in case the compiler used for stage 1 does not support the ThinLTO
 option.
 
+Integrated Distributed ThinLTO (DTLTO)
+--------------------------------------
+
+Integrated Distributed ThinLTO (DTLTO) enables the distribution of backend
+ThinLTO compilations via external distribution systems, such as Incredibuild,
+during the traditional link step.
+
+The implementation is documented here: https://llvm.org/docs/DTLTO.html.
+
+Command-Line Options
+^^^^^^^^^^^^^^^^^^^^
+
+DTLTO requires the LLD linker (``-fuse-ld=lld``).
+
+``-fthinlto-distributor=<path>``
+   - Specifies the ``<path>`` to the distributor process executable for DTLTO.
+   - If specified, ThinLTO backend compilations will be distributed by LLD.
+
+``-Xthinlto-distributor=<arg>``
+   - Passes ``<arg>`` to the distributor process (see ``-fthinlto-distributor=``).
+   - Can be specified multiple times to pass multiple options.
+   - Multiple options can also be specified by separating them with commas.
+
+If ``-fthinlto-distributor=`` is specified, Clang supplies the path to a
+compiler to be executed remotely to perform the ThinLTO backend
+compilations. Currently, this is Clang itself.
+
+Usage
+^^^^^
+
+Compilation is unchanged from ThinLTO. DTLTO options need to be supplied for the link step:
+
+.. code-block:: console
+
+  % clang -flto=thin -fthinlto-distributor=distribute.sh -Xthinlto-distributor=--verbose,--j10 -fuse-ld=lld file1.o file2.o
+  % clang -flto=thin -fthinlto-distributor=$(which python) -Xthinlto-distributor=distribute.py -fuse-ld=lld file1.o file2.o
+
+When using lld-link:
+
+.. code-block:: console
+
+  % lld-link /out:a.exe file1.obj file2.obj /thinlto-distributor:distribute.exe /thinlto-remote-compiler:${LLVM}\bin\clang.exe /thinlto-distributor-arg:--verbose
+
+Note that currently, DTLTO is only supported in some LLD flavors. Support can
+be added to other LLD flavors in the future.
+See `DTLTO <https://lld.llvm.org/DTLTO.html>`_ for more information.
+
 More Information
 ================
 
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index c35311c88641..b929585205ae 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -6953,6 +6953,21 @@ clang_getCursorUnaryOperatorKind(CXCursor cursor);
  * @}
  */
 
+CINDEX_DEPRECATED
+typedef void *CXRemapping;
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping clang_getRemappings(const char *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping
+clang_getRemappingsFromFileList(const char **, unsigned);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE unsigned clang_remap_getNumFiles(CXRemapping);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void
+clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void clang_remap_dispose(CXRemapping);
+
 LLVM_CLANG_C_EXTERN_C_END
 
 #endif
diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h
index 9999a30c51ad..cb942ea865e2 100644
--- a/clang/include/clang/AST/APValue.h
+++ b/clang/include/clang/AST/APValue.h
@@ -143,7 +143,7 @@ class APValue {
     AddrLabelDiff
   };
 
-  class LValueBase {
+  class alignas(uint64_t) LValueBase {
     typedef llvm::PointerUnion<const ValueDecl *, const Expr *, TypeInfoLValue,
                                DynamicAllocLValue>
         PtrTy;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index fefdaba7f8bf..76747d2b1181 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -9417,9 +9417,9 @@ def NonStringDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
 The ``nonstring`` attribute can be applied to the declaration of a variable or
-a field whose type is a character array to specify that the character array is
-not intended to behave like a null-terminated string. This will silence
-diagnostics with code like:
+a field whose type is a character pointer or character array to specify that
+the buffer is not intended to behave like a null-terminated string. This will
+silence diagnostics with code like:
 
 .. code-block:: c
 
diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td
index a67b9995d3b5..4c7219c78c8b 100644
--- a/clang/include/clang/Basic/DiagnosticASTKinds.td
+++ b/clang/include/clang/Basic/DiagnosticASTKinds.td
@@ -507,6 +507,14 @@ def note_odr_number_of_bases : Note<
   "class has %0 base %plural{1:class|:classes}0">;
 def note_odr_enumerator : Note<"enumerator %0 with value %1 here">;
 def note_odr_missing_enumerator : Note<"no corresponding enumerator here">;
+def note_odr_incompatible_fixed_underlying_type : Note<
+  "enumeration %0 declared with incompatible fixed underlying types (%1 vs. "
+  "%2)">;
+def note_odr_fixed_underlying_type : Note<
+  "enumeration %0 has fixed underlying type here">;
+def note_odr_missing_fixed_underlying_type : Note<
+  "enumeration %0 missing fixed underlying type here">;
+
 def err_odr_field_type_inconsistent : Error<
   "field %0 declared with incompatible types in different "
   "translation units (%1 vs. %2)">;
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 72f23614aef1..b040181beaff 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -147,14 +147,17 @@ FEATURE(type_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Type))
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
-FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics)
-EXTENSION(ptrauth_qualifier, LangOpts.PointerAuthIntrinsics)
+FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics &&
+                            PP.getTargetInfo().getTriple().isOSDarwin())
+FEATURE(ptrauth_qualifier, LangOpts.PointerAuthIntrinsics &&
+                           PP.getTargetInfo().getTriple().isOSDarwin())
 FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns)
 FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination)
 FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
 FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
 FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_signed_block_descriptors, LangOpts.PointerAuthBlockDescriptorPointers)
 FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
 FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
@@ -163,7 +166,7 @@ FEATURE(ptrauth_elf_got, LangOpts.PointerAuthELFGOT)
 
 FEATURE(ptrauth_objc_isa, LangOpts.PointerAuthObjcIsa)
 FEATURE(ptrauth_objc_interface_sel, LangOpts.PointerAuthObjcInterfaceSel)
-FEATURE(ptrauth_objc_signable_class, true)
+FEATURE(ptrauth_objc_signable_class, LangOpts.PointerAuthIntrinsics)
 FEATURE(ptrauth_objc_method_list_pointer, LangOpts.PointerAuthCalls)
 
 EXTENSION(swiftcc,
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 0d546cb3b847..25f4575a5425 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -138,6 +138,8 @@ LANGOPT(PointerAuthObjcInterfaceSel, 1, 0, NotCompatible, "authentication of SEL
 LANGOPT(PointerAuthObjcInterfaceSelKey, 16, 0, NotCompatible, "authentication key for SEL fields of ObjC interfaces")
 LANGOPT(PointerAuthObjcClassROPointers, 1, 0, Benign, "class_ro_t pointer authentication")
 
+LANGOPT(PointerAuthBlockDescriptorPointers, 1, 0, NotCompatible, "enable signed block descriptors")
+
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, NotCompatible, "'[[]]' attributes extension for all language standard modes")
 LANGOPT(ExperimentalLateParseAttributes, 1, 0, NotCompatible, "experimental late parsing of attributes")
 
@@ -496,6 +498,8 @@ LANGOPT(CheckConstexprFunctionBodies, 1, 1, Benign,
 
 LANGOPT(BoundsSafety, 1, 0, NotCompatible, "Bounds safety extension for C")
 
+LANGOPT(EnableLifetimeSafety, 1, 0, NotCompatible, "Experimental lifetime safety analysis for C++")
+
 LANGOPT(PreserveVec3Type, 1, 0, NotCompatible, "Preserve 3-component vector type")
 
 #undef LANGOPT
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
index fb6dddf3ae9c..2b920250721f 100644
--- a/clang/include/clang/Basic/PointerAuthOptions.h
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -23,6 +23,10 @@
 
 namespace clang {
 
+/// Constant discriminator to be used with block descriptor pointers. The value
+/// is ptrauth_string_discriminator("block_descriptor")
+constexpr uint16_t BlockDescriptorConstantDiscriminator = 0xC0BB;
+
 /// Constant discriminator to be used with function pointers in .init_array and
 /// .fini_array. The value is ptrauth_string_discriminator("init_fini")
 constexpr uint16_t InitFiniPointerConstantDiscriminator = 0xD9D4;
@@ -223,6 +227,18 @@ struct PointerAuthOptions {
   /// The ABI for function addresses in .init_array and .fini_array
   PointerAuthSchema InitFiniPointers;
 
+  /// The ABI for block invocation function pointers.
+  PointerAuthSchema BlockInvocationFunctionPointers;
+
+  /// The ABI for block object copy/destroy function pointers.
+  PointerAuthSchema BlockHelperFunctionPointers;
+
+  /// The ABI for __block variable copy/destroy function pointers.
+  PointerAuthSchema BlockByrefHelperFunctionPointers;
+
+  /// The ABI for pointers to block descriptors.
+  PointerAuthSchema BlockDescriptorPointers;
+
   /// The ABI for Objective-C method lists.
   PointerAuthSchema ObjCMethodListFunctionPointers;
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index e7c7e9d93fe2..958d0d05ade2 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -990,6 +990,13 @@ def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
   Visibility<[ClangOption, CLOption, FlangOption]>,
   HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
   Group<Link_Group>;
+def Xthinlto_distributor_EQ : CommaJoined<["-"], "Xthinlto-distributor=">,
+  Flags<[LinkOption]>,
+  Visibility<[ClangOption, CLOption]>,
+  HelpText<"Pass <arg> to the ThinLTO distributor process. Can be specified "
+           "multiple times or with comma-separated values.">,
+  MetaVarName<"<arg>">,
+  Group<Link_Group>;
 def Xoffload_linker : JoinedAndSeparate<["-"], "Xoffload-linker">,
   Visibility<[ClangOption, FlangOption]>,
   HelpText<"Pass <arg> to the offload linkers or the ones identified by -<triple>">,
@@ -1910,6 +1917,14 @@ defm bounds_safety : BoolFOption<
   BothFlags<[], [CC1Option],
           " experimental bounds safety extension for C">>;
 
+defm lifetime_safety : BoolFOption<
+  "experimental-lifetime-safety",
+  LangOpts<"EnableLifetimeSafety">, DefaultFalse,
+  PosFlag<SetTrue, [], [CC1Option], "Enable">,
+  NegFlag<SetFalse, [], [CC1Option], "Disable">,
+  BothFlags<[], [CC1Option],
+          " experimental lifetime safety for C++">>;
+
 defm addrsig : BoolFOption<"addrsig",
   CodeGenOpts<"Addrsig">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Emit">,
@@ -4249,7 +4264,12 @@ def ffinite_loops: Flag<["-"],  "ffinite-loops">, Group<f_Group>,
 def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group<f_Group>,
   HelpText<"Do not assume that any loop is finite.">,
   Visibility<[ClangOption, CC1Option]>;
-
+def fthinlto_distributor_EQ : Joined<["-"], "fthinlto-distributor=">,
+  Group<f_Group>,
+  HelpText<"Path to the ThinLTO distributor process. If specified, "
+           "ThinLTO backend compilations will be distributed by LLD">,
+  MetaVarName<"<path>">,
+  Visibility<[ClangOption, CLOption]>;
 def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
   HelpText<"Process trigraph sequences">, Visibility<[ClangOption, CC1Option]>;
 def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
@@ -4507,6 +4527,7 @@ defm aarch64_jump_table_hardening: OptInCC1FFlag<"aarch64-jump-table-hardening",
 defm ptrauth_objc_isa : OptInCC1FFlag<"ptrauth-objc-isa", "Enable signing and authentication of Objective-C object's 'isa' field">;
 defm ptrauth_objc_interface_sel : OptInCC1FFlag<"ptrauth-objc-interface-sel", "Enable signing and authentication of Objective-C object's 'SEL' fields">;
 defm ptrauth_objc_class_ro : OptInCC1FFlag<"ptrauth-objc-class-ro", "Enable signing and authentication for ObjC class_ro pointers">;
+defm ptrauth_block_descriptor_pointers : OptInCC1FFlag<"ptrauth-block-descriptor-pointers", "Enable signing and authentication of block descriptors">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index b4f2a87fe7e8..f0d0000c42a9 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -4694,6 +4694,13 @@ struct FormatStyle {
     ///      <conditional-body>                     <conditional-body>
     /// \endcode
     bool AfterIfMacros;
+    /// If ``true``, put a space between alternative operator ``not`` and the
+    /// opening parenthesis.
+    /// \code
+    ///    true:                                  false:
+    ///    return not (a || b);            vs.    return not(a || b);
+    /// \endcode
+    bool AfterNot;
     /// If ``true``, put a space between operator overloading and opening
     /// parentheses.
     /// \code
@@ -4742,9 +4749,9 @@ struct FormatStyle {
         : AfterControlStatements(false), AfterForeachMacros(false),
           AfterFunctionDeclarationName(false),
           AfterFunctionDefinitionName(false), AfterIfMacros(false),
-          AfterOverloadedOperator(false), AfterPlacementOperator(true),
-          AfterRequiresInClause(false), AfterRequiresInExpression(false),
-          BeforeNonEmptyParentheses(false) {}
+          AfterNot(false), AfterOverloadedOperator(false),
+          AfterPlacementOperator(true), AfterRequiresInClause(false),
+          AfterRequiresInExpression(false), BeforeNonEmptyParentheses(false) {}
 
     bool operator==(const SpaceBeforeParensCustom &Other) const {
       return AfterControlStatements == Other.AfterControlStatements &&
@@ -4753,6 +4760,7 @@ struct FormatStyle {
                  Other.AfterFunctionDeclarationName &&
              AfterFunctionDefinitionName == Other.AfterFunctionDefinitionName &&
              AfterIfMacros == Other.AfterIfMacros &&
+             AfterNot == Other.AfterNot &&
              AfterOverloadedOperator == Other.AfterOverloadedOperator &&
              AfterPlacementOperator == Other.AfterPlacementOperator &&
              AfterRequiresInClause == Other.AfterRequiresInClause &&
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 06971ff87ab9..423f2ffe2f85 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -143,9 +143,6 @@ class Lexer : public PreprocessorLexer {
   /// True if this is the first time we're lexing the input file.
   bool IsFirstTimeLexingFile;
 
-  /// True if current lexing token is the first pp-token.
-  bool IsFirstPPToken;
-
   // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
   // it also points to '\n.'
   const char *NewLinePtr;
diff --git a/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h
new file mode 100644
index 000000000000..9ab3c6a528a1
--- /dev/null
+++ b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h
@@ -0,0 +1,310 @@
+//===--- NoTrivialPPDirectiveTracer.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the NoTrivialPPDirectiveTracer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
+#define LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
+
+#include "clang/Lex/PPCallbacks.h"
+
+namespace clang {
+class Preprocessor;
+
+/// Consider the following code:
+///
+/// # 1 __FILE__ 1 3
+/// export module a;
+///
+/// According to the wording in
+/// [P1857R3](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1857r3.html):
+///
+///   A module directive may only appear as the first preprocessing tokens in a
+///   file (excluding the global module fragment.)
+///
+/// and the wording in
+/// [[cpp.pre]](https://eel.is/c++draft/cpp.pre#nt:module-file):
+///   module-file:
+///     pp-global-module-fragment[opt] pp-module group[opt]
+///     pp-private-module-fragment[opt]
+///
+/// `#` is the first pp-token in the translation unit, and it was rejected by
+/// clang, but such line markers really should be exempted from this rule. The
+/// goal is to not allow any preprocessor conditionals or most state changes,
+/// but these don't fit that.
+///
+/// State change would mean most semantically observable preprocessor state,
+/// particularly anything that is order dependent. Global flags like being a
+/// system header/module shouldn't matter.
+///
+/// We should exempt a bunch of directives, even though it violates the current
+/// standard wording.
+///
+/// This class is used to trace 'no-trivial' pp-directives in the main file,
+/// which may change the preprocessing state.
+///
+/// FIXME: Once the wording of the standard is revised, we need to follow the
+/// wording of the standard. Currently this is just a workaround.
+class NoTrivialPPDirectiveTracer : public PPCallbacks {
+  Preprocessor &PP;
+
+  /// Whether we are preprocessing the main file. We only focus on that file.
+  bool InMainFile = true;
+
+  /// Whether one or more conditional, include or other 'no-trivial'
+  /// pp-directives have been seen before.
+  bool SeenNoTrivialPPDirective = false;
+
+  void setSeenNoTrivialPPDirective();
+
+public:
+  NoTrivialPPDirectiveTracer(Preprocessor &P) : PP(P) {}
+
+  bool hasSeenNoTrivialPPDirective() const;
+
+  /// Callback invoked whenever the \p Lexer moves to a different file for
+  /// lexing. Unlike \p FileChanged line number directives and other related
+  /// pragmas do not trigger callbacks to \p LexedFileChanged.
+  ///
+  /// \param FID The \p FileID that the \p Lexer moved to.
+  ///
+  /// \param Reason Whether the \p Lexer entered a new file or exited one.
+  ///
+  /// \param FileType The \p CharacteristicKind of the file the \p Lexer moved
+  /// to.
+  ///
+  /// \param PrevFID The \p FileID the \p Lexer was using before the change.
+  ///
+  /// \param Loc The location where the \p Lexer entered a new file from or the
+  /// location that the \p Lexer moved into after exiting a file.
+  void LexedFileChanged(FileID FID, LexedFileChangeReason Reason,
+                        SrcMgr::CharacteristicKind FileType, FileID PrevFID,
+                        SourceLocation Loc) override;
+
+  /// Callback invoked whenever an embed directive has been processed,
+  /// regardless of whether the embed will actually find a file.
+  ///
+  /// \param HashLoc The location of the '#' that starts the embed directive.
+  ///
+  /// \param FileName The name of the file being included, as written in the
+  /// source code.
+  ///
+  /// \param IsAngled Whether the file name was enclosed in angle brackets;
+  /// otherwise, it was enclosed in quotes.
+  ///
+  /// \param File The actual file that may be included by this embed directive.
+  ///
+  /// \param Params The parameters used by the directive.
+  void EmbedDirective(SourceLocation HashLoc, StringRef FileName, bool IsAngled,
+                      OptionalFileEntryRef File,
+                      const LexEmbedParametersResult &Params) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked whenever an inclusion directive of
+  /// any kind (\c \#include, \c \#import, etc.) has been processed, regardless
+  /// of whether the inclusion will actually result in an inclusion.
+  ///
+  /// \param HashLoc The location of the '#' that starts the inclusion
+  /// directive.
+  ///
+  /// \param IncludeTok The token that indicates the kind of inclusion
+  /// directive, e.g., 'include' or 'import'.
+  ///
+  /// \param FileName The name of the file being included, as written in the
+  /// source code.
+  ///
+  /// \param IsAngled Whether the file name was enclosed in angle brackets;
+  /// otherwise, it was enclosed in quotes.
+  ///
+  /// \param FilenameRange The character range of the quotes or angle brackets
+  /// for the written file name.
+  ///
+  /// \param File The actual file that may be included by this inclusion
+  /// directive.
+  ///
+  /// \param SearchPath Contains the search path which was used to find the file
+  /// in the file system. If the file was found via an absolute include path,
+  /// SearchPath will be empty. For framework includes, the SearchPath and
+  /// RelativePath will be split up. For example, if an include of "Some/Some.h"
+  /// is found via the framework path
+  /// "path/to/Frameworks/Some.framework/Headers/Some.h", SearchPath will be
+  /// "path/to/Frameworks/Some.framework/Headers" and RelativePath will be
+  /// "Some.h".
+  ///
+  /// \param RelativePath The path relative to SearchPath, at which the include
+  /// file was found. This is equal to FileName except for framework includes.
+  ///
+  /// \param SuggestedModule The module suggested for this header, if any.
+  ///
+  /// \param ModuleImported Whether this include was translated into import of
+  /// \p SuggestedModule.
+  ///
+  /// \param FileType The characteristic kind, indicates whether a file or
+  /// directory holds normal user code, system code, or system code which is
+  /// implicitly 'extern "C"' in C++ mode.
+  ///
+  void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
+                          StringRef FileName, bool IsAngled,
+                          CharSourceRange FilenameRange,
+                          OptionalFileEntryRef File, StringRef SearchPath,
+                          StringRef RelativePath, const Module *SuggestedModule,
+                          bool ModuleImported,
+                          SrcMgr::CharacteristicKind FileType) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked whenever there was an explicit module-import
+  /// syntax.
+  ///
+  /// \param ImportLoc The location of import directive token.
+  ///
+  /// \param Path The identifiers (and their locations) of the module
+  /// "path", e.g., "std.vector" would be split into "std" and "vector".
+  ///
+  /// \param Imported The imported module; can be null if importing failed.
+  ///
+  void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path,
+                    const Module *Imported) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked when the end of the main file is reached.
+  ///
+  /// No subsequent callbacks will be made.
+  void EndOfMainFile() override { setSeenNoTrivialPPDirective(); }
+
+  /// Callback invoked when we start reading any pragma directive.
+  void PragmaDirective(SourceLocation Loc,
+                       PragmaIntroducerKind Introducer) override {}
+
+  /// Called by Preprocessor::HandleMacroExpandedIdentifier when a
+  /// macro invocation is found.
+  void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override;
+
+  /// Hook called whenever a macro definition is seen.
+  void MacroDefined(const Token &MacroNameTok,
+                    const MacroDirective *MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever a macro \#undef is seen.
+  /// \param MacroNameTok The active Token
+  /// \param MD A MacroDefinition for the named macro.
+  /// \param Undef New MacroDirective if the macro was defined, null otherwise.
+  ///
+  /// MD is released immediately following this callback.
+  void MacroUndefined(const Token &MacroNameTok, const MacroDefinition &MD,
+                      const MacroDirective *Undef) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever the 'defined' operator is seen.
+  /// \param MD The MacroDirective if the name was a macro, null otherwise.
+  void Defined(const Token &MacroNameTok, const MacroDefinition &MD,
+               SourceRange Range) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#if is seen.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param ConditionValue The evaluated value of the condition.
+  ///
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void If(SourceLocation Loc, SourceRange ConditionRange,
+          ConditionValueKind ConditionValue) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elif is seen.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param ConditionValue The evaluated value of the condition.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elif(SourceLocation Loc, SourceRange ConditionRange,
+            ConditionValueKind ConditionValue, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#ifdef is seen.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Ifdef(SourceLocation Loc, const Token &MacroNameTok,
+             const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elifdef branch is taken.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Elifdef(SourceLocation Loc, const Token &MacroNameTok,
+               const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+  /// Hook called whenever an \#elifdef is skipped.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elifdef(SourceLocation Loc, SourceRange ConditionRange,
+               SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#ifndef is seen.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Ifndef(SourceLocation Loc, const Token &MacroNameTok,
+              const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elifndef branch is taken.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Elifndef(SourceLocation Loc, const Token &MacroNameTok,
+                const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+  /// Hook called whenever an \#elifndef is skipped.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elifndef(SourceLocation Loc, SourceRange ConditionRange,
+                SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#else is seen.
+  /// \param Loc the source location of the directive.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  void Else(SourceLocation Loc, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#endif is seen.
+  /// \param Loc the source location of the directive.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  void Endif(SourceLocation Loc, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4d82e20e5d4f..e90564a9739a 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -82,6 +82,7 @@ class PreprocessorLexer;
 class PreprocessorOptions;
 class ScratchBuffer;
 class TargetInfo;
+class NoTrivialPPDirectiveTracer;
 
 namespace Builtin {
 class Context;
@@ -353,6 +354,11 @@ class Preprocessor {
   /// First pp-token source location in current translation unit.
   SourceLocation FirstPPTokenLoc;
 
+  /// A preprocessor directive tracer to trace whether the preprocessing
+  /// state changed. These changes would mean most semantically observable
+  /// preprocessor state, particularly anything that is order dependent.
+  NoTrivialPPDirectiveTracer *DirTracer = nullptr;
+
   /// A position within a C++20 import-seq.
   class StdCXXImportSeq {
   public:
@@ -609,6 +615,8 @@ class Preprocessor {
       return State == NamedModuleImplementation && !getName().contains(':');
     }
 
+    bool isNotAModuleDecl() const { return State == NotAModuleDecl; }
+
     StringRef getName() const {
       assert(isNamedModule() && "Can't get name from a non named module");
       return Name;
@@ -3087,6 +3095,10 @@ class Preprocessor {
   bool setDeserializedSafeBufferOptOutMap(
       const SmallVectorImpl<SourceLocation> &SrcLocSeqs);
 
+  /// Whether we've seen pp-directives which may have changed the preprocessing
+  /// state.
+  bool hasSeenNoTrivialPPDirective() const;
+
 private:
   /// Helper functions to forward lexing to the actual lexer. They all share the
   /// same signature.
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index fc43e72593b9..d9dc5a562d80 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -86,12 +86,12 @@ class Token {
                                 // macro stringizing or charizing operator.
     CommaAfterElided = 0x200, // The comma following this token was elided (MS).
     IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
-
-    IsReinjected = 0x800,  // A phase 4 token that was produced before and
-                           // re-added, e.g. via EnterTokenStream. Annotation
-                           // tokens are *not* reinjected.
-    FirstPPToken = 0x1000, // This token is the first pp token in the
-                           // translation unit.
+    IsReinjected = 0x800,        // A phase 4 token that was produced before and
+                          // re-added, e.g. via EnterTokenStream. Annotation
+                          // tokens are *not* reinjected.
+    HasSeenNoTrivialPPDirective =
+        0x1000, // Whether we've seen any 'no-trivial' pp-directives before
+                // current position.
   };
 
   tok::TokenKind getKind() const { return Kind; }
@@ -321,8 +321,9 @@ class Token {
   /// lexer uses identifier tokens to represent placeholders.
   bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
 
-  /// Returns true if this token is the first pp-token.
-  bool isFirstPPToken() const { return getFlag(FirstPPToken); }
+  bool hasSeenNoTrivialPPDirective() const {
+    return getFlag(HasSeenNoTrivialPPDirective);
+  }
 };
 
 /// Information about the conditional stack (\#if directives)
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 9135ff949eea..a70335bef9dd 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -350,11 +350,6 @@ class Sema;
     LLVM_PREFERRED_TYPE(bool)
     unsigned BindsToRvalue : 1;
 
-    /// Whether this was an identity conversion with qualification
-    /// conversion for the implicit object argument.
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned IsImplicitObjectArgumentQualificationConversion : 1;
-
     /// Whether this binds an implicit object argument to a
     /// non-static member function without a ref-qualifier.
     LLVM_PREFERRED_TYPE(bool)
@@ -453,11 +448,11 @@ class Sema;
 #endif
         return true;
       }
+      if (!C.hasSameType(getFromType(), getToType(2)))
+        return false;
       if (BindsToRvalue && IsLvalueReference)
         return false;
-      if (IsImplicitObjectArgumentQualificationConversion)
-        return C.hasSameUnqualifiedType(getFromType(), getToType(2));
-      return C.hasSameType(getFromType(), getToType(2));
+      return true;
     }
 
     ImplicitConversionRank getRank() const;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b331acbe606b..7b0be368d67f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9836,7 +9836,7 @@ class Sema final : public SemaBase {
                                  SourceLocation ModuleLoc, ModuleDeclKind MDK,
                                  ModuleIdPath Path, ModuleIdPath Partition,
                                  ModuleImportState &ImportState,
-                                 bool IntroducerIsFirstPPToken);
+                                 bool SeenNoTrivialPPDirective);
 
   /// The parser has processed a global-module-fragment declaration that begins
   /// the definition of the global module fragment of the current module unit.
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 3aa6b3784410..25d19a3dcd48 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -456,7 +456,9 @@ CheckStructurallyEquivalentAttributes(StructuralEquivalenceContext &Context,
                                       const Decl *D1, const Decl *D2,
                                       const Decl *PrimaryDecl = nullptr) {
   // If either declaration has an attribute on it, we treat the declarations
-  // as not being structurally equivalent.
+  // as not being structurally equivalent unless both declarations are implicit
+  // (ones generated by the compiler like __NSConstantString_tag).
+  //
   // FIXME: this should be handled on a case-by-case basis via tablegen in
   // Attr.td. There are multiple cases to consider: one declaration with the
   // attribute, another without it; different attribute syntax|spellings for
@@ -468,7 +470,7 @@ CheckStructurallyEquivalentAttributes(StructuralEquivalenceContext &Context,
     D1Attr = *D1->getAttrs().begin();
   if (D2->hasAttrs())
     D2Attr = *D2->getAttrs().begin();
-  if (D1Attr || D2Attr) {
+  if ((D1Attr || D2Attr) && !D1->isImplicit() && !D2->isImplicit()) {
     const auto *DiagnoseDecl = cast<TypeDecl>(PrimaryDecl ? PrimaryDecl : D2);
     Context.Diag2(DiagnoseDecl->getLocation(),
                   diag::warn_odr_tag_type_with_attributes)
@@ -873,7 +875,29 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
     else if (T1->getTypeClass() == Type::FunctionNoProto &&
              T2->getTypeClass() == Type::FunctionProto)
       TC = Type::FunctionNoProto;
-    else
+    else if (Context.LangOpts.C23 && !Context.StrictTypeSpelling &&
+             (T1->getTypeClass() == Type::Enum ||
+              T2->getTypeClass() == Type::Enum)) {
+      // In C23, if not being strict about token equivalence, we need to handle
+      // the case where one type is an enumeration and the other type is an
+      // integral type.
+      //
+      // C23 6.7.3.3p16: The enumerated type is compatible with the underlying
+      // type of the enumeration.
+      //
+      // Treat the enumeration as its underlying type and use the builtin type
+      // class comparison.
+      if (T1->getTypeClass() == Type::Enum) {
+        T1 = T1->getAs<EnumType>()->getDecl()->getIntegerType();
+        if (!T2->isBuiltinType() || T1.isNull()) // Sanity check
+          return false;
+      } else if (T2->getTypeClass() == Type::Enum) {
+        T2 = T2->getAs<EnumType>()->getDecl()->getIntegerType();
+        if (!T1->isBuiltinType() || T2.isNull()) // Sanity check
+          return false;
+      }
+      TC = Type::Builtin;
+    } else
       return false;
   }
 
@@ -2067,6 +2091,48 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
       !CheckStructurallyEquivalentAttributes(Context, D1, D2))
     return false;
 
+  // In C23, if one enumeration has a fixed underlying type, the other shall
+  // have a compatible fixed underlying type (6.2.7).
+  if (Context.LangOpts.C23) {
+    if (D1->isFixed() != D2->isFixed()) {
+      if (Context.Complain) {
+        Context.Diag2(D2->getLocation(),
+                      Context.getApplicableDiagnostic(
+                          diag::err_odr_tag_type_inconsistent))
+            << Context.ToCtx.getTypeDeclType(D2)
+            << (&Context.FromCtx != &Context.ToCtx);
+        Context.Diag1(D1->getLocation(),
+                      D1->isFixed()
+                          ? diag::note_odr_fixed_underlying_type
+                          : diag::note_odr_missing_fixed_underlying_type)
+            << D1;
+        Context.Diag2(D2->getLocation(),
+                      D2->isFixed()
+                          ? diag::note_odr_fixed_underlying_type
+                          : diag::note_odr_missing_fixed_underlying_type)
+            << D2;
+      }
+      return false;
+    }
+    if (D1->isFixed()) {
+      assert(D2->isFixed() && "enums expected to have fixed underlying types");
+      if (!IsStructurallyEquivalent(Context, D1->getIntegerType(),
+                                    D2->getIntegerType())) {
+        if (Context.Complain) {
+          Context.Diag2(D2->getLocation(),
+                        Context.getApplicableDiagnostic(
+                            diag::err_odr_tag_type_inconsistent))
+              << Context.ToCtx.getTypeDeclType(D2)
+              << (&Context.FromCtx != &Context.ToCtx);
+          Context.Diag2(D2->getLocation(),
+                        diag::note_odr_incompatible_fixed_underlying_type)
+              << D2 << D2->getIntegerType() << D1->getIntegerType();
+        }
+        return false;
+      }
+    }
+  }
+
   llvm::SmallVector<const EnumConstantDecl *, 8> D1Enums, D2Enums;
   auto CopyEnumerators =
       [](auto &&Range, llvm::SmallVectorImpl<const EnumConstantDecl *> &Cont) {
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 419dd5dbdc69..170da16a0cdf 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -4441,7 +4441,8 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
         }
       } else if (!IsAccess) {
         return CompleteObject(LVal.getLValueBase(), nullptr, BaseType);
-      } else if (IsConstant && Info.checkingPotentialConstantExpression() &&
+      } else if ((IsConstant || BaseType->isReferenceType()) &&
+                 Info.checkingPotentialConstantExpression() &&
                  BaseType->isLiteralType(Info.Ctx) && !VD->hasDefinition()) {
         // This variable might end up being constexpr. Don't diagnose it yet.
       } else if (IsConstant) {
@@ -4478,9 +4479,11 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
     // a null BaseVal. Any constexpr-unknown variable seen here is an error:
     // we can't access a constexpr-unknown object.
     if (!BaseVal) {
-      Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1)
-          << AK << VD;
-      Info.Note(VD->getLocation(), diag::note_declared_at);
+      if (!Info.checkingPotentialConstantExpression()) {
+        Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1)
+            << AK << VD;
+        Info.Note(VD->getLocation(), diag::note_declared_at);
+      }
       return CompleteObject();
     }
   } else if (DynamicAllocLValue DA = LVal.Base.dyn_cast<DynamicAllocLValue>()) {
@@ -10929,10 +10932,6 @@ bool RecordExprEvaluator::VisitCXXConstructExpr(const CXXConstructExpr *E,
 
   bool ZeroInit = E->requiresZeroInitialization();
   if (CheckTrivialDefaultConstructor(Info, E->getExprLoc(), FD, ZeroInit)) {
-    // If we've already performed zero-initialization, we're already done.
-    if (Result.hasValue())
-      return true;
-
     if (ZeroInit)
       return ZeroInitialization(E, T);
 
@@ -14478,12 +14477,6 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     if (!EvaluatePointer(E->getRHS(), RHSValue, Info) || !LHSOK)
       return false;
 
-    // If we have Unknown pointers we should fail if they are not global values.
-    if (!(IsGlobalLValue(LHSValue.getLValueBase()) &&
-          IsGlobalLValue(RHSValue.getLValueBase())) &&
-        (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown))
-      return false;
-
     // Reject differing bases from the normal codepath; we special-case
     // comparisons to null.
     if (!HasSameBase(LHSValue, RHSValue)) {
@@ -14545,6 +14538,10 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
           (LHSValue.Base && isZeroSized(RHSValue)))
         return DiagComparison(
             diag::note_constexpr_pointer_comparison_zero_sized);
+      if (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown)
+        return DiagComparison(
+            diag::note_constexpr_pointer_comparison_unspecified);
+      // FIXME: Verify both variables are live.
       return Success(CmpResult::Unequal, E);
     }
 
@@ -14565,7 +14562,9 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     if (!LHSDesignator.Invalid && !RHSDesignator.Invalid && IsRelational) {
       bool WasArrayIndex;
       unsigned Mismatch = FindDesignatorMismatch(
-          getType(LHSValue.Base), LHSDesignator, RHSDesignator, WasArrayIndex);
+          LHSValue.Base.isNull() ? QualType()
+                                 : getType(LHSValue.Base).getNonReferenceType(),
+          LHSDesignator, RHSDesignator, WasArrayIndex);
       // At the point where the designators diverge, the comparison has a
       // specified value if:
       //  - we are comparing array indices
@@ -14609,7 +14608,7 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     // compare pointers within the object in question; otherwise, the result
     // depends on where the object is located in memory.
     if (!LHSValue.Base.isNull() && IsRelational) {
-      QualType BaseTy = getType(LHSValue.Base);
+      QualType BaseTy = getType(LHSValue.Base).getNonReferenceType();
       if (BaseTy->isIncompleteType())
         return Error(E);
       CharUnits Size = Info.Ctx.getTypeSizeInChars(BaseTy);
diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp
index 80e7c8eff671..dadb0b757a2c 100644
--- a/clang/lib/Analysis/ThreadSafety.cpp
+++ b/clang/lib/Analysis/ThreadSafety.cpp
@@ -1331,7 +1331,7 @@ void ThreadSafetyAnalyzer::addLock(FactSet &FSet,
       FSet.removeLock(FactMan, NegC);
     }
     else {
-      if (inCurrentScope(*Entry) && !Entry->asserted())
+      if (inCurrentScope(*Entry) && !Entry->asserted() && !Entry->reentrant())
         Handler.handleNegativeNotHeld(Entry->getKind(), Entry->toString(),
                                       NegC.toString(), Entry->loc());
     }
diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index af1111a86330..5d11578893c6 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -769,6 +769,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
     case llvm::Triple::FreeBSD:
       return std::make_unique<FreeBSDTargetInfo<LoongArch64TargetInfo>>(Triple,
                                                                         Opts);
+    case llvm::Triple::OpenBSD:
+      return std::make_unique<OpenBSDTargetInfo<LoongArch64TargetInfo>>(Triple,
+                                                                        Opts);
     default:
       return std::make_unique<LoongArch64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index f6915df1520b..8e29bb745734 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -461,6 +461,8 @@ LoongArchTargetInfo::parseTargetAttr(StringRef Features) const {
 
     case AttrFeatureKind::Feature:
       Ret.Features.push_back("+" + Value.str());
+      if (Value == "lasx")
+        Ret.Features.push_back("+lsx");
       break;
     }
   }
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 35501ed44ccd..e199df32f56e 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -129,7 +129,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     LongWidth = LongAlign = 32;
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
     PointerWidth = PointerAlign = 32;
-    PtrDiffType = SignedInt;
+    PtrDiffType = IntPtrType = SignedInt;
     SizeType = UnsignedInt;
     SuitableAlign = 64;
   }
@@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     IntMaxType = Int64Type;
     LongWidth = LongAlign = 64;
     PointerWidth = PointerAlign = 64;
-    PtrDiffType = SignedLong;
+    PtrDiffType = IntPtrType = SignedLong;
     SizeType = UnsignedLong;
   }
 
@@ -165,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     IntMaxType = Int64Type;
     LongWidth = LongAlign = 32;
     PointerWidth = PointerAlign = 32;
-    PtrDiffType = SignedInt;
+    PtrDiffType = IntPtrType = SignedInt;
     SizeType = UnsignedInt;
   }
 
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index 30d861a7ca60..c1a68f464e83 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -174,6 +174,9 @@ class LLVM_LIBRARY_VISIBILITY DragonFlyBSDTargetInfo
     DefineStd(Builder, "unix", Opts);
     if (this->HasFloat128)
       Builder.defineMacro("__FLOAT128__");
+
+    if (Opts.C11)
+      Builder.defineMacro("__STDC_NO_THREADS__");
   }
 
 public:
@@ -496,6 +499,7 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo<Target> {
     case llvm::Triple::sparcv9:
       this->MCountName = "_mcount";
       break;
+    case llvm::Triple::loongarch64:
     case llvm::Triple::riscv64:
       break;
     }
diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h
index a748ddaa110a..4e7f3561ac04 100644
--- a/clang/lib/CodeGen/Address.h
+++ b/clang/lib/CodeGen/Address.h
@@ -176,6 +176,11 @@ class Address {
   static Address invalid() { return Address(nullptr); }
   bool isValid() const { return Pointer.getPointer() != nullptr; }
 
+  llvm::Value *getPointerIfNotSigned() const {
+    assert(isValid() && "pointer isn't valid");
+    return !isSigned() ? Pointer.getPointer() : nullptr;
+  }
+
   /// This function is used in situations where the caller is doing some sort of
   /// opaque "laundering" of the pointer.
   void replaceBasePointer(llvm::Value *P) {
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 1aba841eb5fc..c1f5b983f723 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -188,13 +188,14 @@ static llvm::Constant *buildBlockDescriptor(CodeGenModule &CGM,
   // Optional copy/dispose helpers.
   bool hasInternalHelper = false;
   if (blockInfo.NeedsCopyDispose) {
+    auto &Schema = CGM.getCodeGenOpts().PointerAuth.BlockHelperFunctionPointers;
     // copy_func_helper_decl
     llvm::Constant *copyHelper = buildCopyHelper(CGM, blockInfo);
-    elements.add(copyHelper);
+    elements.addSignedPointer(copyHelper, Schema, GlobalDecl(), QualType());
 
     // destroy_func_decl
     llvm::Constant *disposeHelper = buildDisposeHelper(CGM, blockInfo);
-    elements.add(disposeHelper);
+    elements.addSignedPointer(disposeHelper, Schema, GlobalDecl(), QualType());
 
     if (cast<llvm::Function>(copyHelper->stripPointerCasts())
             ->hasInternalLinkage() ||
@@ -567,9 +568,8 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF,
       llvm::StructType::get(CGM.getLLVMContext(), elementTypes, true);
     info.CanBeGlobal = true;
     return;
-  }
-  else if (C.getLangOpts().ObjC &&
-           CGM.getLangOpts().getGC() == LangOptions::NonGC)
+  } else if (C.getLangOpts().ObjC &&
+             CGM.getLangOpts().getGC() == LangOptions::NonGC)
     info.HasCapturedVariableLayout = true;
 
   if (block->doesNotEscape())
@@ -783,7 +783,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr) {
 
 llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
   bool IsOpenCL = CGM.getContext().getLangOpts().OpenCL;
-  auto GenVoidPtrTy =
+  llvm::PointerType *GenVoidPtrTy =
       IsOpenCL ? CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy;
   LangAS GenVoidPtrAddr = IsOpenCL ? LangAS::opencl_generic : LangAS::Default;
   auto GenVoidPtrSize = CharUnits::fromQuantity(
@@ -817,9 +817,6 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                                    : CGM.getNSConcreteStackBlock();
     isa = blockISA;
 
-    // Build the block descriptor.
-    descriptor = buildBlockDescriptor(CGM, blockInfo);
-
     // Compute the initial on-stack block flags.
     if (!CGM.getCodeGenOpts().DisableBlockSignatureString)
       flags = BLOCK_HAS_SIGNATURE;
@@ -833,6 +830,9 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
       flags |= BLOCK_USE_STRET;
     if (blockInfo.NoEscape)
       flags |= BLOCK_IS_NOESCAPE | BLOCK_IS_GLOBAL;
+
+    // Build the block descriptor.
+    descriptor = buildBlockDescriptor(CGM, blockInfo);
   }
 
   auto projectField = [&](unsigned index, const Twine &name) -> Address {
@@ -883,11 +883,25 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
           llvm::ConstantInt::get(IntTy, blockInfo.BlockAlign.getQuantity()),
           getIntSize(), "block.align");
     }
-    addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
-    if (!IsOpenCL)
-      addHeaderField(descriptor, getPointerSize(), "block.descriptor");
-    else if (auto *Helper =
-                 CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+
+    if (!IsOpenCL) {
+      llvm::Value *blockFnPtr =
+          llvm::ConstantExpr::getBitCast(InvokeFn, VoidPtrTy);
+      QualType type = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+      addSignedHeaderField(
+          blockFnPtr,
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers,
+          GlobalDecl(), type, getPointerSize(), "block.invoke");
+
+      addSignedHeaderField(
+          descriptor, CGM.getCodeGenOpts().PointerAuth.BlockDescriptorPointers,
+          GlobalDecl(), type, getPointerSize(), "block.descriptor");
+    } else if (auto *Helper =
+                   CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
       for (auto I : Helper->getCustomFieldValues(*this, blockInfo)) {
         addHeaderField(
             I.first,
@@ -895,7 +909,8 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                 CGM.getDataLayout().getTypeAllocSize(I.first->getType())),
             I.second);
       }
-    }
+    } else
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
   }
 
   // Finally, capture all the values into the block.
@@ -1166,6 +1181,8 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
   ASTContext &Ctx = getContext();
   CallArgList Args;
 
+  llvm::Value *FuncPtr = nullptr;
+
   if (getLangOpts().OpenCL) {
     // For OpenCL, BlockPtr is already casted to generic block literal.
 
@@ -1185,7 +1202,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     if (!isa<ParmVarDecl>(E->getCalleeDecl()))
       Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
     else {
-      llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
+      FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
       Func = Builder.CreateAlignedLoad(GenericVoidPtrTy, FuncPtr,
                                        getPointerAlign());
     }
@@ -1194,7 +1211,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     BlockPtr =
         Builder.CreatePointerCast(BlockPtr, UnqualPtrTy, "block.literal");
     // Get pointer to the block invoke function
-    llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
+    FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
 
     // First argument is a block literal casted to a void pointer
     BlockPtr = Builder.CreatePointerCast(BlockPtr, VoidPtrTy);
@@ -1211,7 +1228,15 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     CGM.getTypes().arrangeBlockFunctionCall(Args, FuncTy);
 
   // Prepare the callee.
-  CGCallee Callee(CGCalleeInfo(), Func);
+  CGPointerAuthInfo PointerAuth;
+  if (auto &AuthSchema =
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers) {
+    assert(FuncPtr != nullptr && "Missing function pointer for AuthInfo");
+    PointerAuth =
+        EmitPointerAuthInfo(AuthSchema, FuncPtr, GlobalDecl(), FnType);
+  }
+
+  CGCallee Callee(CGCalleeInfo(), Func, PointerAuth);
 
   // And call the block.
   return EmitCall(FnInfo, Callee, ReturnValue, Args, CallOrInvoke);
@@ -1295,14 +1320,15 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
 
   bool IsOpenCL = CGM.getLangOpts().OpenCL;
   bool IsWindows = CGM.getTarget().getTriple().isOSWindows();
+  auto &CGOPointerAuth = CGM.getCodeGenOpts().PointerAuth;
   if (!IsOpenCL) {
     // isa
     if (IsWindows)
       fields.addNullPointer(CGM.Int8PtrPtrTy);
     else
       fields.addSignedPointer(CGM.getNSConcreteGlobalBlock(),
-                              CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers,
-                              GlobalDecl(), QualType());
+                              CGOPointerAuth.ObjCIsaPointers, GlobalDecl(),
+                              QualType());
 
     // __flags
     BlockFlags flags = BLOCK_IS_GLOBAL;
@@ -1321,11 +1347,20 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
   }
 
   // Function
-  fields.add(blockFn);
+  if (auto &Schema = CGOPointerAuth.BlockInvocationFunctionPointers) {
+    QualType FnType = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+    fields.addSignedPointer(blockFn, Schema, GlobalDecl(), FnType);
+  } else
+    fields.add(blockFn);
 
   if (!IsOpenCL) {
     // Descriptor
-    fields.add(buildBlockDescriptor(CGM, blockInfo));
+    llvm::Constant *Descriptor = buildBlockDescriptor(CGM, blockInfo);
+    fields.addSignedPointer(Descriptor, CGOPointerAuth.BlockDescriptorPointers,
+                            GlobalDecl(), QualType());
   } else if (auto *Helper =
                  CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
     for (auto *I : Helper->getCustomFieldValues(CGM, blockInfo)) {
@@ -1995,8 +2030,8 @@ CodeGenFunction::GenerateCopyHelperFunction(const CGBlockInfo &blockInfo) {
         // it. It's not quite worth the annoyance to avoid creating it in the
         // first place.
         if (!needsEHCleanup(captureType.isDestructedType()))
-          if (auto *I =
-                  cast_or_null<llvm::Instruction>(dstField.getBasePointer()))
+          if (auto *I = cast_or_null<llvm::Instruction>(
+                  dstField.getPointerIfNotSigned()))
             I->eraseFromParent();
       }
       break;
@@ -2730,8 +2765,16 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   unsigned nextHeaderIndex = 0;
   CharUnits nextHeaderOffset;
   auto storeHeaderField = [&](llvm::Value *value, CharUnits fieldSize,
-                              const Twine &name) {
+                              const Twine &name, bool isFunction = false) {
     auto fieldAddr = Builder.CreateStructGEP(addr, nextHeaderIndex, name);
+    if (isFunction) {
+      if (auto &Schema = CGM.getCodeGenOpts()
+                             .PointerAuth.BlockByrefHelperFunctionPointers) {
+        auto PointerAuth = EmitPointerAuthInfo(
+            Schema, fieldAddr.emitRawPointer(*this), GlobalDecl(), QualType());
+        value = EmitPointerAuthSign(PointerAuth, value);
+      }
+    }
     Builder.CreateStore(value, fieldAddr);
 
     nextHeaderIndex++;
@@ -2814,10 +2857,10 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   storeHeaderField(V, getIntSize(), "byref.size");
 
   if (helpers) {
-    storeHeaderField(helpers->CopyHelper, getPointerSize(),
-                     "byref.copyHelper");
+    storeHeaderField(helpers->CopyHelper, getPointerSize(), "byref.copyHelper",
+                     /*isFunction=*/true);
     storeHeaderField(helpers->DisposeHelper, getPointerSize(),
-                     "byref.disposeHelper");
+                     "byref.disposeHelper", /*isFunction=*/true);
   }
 
   if (ByRefHasLifetime && HasByrefExtendedLayout) {
diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp
index 0fc488e98aaf..117ef3d16e21 100644
--- a/clang/lib/CodeGen/CGCoroutine.cpp
+++ b/clang/lib/CodeGen/CGCoroutine.cpp
@@ -707,11 +707,15 @@ struct GetReturnObjectManager {
     Builder.CreateStore(Builder.getFalse(), GroActiveFlag);
 
     GroEmission = CGF.EmitAutoVarAlloca(*GroVarDecl);
-    auto *GroAlloca = dyn_cast_or_null<llvm::AllocaInst>(
-        GroEmission.getOriginalAllocatedAddress().getPointer());
-    assert(GroAlloca && "expected alloca to be emitted");
-    GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame,
-                           llvm::MDNode::get(CGF.CGM.getLLVMContext(), {}));
+
+    if (!GroVarDecl->isNRVOVariable()) {
+      // NRVO variables don't have allocas and won't have the same issue.
+      auto *GroAlloca = dyn_cast_or_null<llvm::AllocaInst>(
+          GroEmission.getOriginalAllocatedAddress().getPointer());
+      assert(GroAlloca && "expected alloca to be emitted");
+      GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame,
+                             llvm::MDNode::get(CGF.CGM.getLLVMContext(), {}));
+    }
 
     // Remember the top of EHStack before emitting the cleanup.
     auto old_top = CGF.EHStack.stable_begin();
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f97c7b644598..ec28bd259e8e 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -170,6 +170,10 @@ void CGDebugInfo::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction,
   if (!Group || !CGM.getCodeGenOpts().DebugKeyInstructions)
     return;
 
+  llvm::DISubprogram *SP = KeyInstruction->getFunction()->getSubprogram();
+  if (!SP || !SP->getKeyInstructionsEnabled())
+    return;
+
   addInstSourceAtomMetadata(KeyInstruction, Group, /*Rank=*/1);
 
   llvm::Instruction *BackupI =
@@ -2641,7 +2645,8 @@ StringRef CGDebugInfo::getVTableName(const CXXRecordDecl *RD) {
 // existing information in the DWARF. The type is assumed to be 'void *'.
 void CGDebugInfo::emitVTableSymbol(llvm::GlobalVariable *VTable,
                                    const CXXRecordDecl *RD) {
-  if (!CGM.getTarget().getCXXABI().isItaniumFamily())
+  if (!CGM.getTarget().getCXXABI().isItaniumFamily() ||
+      CGM.getTarget().getTriple().isOSBinFormatCOFF())
     return;
 
   ASTContext &Context = CGM.getContext();
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 359e30cb8f5c..912b1d72c7e2 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2313,7 +2313,8 @@ llvm::Value *CodeGenFunction::EmitDynamicCast(Address ThisAddr,
   bool IsExact = !IsDynamicCastToVoid &&
                  CGM.getCodeGenOpts().OptimizationLevel > 0 &&
                  DestRecordTy->getAsCXXRecordDecl()->isEffectivelyFinal() &&
-                 CGM.getCXXABI().shouldEmitExactDynamicCast(DestRecordTy);
+                 CGM.getCXXABI().shouldEmitExactDynamicCast(DestRecordTy) &&
+                 !getLangOpts().PointerAuthCalls;
 
   // C++ [expr.dynamic.cast]p4:
   //   If the value of v is a null pointer value in the pointer case, the result
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 715bd392f59f..0cc468ca3ab7 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -873,8 +873,9 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD,
 
     for (const BaseInfo &Base : Bases) {
       bool IsPrimaryBase = Layout.getPrimaryBase() == Base.Decl;
-      Build(Val.getStructBase(Base.Index), Base.Decl, IsPrimaryBase,
-            VTableClass, Offset + Base.Offset);
+      if (!Build(Val.getStructBase(Base.Index), Base.Decl, IsPrimaryBase,
+                 VTableClass, Offset + Base.Offset))
+        return false;
     }
   }
 
@@ -1620,7 +1621,7 @@ llvm::Constant *ConstantEmitter::tryEmitConstantExpr(const ConstantExpr *CE) {
   if (CE->isGLValue())
     RetType = CGM.getContext().getLValueReferenceType(RetType);
 
-  return emitAbstract(CE->getBeginLoc(), CE->getAPValueResult(), RetType);
+  return tryEmitAbstract(CE->getAPValueResult(), RetType);
 }
 
 llvm::Constant *
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index e0650067b954..1a8c6f015bda 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -846,11 +846,13 @@ void CodeGenFunction::EmitGotoStmt(const GotoStmt &S) {
   if (HaveInsertPoint())
     EmitStopPoint(&S);
 
+  ApplyAtomGroup Grp(getDebugInfo());
   EmitBranchThroughCleanup(getJumpDestForLabel(S.getLabel()));
 }
 
 
 void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) {
+  ApplyAtomGroup Grp(getDebugInfo());
   if (const LabelDecl *Target = S.getConstantTarget()) {
     EmitBranchThroughCleanup(getJumpDestForLabel(Target));
     return;
@@ -869,6 +871,8 @@ void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) {
   cast<llvm::PHINode>(IndGotoBB->begin())->addIncoming(V, CurBB);
 
   EmitBranch(IndGotoBB);
+  if (CurBB && CurBB->getTerminator())
+    addInstToCurrentSourceAtom(CurBB->getTerminator(), nullptr);
 }
 
 void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
@@ -2672,6 +2676,9 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                                          llvm::ConstantAsMetadata::get(Loc)));
   }
 
+  // Make inline-asm calls Key for the debug info feature Key Instructions.
+  CGF.addInstToNewSourceAtom(&Result, nullptr);
+
   if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
     // Conservatively, mark all inline asm blocks in CUDA or OpenCL as
     // convergent (meaning, they may call an intrinsically convergent op, such
@@ -2750,6 +2757,7 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
       }
     }
 
+    ApplyAtomGroup Grp(CGF.getDebugInfo());
     LValue Dest = ResultRegDests[i];
     // ResultTypeRequiresCast elements correspond to the first
     // ResultTypeRequiresCast.size() elements of RegResults.
@@ -2757,7 +2765,8 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
       unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Dest.getAddress().withElementType(ResultRegTypes[i]);
       if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
-        Builder.CreateStore(Tmp, A);
+        llvm::StoreInst *S = Builder.CreateStore(Tmp, A);
+        CGF.addInstToCurrentSourceAtom(S, S->getValueOperand());
         continue;
       }
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 6c32c98cec01..f5ac9f387c64 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -727,7 +727,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   };
 
   /// Header for data within LifetimeExtendedCleanupStack.
-  struct LifetimeExtendedCleanupHeader {
+  struct alignas(uint64_t) LifetimeExtendedCleanupHeader {
     /// The size of the following cleanup object.
     unsigned Size;
     /// The kind of cleanup to push.
@@ -949,7 +949,8 @@ class CodeGenFunction : public CodeGenTypeCache {
         LifetimeExtendedCleanupStack.size() + sizeof(Header) + Header.Size +
         (Header.IsConditional ? sizeof(ActiveFlag) : 0));
 
-    static_assert(sizeof(Header) % alignof(T) == 0,
+    static_assert((alignof(LifetimeExtendedCleanupHeader) == alignof(T)) &&
+                      (alignof(T) == alignof(RawAddress)),
                   "Cleanup will be allocated on misaligned address");
     char *Buffer = &LifetimeExtendedCleanupStack[OldSize];
     new (Buffer) LifetimeExtendedCleanupHeader(Header);
diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h
index ed11dc2bb05d..54f6ceaa52b9 100644
--- a/clang/lib/CodeGen/EHScopeStack.h
+++ b/clang/lib/CodeGen/EHScopeStack.h
@@ -143,7 +143,7 @@ class EHScopeStack {
   ///
   /// Cleanup implementations should generally be declared in an
   /// anonymous namespace.
-  class Cleanup {
+  class alignas(uint64_t) Cleanup {
     // Anchor the construction vtable.
     virtual void anchor();
 
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 3f9b808b2722..07a3ae925f96 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -837,17 +837,30 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
 
 void ToolChain::addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args,
                                              ArgStringList &CmdArgs) const {
-  // Default to the <driver-path>/../lib directory. This works fine on the
-  // platforms that we have tested so far. We will probably have to re-fine
-  // this in the future. In particular, on some platforms, we may need to use
-  // lib64 instead of lib.
+  auto AddLibSearchPathIfExists = [&](const Twine &Path) {
+    // The linker may warn about non-existent directories, so skip them.
+    if (!llvm::sys::fs::is_directory(Path))
+      return;
+
+    if (getTriple().isKnownWindowsMSVCEnvironment())
+      CmdArgs.push_back(Args.MakeArgString("-libpath:" + Path));
+    else
+      CmdArgs.push_back(Args.MakeArgString("-L" + Path));
+  };
+
+  // Search for flang_rt.* at the same location as clang_rt.* with
+  // LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=0. On most platforms, flang_rt is
+  // located at the path returned by getRuntimePath() which is already added to
+  // the library search path. This exception is for Apple-Darwin.
+  AddLibSearchPathIfExists(getCompilerRTPath());
+
+  // Fall back to the non-resource directory <driver-path>/../lib. We will
+  // probably have to refine this in the future. In particular, on some
+  // platforms, we may need to use lib64 instead of lib.
   SmallString<256> DefaultLibPath =
       llvm::sys::path::parent_path(getDriver().Dir);
   llvm::sys::path::append(DefaultLibPath, "lib");
-  if (getTriple().isKnownWindowsMSVCEnvironment())
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath));
-  else
-    CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
+  AddLibSearchPathIfExists(DefaultLibPath);
 }
 
 void ToolChain::addFlangRTLibPath(const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index 6bd710ec6b8b..418f9fd9ca4c 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -467,3 +467,18 @@ void aarch64::setPAuthABIInTriple(const Driver &D, const ArgList &Args,
     break;
   }
 }
+
+/// Is the triple {aarch64,aarch64_be}-none-elf?
+bool aarch64::isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.h b/clang/lib/Driver/ToolChains/Arch/AArch64.h
index 2057272867a1..2765ee8c3a6a 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.h
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.h
@@ -30,6 +30,7 @@ std::string getAArch64TargetCPU(const llvm::opt::ArgList &Args,
 
 void setPAuthABIInTriple(const Driver &D, const llvm::opt::ArgList &Args,
                          llvm::Triple &triple);
+bool isAArch64BareMetal(const llvm::Triple &Triple);
 
 } // end namespace aarch64
 } // end namespace target
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 9595ee8383c8..94a94f1e9c48 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -23,7 +23,9 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
   if (Triple.getArch() == llvm::Triple::sparcv9) {
     const char *DefV9CPU;
 
-    if (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD())
+    if (Triple.isOSSolaris())
+      DefV9CPU = "-Av9b";
+    else if (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD())
       DefV9CPU = "-Av9a";
     else
       DefV9CPU = "-Av9";
@@ -35,6 +37,13 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
         .Case("niagara4", "-Av9d")
         .Default(DefV9CPU);
   } else {
+    const char *DefV8CPU;
+
+    if (Triple.isOSSolaris())
+      DefV8CPU = "-Av8plus";
+    else
+      DefV8CPU = "-Av8";
+
     return llvm::StringSwitch<const char *>(Name)
         .Case("v8", "-Av8")
         .Case("supersparc", "-Av8")
@@ -70,7 +79,7 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
         .Case("gr712rc", "-Aleon")
         .Case("leon4", "-Aleon")
         .Case("gr740", "-Aleon")
-        .Default("-Av8");
+        .Default(DefV8CPU);
   }
 }
 
@@ -130,7 +139,8 @@ std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
   return "";
 }
 
-void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
+void sparc::getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple,
+                                   const ArgList &Args,
                                    std::vector<StringRef> &Features) {
   sparc::FloatABI FloatABI = sparc::getSparcFloatABI(D, Args);
   if (FloatABI == sparc::FloatABI::Soft)
@@ -150,11 +160,22 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
       Features.push_back("-popc");
   }
 
+  // Some OSes enable extra features by default when no explicit flag is given:
+  // VIS on 64-bit Linux/FreeBSD/OpenBSD, VIS2 on Solaris, and V8+ on 32-bit
+  // Solaris. See also sparc::getSparcAsmModeForCPU() for external assemblers.
+  bool IsSparcV9ATarget =
+      (Triple.getArch() == llvm::Triple::sparcv9) &&
+      (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD());
+  bool IsSparcV9BTarget = Triple.isOSSolaris();
+  bool IsSparcV8PlusTarget =
+      Triple.getArch() == llvm::Triple::sparc && Triple.isOSSolaris();
   if (Arg *A = Args.getLastArg(options::OPT_mvis, options::OPT_mno_vis)) {
     if (A->getOption().matches(options::OPT_mvis))
       Features.push_back("+vis");
     else
       Features.push_back("-vis");
+  } else if (IsSparcV9ATarget) {
+    Features.push_back("+vis");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_mvis2, options::OPT_mno_vis2)) {
@@ -162,6 +183,8 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
       Features.push_back("+vis2");
     else
       Features.push_back("-vis2");
+  } else if (IsSparcV9BTarget) {
+    Features.push_back("+vis2");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_mvis3, options::OPT_mno_vis3)) {
@@ -182,6 +205,8 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
   if (Arg *A = Args.getLastArg(options::OPT_mv8plus, options::OPT_mno_v8plus)) {
     if (A->getOption().matches(options::OPT_mv8plus))
       Features.push_back("+v8plus");
+  } else if (IsSparcV8PlusTarget) {
+    Features.push_back("+v8plus");
   }
 
   if (Args.hasArg(options::OPT_ffixed_g1))
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.h b/clang/lib/Driver/ToolChains/Arch/Sparc.h
index 2b178d9df1ee..fa25b4992cc8 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.h
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.h
@@ -31,7 +31,8 @@ FloatABI getSparcFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
 std::string getSparcTargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
                               const llvm::Triple &Triple);
 
-void getSparcTargetFeatures(const Driver &D, const llvm::opt::ArgList &Args,
+void getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple,
+                            const llvm::opt::ArgList &Args,
                             std::vector<llvm::StringRef> &Features);
 const char *getSparcAsmModeForCPU(llvm::StringRef Name,
                                   const llvm::Triple &Triple);
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index e670696cd59a..207150ea6f32 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -12,6 +12,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/InputInfo.h"
 
+#include "Arch/AArch64.h"
 #include "Arch/ARM.h"
 #include "Arch/RISCV.h"
 #include "clang/Driver/Compilation.h"
@@ -31,21 +32,6 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
 static bool isRISCVBareMetal(const llvm::Triple &Triple) {
   if (!Triple.isRISCV())
     return false;
@@ -363,8 +349,9 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
 }
 
 bool BareMetal::handlesTarget(const llvm::Triple &Triple) {
-  return arm::isARMEABIBareMetal(Triple) || isAArch64BareMetal(Triple) ||
-         isRISCVBareMetal(Triple) || isPPCBareMetal(Triple);
+  return arm::isARMEABIBareMetal(Triple) ||
+         aarch64::isAArch64BareMetal(Triple) || isRISCVBareMetal(Triple) ||
+         isPPCBareMetal(Triple);
 }
 
 Tool *BareMetal::buildLinker() const {
@@ -694,9 +681,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       NeedCRTs)
     CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
 
-  if (TC.getTriple().isRISCV())
-    CmdArgs.push_back("-X");
-
   // The R_ARM_TARGET2 relocation must be treated as R_ARM_REL32 on arm*-*-elf
   // and arm*-*-eabi (the default is R_ARM_GOT_PREL, used on arm*-*-linux and
   // arm*-*-*bsd).
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 456bfe885f35..62613322320c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1752,7 +1752,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
                     options::OPT_fno_ptrauth_objc_interface_sel);
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_class_ro,
                     options::OPT_fno_ptrauth_objc_class_ro);
-
   if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
     handlePAuthABI(Args, CmdArgs);
 
@@ -2731,16 +2730,6 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
     CmdArgs.push_back(MipsTargetFeature);
   }
 
-  // Those OSes default to enabling VIS on 64-bit SPARC.
-  // See also the corresponding code for external assemblers in
-  // sparc::getSparcAsmModeForCPU().
-  bool IsSparcV9ATarget =
-      (C.getDefaultToolChain().getArch() == llvm::Triple::sparcv9) &&
-      (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD());
-  if (IsSparcV9ATarget && SparcTargetFeatures.empty()) {
-    CmdArgs.push_back("-target-feature");
-    CmdArgs.push_back("+vis");
-  }
   for (const char *Feature : SparcTargetFeatures) {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back(Feature);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 097d186ad8ea..8d3775de9be5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -547,15 +547,22 @@ const char *tools::getLDMOption(const llvm::Triple &T, const ArgList &Args) {
   case llvm::Triple::aarch64:
     if (T.isOSManagarm())
       return "aarch64managarm";
+    else if (aarch64::isAArch64BareMetal(T))
+      return "aarch64elf";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
+    if (aarch64::isAArch64BareMetal(T))
+      return "aarch64elfb";
     return "aarch64linuxb";
   case llvm::Triple::arm:
   case llvm::Triple::thumb:
   case llvm::Triple::armeb:
-  case llvm::Triple::thumbeb:
-    return tools::arm::isARMBigEndian(T, Args) ? "armelfb_linux_eabi"
-                                               : "armelf_linux_eabi";
+  case llvm::Triple::thumbeb: {
+    bool IsBigEndian = tools::arm::isARMBigEndian(T, Args);
+    if (arm::isARMEABIBareMetal(T))
+      return IsBigEndian ? "armelfb" : "armelf";
+    return IsBigEndian ? "armelfb_linux_eabi" : "armelf_linux_eabi";
+  }
   case llvm::Triple::m68k:
     return "m68kelf";
   case llvm::Triple::ppc:
@@ -856,7 +863,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
   case llvm::Triple::sparcv9:
-    sparc::getSparcTargetFeatures(D, Args, Features);
+    sparc::getSparcTargetFeatures(D, Triple, Args, Features);
     break;
   case llvm::Triple::r600:
   case llvm::Triple::amdgcn:
@@ -1320,6 +1327,17 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (Args.hasArg(options::OPT_ftime_report))
     CmdArgs.push_back(
         Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes"));
+
+  if (Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) {
+    CmdArgs.push_back(
+        Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue())));
+    CmdArgs.push_back(
+        Args.MakeArgString("--thinlto-remote-compiler=" +
+                           Twine(ToolChain.getDriver().getClangProgramPath())));
+
+    for (auto A : Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ))
+      CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A));
+  }
 }
 
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
@@ -3247,14 +3265,8 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args,
 
 void tools::handleInterchangeLoopsArgs(const ArgList &Args,
                                        ArgStringList &CmdArgs) {
-  // FIXME: instead of relying on shouldEnableVectorizerAtOLevel, we may want to
-  // implement a separate function to infer loop interchange from opt level.
-  // For now, enable loop-interchange at the same opt levels as loop-vectorize.
-  bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false);
-  OptSpecifier InterchangeAliasOption =
-      EnableInterchange ? options::OPT_O_Group : options::OPT_floop_interchange;
-  if (Args.hasFlag(options::OPT_floop_interchange, InterchangeAliasOption,
-                   options::OPT_fno_loop_interchange, EnableInterchange))
+  if (Args.hasFlag(options::OPT_floop_interchange,
+                   options::OPT_fno_loop_interchange, false))
     CmdArgs.push_back("-floop-interchange");
 }
 
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index e5075cbcaf66..234683f2f488 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -3187,28 +3187,46 @@ void MachO::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
 
   ToolChain::addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadKind);
 
-  // On arm64e, enable pointer authentication (for the return address and
-  // indirect calls), as well as usage of the intrinsics.
-  if (getArchName() == "arm64e") {
+  // On arm64e, we enable all the features required for the Darwin userspace
+  // ABI.
+  if (getTriple().isArm64e()) {
+    // Core platform ABI
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
+                           options::OPT_fno_ptrauth_calls))
+      CC1Args.push_back("-fptrauth-calls");
     if (!DriverArgs.hasArg(options::OPT_fptrauth_returns,
                            options::OPT_fno_ptrauth_returns))
       CC1Args.push_back("-fptrauth-returns");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics,
                            options::OPT_fno_ptrauth_intrinsics))
       CC1Args.push_back("-fptrauth-intrinsics");
-
-    if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
-                           options::OPT_fno_ptrauth_calls))
-      CC1Args.push_back("-fptrauth-calls");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos,
                            options::OPT_fno_ptrauth_indirect_gotos))
       CC1Args.push_back("-fptrauth-indirect-gotos");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps,
                            options::OPT_fno_ptrauth_auth_traps))
       CC1Args.push_back("-fptrauth-auth-traps");
+
+    // C++ v-table ABI
+    if (!DriverArgs.hasArg(
+            options::OPT_fptrauth_vtable_pointer_address_discrimination,
+            options::OPT_fno_ptrauth_vtable_pointer_address_discrimination))
+      CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination");
+    if (!DriverArgs.hasArg(
+            options::OPT_fptrauth_vtable_pointer_type_discrimination,
+            options::OPT_fno_ptrauth_vtable_pointer_type_discrimination))
+      CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination");
+
+    // Objective-C ABI
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_isa,
+                           options::OPT_fno_ptrauth_objc_isa))
+      CC1Args.push_back("-fptrauth-objc-isa");
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_class_ro,
+                           options::OPT_fno_ptrauth_objc_class_ro))
+      CC1Args.push_back("-fptrauth-objc-class-ro");
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_interface_sel,
+                           options::OPT_fno_ptrauth_objc_interface_sel))
+      CC1Args.push_back("-fptrauth-objc-interface-sel");
   }
 }
 
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1edb83f7255e..7ab41e9b85a0 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -447,6 +447,7 @@ void Flang::addTargetOptions(const ArgList &Args,
   // Add the target features.
   switch (TC.getArch()) {
   default:
+    getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
     break;
   case llvm::Triple::aarch64:
     getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index b2e36ae6f97c..4894e6a52e93 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -85,11 +85,18 @@ void tools::MinGW::Linker::AddLibGCC(const ArgList &Args,
 
   CmdArgs.push_back("-lmoldname");
   CmdArgs.push_back("-lmingwex");
-  for (auto Lib : Args.getAllArgValues(options::OPT_l))
+  for (auto Lib : Args.getAllArgValues(options::OPT_l)) {
     if (StringRef(Lib).starts_with("msvcr") ||
         StringRef(Lib).starts_with("ucrt") ||
-        StringRef(Lib).starts_with("crtdll"))
+        StringRef(Lib).starts_with("crtdll")) {
+      std::string CRTLib = (llvm::Twine("-l") + Lib).str();
+      // Respect the user's chosen crt variant, but still provide it
+      // again as the last linker argument, because some of the libraries
+      // we added above may depend on it.
+      CmdArgs.push_back(Args.MakeArgStringRef(CRTLib));
       return;
+    }
+  }
   CmdArgs.push_back("-lmsvcrt");
 }
 
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 79b1b6960da1..8f589186af34 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -161,7 +161,7 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (Nopie || Profiling)
     CmdArgs.push_back("-nopie");
 
-  if (Triple.isRISCV64()) {
+  if (Triple.isLoongArch64() || Triple.isRISCV64()) {
     CmdArgs.push_back("-X");
     if (Args.hasArg(options::OPT_mno_relax))
       CmdArgs.push_back("--no-relax");
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 4010f7fbd25b..099994695dec 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -628,9 +628,16 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
       // name.
       !Style.isJavaScript() && Previous.isNot(tok::kw_template) &&
       CurrentState.BreakBeforeParameter) {
-    for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous)
-      if (Tok->FirstAfterPPLine || Tok->is(TT_LineComment))
+    for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous) {
+      if (Tok->is(TT_LineComment))
         return false;
+      if (Tok->is(TT_TemplateCloser)) {
+        Tok = Tok->MatchingParen;
+        assert(Tok);
+      }
+      if (Tok->FirstAfterPPLine)
+        return false;
+    }
 
     return true;
   }
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 78c09be458f0..513fcfcd4125 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -727,6 +727,7 @@ template <> struct MappingTraits<FormatStyle::SpaceBeforeParensCustom> {
     IO.mapOptional("AfterFunctionDeclarationName",
                    Spacing.AfterFunctionDeclarationName);
     IO.mapOptional("AfterIfMacros", Spacing.AfterIfMacros);
+    IO.mapOptional("AfterNot", Spacing.AfterNot);
     IO.mapOptional("AfterOverloadedOperator", Spacing.AfterOverloadedOperator);
     IO.mapOptional("AfterPlacementOperator", Spacing.AfterPlacementOperator);
     IO.mapOptional("AfterRequiresInClause", Spacing.AfterRequiresInClause);
@@ -1748,7 +1749,6 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   GoogleStyle.AttributeMacros.push_back("absl_nullable");
   GoogleStyle.AttributeMacros.push_back("absl_nullability_unknown");
   GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
-  GoogleStyle.DerivePointerAlignment = true;
   GoogleStyle.IncludeStyle.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   GoogleStyle.IncludeStyle.IncludeCategories = {{"^<ext/.*\\.h>", 2, 0, false},
                                                 {"^<.*\\.h>", 1, 0, false},
@@ -1857,6 +1857,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   } else if (Language == FormatStyle::LK_ObjC) {
     GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
     GoogleStyle.ColumnLimit = 100;
+    GoogleStyle.DerivePointerAlignment = true;
     // "Regroup" doesn't work well for ObjC yet (main header heuristic,
     // relationship between ObjC standard library headers and other heades,
     // #imports, etc.)
diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
index 87823ae32b11..aa752f5e3148 100644
--- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
+++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
@@ -45,15 +45,18 @@ std::pair<tooling::Replacements, unsigned>
 IntegerLiteralSeparatorFixer::process(const Environment &Env,
                                       const FormatStyle &Style) {
   switch (Style.Language) {
-  case FormatStyle::LK_Cpp:
-  case FormatStyle::LK_ObjC:
-    Separator = '\'';
-    break;
   case FormatStyle::LK_CSharp:
   case FormatStyle::LK_Java:
   case FormatStyle::LK_JavaScript:
     Separator = '_';
     break;
+  case FormatStyle::LK_Cpp:
+  case FormatStyle::LK_ObjC:
+    if (Style.Standard >= FormatStyle::LS_Cpp14) {
+      Separator = '\'';
+      break;
+    }
+    [[fallthrough]];
   default:
     return {};
   }
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 739209a5681f..cbeb5ef7e4bf 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2590,6 +2590,9 @@ class AnnotatingParser {
     if (!Tok.Previous || Tok.isNot(tok::identifier) || Tok.is(TT_ClassHeadName))
       return false;
 
+    if (Tok.endsSequence(Keywords.kw_final, TT_ClassHeadName))
+      return false;
+
     if ((Style.isJavaScript() || Style.isJava()) && Tok.is(Keywords.kw_extends))
       return false;
 
@@ -2996,14 +2999,18 @@ class AnnotatingParser {
     const FormatToken *PrevToken = Tok.getPreviousNonComment();
     if (!PrevToken)
       return TT_UnaryOperator;
-    if (PrevToken->is(TT_TypeName))
+    if (PrevToken->isTypeName(LangOpts))
       return TT_PointerOrReference;
     if (PrevToken->isPlacementOperator() && Tok.is(tok::ampamp))
       return TT_BinaryOperator;
 
-    const FormatToken *NextToken = Tok.getNextNonComment();
+    auto *NextToken = Tok.getNextNonComment();
     if (!NextToken)
       return TT_PointerOrReference;
+    if (NextToken->is(tok::greater)) {
+      NextToken->setFinalizedType(TT_TemplateCloser);
+      return TT_PointerOrReference;
+    }
 
     if (InTemplateArgument && NextToken->is(tok::kw_noexcept))
       return TT_BinaryOperator;
@@ -3112,7 +3119,7 @@ class AnnotatingParser {
 
     // It's more likely that & represents operator& than an uninitialized
     // reference.
-    if (Tok.is(tok::amp) && PrevToken && PrevToken->Tok.isAnyIdentifier() &&
+    if (Tok.is(tok::amp) && PrevToken->Tok.isAnyIdentifier() &&
         IsChainedOperatorAmpOrMember(PrevToken->getPreviousNonComment()) &&
         NextToken && NextToken->Tok.isAnyIdentifier()) {
       if (auto NextNext = NextToken->getNextNonComment();
@@ -5474,7 +5481,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     if (Left.TokenText == "!")
       return Style.SpaceAfterLogicalNot;
     assert(Left.TokenText == "not");
-    return Right.isOneOf(tok::coloncolon, TT_UnaryOperator);
+    return Right.isOneOf(tok::coloncolon, TT_UnaryOperator) ||
+           (Right.is(tok::l_paren) && Style.SpaceBeforeParensOptions.AfterNot);
   }
 
   // If the next token is a binary operator or a selector name, we have
@@ -6266,7 +6274,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
   }
 
   if (Right.is(tok::colon) &&
-      !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon)) {
+      !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon,
+                     TT_BitFieldColon)) {
     return false;
   }
   if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr)) {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 08cf0ae6b2c2..08f3b7a7fcc4 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1542,6 +1542,17 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
           Discrimination::Constant, InitFiniPointerConstantDiscriminator);
     }
 
+    Opts.BlockInvocationFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockByrefHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    if (LangOpts.PointerAuthBlockDescriptorPointers)
+      Opts.BlockDescriptorPointers =
+          PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
+                            BlockDescriptorConstantDiscriminator);
+
     Opts.ObjCMethodListFunctionPointers =
         PointerAuthSchema(Key::ASIA, true, Discrimination::None);
     Opts.ObjCMethodListPointer =
@@ -3598,6 +3609,8 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fptrauth_objc_interface_sel);
   if (Opts.PointerAuthObjcClassROPointers)
     GenerateArg(Consumer, OPT_fptrauth_objc_class_ro);
+  if (Opts.PointerAuthBlockDescriptorPointers)
+    GenerateArg(Consumer, OPT_fptrauth_block_descriptor_pointers);
 }
 
 static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
@@ -3621,7 +3634,8 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got);
   Opts.AArch64JumpTableHardening =
       Args.hasArg(OPT_faarch64_jump_table_hardening);
-
+  Opts.PointerAuthBlockDescriptorPointers =
+      Args.hasArg(OPT_fptrauth_block_descriptor_pointers);
   Opts.PointerAuthObjcIsa = Args.hasArg(OPT_fptrauth_objc_isa);
   Opts.PointerAuthObjcClassROPointers = Args.hasArg(OPT_fptrauth_objc_class_ro);
   Opts.PointerAuthObjcInterfaceSel =
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 34fb825e9d42..cce8392950b0 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1535,6 +1535,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
 #undef TARGET_OS
   }
 
+  if (LangOpts.PointerAuthIntrinsics)
+    Builder.defineMacro("__PTRAUTH__");
+
   // Get other target #defines.
   TI.getTargetDefines(LangOpts, Builder);
 }
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index 7e614f7740bf..9d96e36c74ca 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -197,7 +197,7 @@ _mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -218,7 +218,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -239,7 +239,7 @@ _mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -260,7 +260,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -281,7 +281,7 @@ _mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -302,7 +302,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
index 992be18f7720..d5a66cfef536 100644
--- a/clang/lib/Headers/avx10_2niintrin.h
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -253,7 +253,7 @@ _mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -266,7 +266,7 @@ _mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -279,7 +279,7 @@ _mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -292,7 +292,7 @@ _mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -305,7 +305,7 @@ _mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -318,7 +318,7 @@ _mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -331,7 +331,7 @@ _mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -344,7 +344,7 @@ _mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -357,7 +357,7 @@ _mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -370,7 +370,7 @@ _mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -383,7 +383,7 @@ _mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -396,7 +396,7 @@ _mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index 2b7f5043e09e..6206a347852b 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -697,7 +697,16 @@ template <typename _Tp> struct __remove_address_space<__constant _Tp> {
 #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
 
-int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
+#ifdef __OPENCL_CPP_VERSION__
+#define CLINKAGE extern "C"
+#else
+#define CLINKAGE
+#endif
+
+CLINKAGE int printf(__constant const char *st, ...)
+    __attribute__((format(printf, 1, 2)));
+
+#undef CLINKAGE
 #endif
 
 #ifdef cl_intel_device_side_avc_motion_estimation
diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h
index 7f7d387cbdfd..f902ca1e3bbd 100644
--- a/clang/lib/Headers/ptrauth.h
+++ b/clang/lib/Headers/ptrauth.h
@@ -95,7 +95,7 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
    __ptrauth qualifier; the compiler will perform this check
    automatically. */
 
-#if __has_feature(ptrauth_intrinsics)
+#if __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__)
 
 /* Strip the signature from a value without authenticating it.
 
@@ -388,6 +388,6 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
 #define __ptrauth_objc_isa_uintptr
 #define __ptrauth_objc_super_pointer
 
-#endif /* __has_feature(ptrauth_intrinsics) */
+#endif /* __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__) */
 
 #endif /* __PTRAUTH_H */
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 1f695b4a8676..b282a600c0e5 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -174,8 +174,6 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
   ExtendedTokenMode = 0;
 
   NewLinePtr = nullptr;
-
-  IsFirstPPToken = true;
 }
 
 /// Lexer constructor - Create a new lexer object for the specified buffer
@@ -3225,7 +3223,6 @@ std::optional<Token> Lexer::peekNextPPToken() {
   bool atStartOfLine = IsAtStartOfLine;
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   bool leadingSpace = HasLeadingSpace;
-  bool isFirstPPToken = IsFirstPPToken;
 
   Token Tok;
   Lex(Tok);
@@ -3236,7 +3233,6 @@ std::optional<Token> Lexer::peekNextPPToken() {
   HasLeadingSpace = leadingSpace;
   IsAtStartOfLine = atStartOfLine;
   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
-  IsFirstPPToken = isFirstPPToken;
   // Restore the lexer back to non-skipping mode.
   LexingRawMode = false;
 
@@ -3726,11 +3722,6 @@ bool Lexer::Lex(Token &Result) {
     HasLeadingEmptyMacro = false;
   }
 
-  if (IsFirstPPToken) {
-    Result.setFlag(Token::FirstPPToken);
-    IsFirstPPToken = false;
-  }
-
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   IsAtPhysicalStartOfLine = false;
   bool isRawLex = isLexingRawMode();
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index a62508e3e27b..5b08d7f0efe5 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1467,7 +1467,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
   if (s != PossibleNewDigitStart)
     DigitsBegin = PossibleNewDigitStart;
   else
-    IsSingleZero = (s == ThisTokEnd); // Is the only thing we've seen a 0?
+    IsSingleZero = (s == ThisTokBegin + 1);
 
   if (s == ThisTokEnd)
     return; // Done, simple octal number like 01234
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index bcd3ea60ce3d..2120e45dd8f8 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -43,6 +43,7 @@
 #include "clang/Lex/MacroArgs.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
+#include "clang/Lex/NoTrivialPPDirectiveTracer.h"
 #include "clang/Lex/Pragma.h"
 #include "clang/Lex/PreprocessingRecord.h"
 #include "clang/Lex/PreprocessorLexer.h"
@@ -247,8 +248,6 @@ void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
     llvm::errs() << " [LeadingSpace]";
   if (Tok.isExpandDisabled())
     llvm::errs() << " [ExpandDisabled]";
-  if (Tok.isFirstPPToken())
-    llvm::errs() << " [First pp-token]";
   if (Tok.needsCleaning()) {
     const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
     llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
@@ -577,8 +576,11 @@ void Preprocessor::EnterMainSourceFile() {
     // export module M; // error: module declaration must occur
     //                  //        at the start of the translation unit.
     if (getLangOpts().CPlusPlusModules) {
+      auto Tracer = std::make_unique<NoTrivialPPDirectiveTracer>(*this);
+      DirTracer = Tracer.get();
+      addPPCallbacks(std::move(Tracer));
       std::optional<Token> FirstPPTok = CurLexer->peekNextPPToken();
-      if (FirstPPTok && FirstPPTok->isFirstPPToken())
+      if (FirstPPTok)
         FirstPPTokenLoc = FirstPPTok->getLocation();
     }
   }
@@ -940,6 +942,8 @@ void Preprocessor::Lex(Token &Result) {
       StdCXXImportSeqState.handleHeaderName();
       break;
     case tok::kw_export:
+      if (hasSeenNoTrivialPPDirective())
+        Result.setFlag(Token::HasSeenNoTrivialPPDirective);
       TrackGMFState.handleExport();
       StdCXXImportSeqState.handleExport();
       ModuleDeclState.handleExport();
@@ -966,6 +970,8 @@ void Preprocessor::Lex(Token &Result) {
           }
           break;
         } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) {
+          if (hasSeenNoTrivialPPDirective())
+            Result.setFlag(Token::HasSeenNoTrivialPPDirective);
           TrackGMFState.handleModule(StdCXXImportSeqState.afterTopLevelSeq());
           ModuleDeclState.handleModule();
           break;
@@ -1680,3 +1686,31 @@ const char *Preprocessor::getCheckPoint(FileID FID, const char *Start) const {
 
   return nullptr;
 }
+
+bool Preprocessor::hasSeenNoTrivialPPDirective() const {
+  return DirTracer && DirTracer->hasSeenNoTrivialPPDirective();
+}
+
+bool NoTrivialPPDirectiveTracer::hasSeenNoTrivialPPDirective() const {
+  return SeenNoTrivialPPDirective;
+}
+
+void NoTrivialPPDirectiveTracer::setSeenNoTrivialPPDirective() {
+  if (InMainFile && !SeenNoTrivialPPDirective)
+    SeenNoTrivialPPDirective = true;
+}
+
+void NoTrivialPPDirectiveTracer::LexedFileChanged(
+    FileID FID, LexedFileChangeReason Reason,
+    SrcMgr::CharacteristicKind FileType, FileID PrevFID, SourceLocation Loc) {
+  InMainFile = (FID == PP.getSourceManager().getMainFileID());
+}
+
+void NoTrivialPPDirectiveTracer::MacroExpands(const Token &MacroNameTok,
+                                              const MacroDefinition &MD,
+                                              SourceRange Range,
+                                              const MacroArgs *Args) {
+  // FIXME: Does allowing only builtin macro expansions make sense?
+  if (!MD.getMacroInfo()->isBuiltinMacro())
+    setSeenNoTrivialPPDirective();
+}
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index bc238a9517a3..3515343202de 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -3342,7 +3342,8 @@ ExprResult Parser::ParseBlockLiteralExpression() {
     Actions.ActOnBlockError(CaretLoc, getCurScope());
     return ExprError();
   }
-
+  EnterExpressionEvaluationContextForFunction PotentiallyEvaluated(
+       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
   StmtResult Stmt(ParseCompoundStatementBody());
   BlockScope.Exit();
   if (!Stmt.isInvalid())
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 8834bf80c401..447d0283431b 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -2361,9 +2361,10 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   // Parse a global-module-fragment, if present.
   if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) {
     SourceLocation SemiLoc = ConsumeToken();
-    if (!Introducer.isFirstPPToken()) {
+    if (ImportState != Sema::ModuleImportState::FirstDecl ||
+        Introducer.hasSeenNoTrivialPPDirective()) {
       Diag(StartLoc, diag::err_global_module_introducer_not_at_start)
-        << SourceRange(StartLoc, SemiLoc);
+          << SourceRange(StartLoc, SemiLoc);
       return nullptr;
     }
     if (MDK == Sema::ModuleDeclKind::Interface) {
@@ -2418,7 +2419,8 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   ExpectAndConsumeSemi(diag::err_module_expected_semi);
 
   return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, Partition,
-                                 ImportState, Introducer.isFirstPPToken());
+                                 ImportState,
+                                 Introducer.hasSeenNoTrivialPPDirective());
 }
 
 Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 5e75c64eb2b9..ec39bca6039f 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -503,8 +503,12 @@ static bool areAllValuesNoReturn(const VarDecl *VD, const CFGBlock &VarBlk,
 
   TransferFunctions TF(VD);
   BackwardDataflowWorklist Worklist(*AC.getCFG(), AC);
+  llvm::DenseSet<const CFGBlock *> Visited;
   Worklist.enqueueBlock(&VarBlk);
   while (const CFGBlock *B = Worklist.dequeue()) {
+    if (Visited.contains(B))
+      continue;
+    Visited.insert(B);
     // First check the current block.
     for (CFGBlock::const_reverse_iterator ri = B->rbegin(), re = B->rend();
          ri != re; ++ri) {
@@ -2887,8 +2891,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
       .setAlwaysAdd(Stmt::UnaryOperatorClass);
   }
 
-  bool EnableLifetimeSafetyAnalysis = !Diags.isIgnored(
-      diag::warn_experimental_lifetime_safety_dummy_warning, D->getBeginLoc());
+  bool EnableLifetimeSafetyAnalysis = S.getLangOpts().EnableLifetimeSafety;
   // Install the logical handler.
   std::optional<LogicalErrorHandler> LEH;
   if (LogicalErrorHandler::hasActiveDiagnostics(Diags, D->getBeginLoc())) {
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 834417f8e15a..20567a6d9d1a 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1097,10 +1097,6 @@ static bool CheckFunctionConstraintsWithoutInstantiation(
   }
 
   Sema::ContextRAII SavedContext(SemaRef, FD);
-  std::optional<Sema::CXXThisScopeRAII> ThisScope;
-  if (auto *Method = dyn_cast<CXXMethodDecl>(FD))
-    ThisScope.emplace(SemaRef, /*Record=*/Method->getParent(),
-                      /*ThisQuals=*/Method->getMethodQualifiers());
   return SemaRef.CheckConstraintSatisfaction(
       Template, TemplateAC, MLTAL, PointOfInstantiation, Satisfaction);
 }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 14403e65e8f4..bb412ef6788e 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3267,6 +3267,14 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old,
     if (isa<UsedAttr>(I) || isa<RetainAttr>(I))
       continue;
 
+    if (isa<InferredNoReturnAttr>(I)) {
+      if (auto *FD = dyn_cast<FunctionDecl>(New)) {
+        if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+          continue; // Don't propagate inferred noreturn attributes to explicit
+                    // specializations.
+      }
+    }
+
     if (mergeDeclAttribute(*this, New, I, LocalAMK))
       foundAny = true;
   }
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 5f481ed1f713..a7897bdfe6e0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1970,6 +1970,13 @@ void clang::inferNoReturnAttr(Sema &S, const Decl *D) {
   if (!FD)
     return;
 
+  // Skip explicit specializations here as they may have
+  // a user-provided definition that may deliberately differ from the primary
+  // template. If an explicit specialization truly never returns, the user
+  // should explicitly mark it with [[noreturn]].
+  if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+    return;
+
   auto *NonConstFD = const_cast<FunctionDecl *>(FD);
   DiagnosticsEngine &Diags = S.getDiagnostics();
   if (Diags.isIgnored(diag::warn_falloff_nonvoid, FD->getLocation()) &&
@@ -5011,10 +5018,10 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
 
 static void handleNonStringAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   // This only applies to fields and variable declarations which have an array
-  // type.
+  // type or pointer type, with character elements.
   QualType QT = cast<ValueDecl>(D)->getType();
-  if (!QT->isArrayType() ||
-      !QT->getBaseElementTypeUnsafe()->isAnyCharacterType()) {
+  if ((!QT->isArrayType() && !QT->isPointerType()) ||
+      !QT->getPointeeOrArrayElementType()->isAnyCharacterType()) {
     S.Diag(D->getBeginLoc(), diag::warn_attribute_non_character_array)
         << AL << AL.isRegularKeywordAttribute() << QT << AL.getRange();
     return;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 728ada33e2e6..5421e9562c8b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9309,14 +9309,14 @@ AssignConvertType Sema::CheckAssignmentConstraints(QualType LHSType,
   // If we have an atomic type, try a non-atomic assignment, then just add an
   // atomic qualification step.
   if (const AtomicType *AtomicTy = dyn_cast<AtomicType>(LHSType)) {
-    AssignConvertType result =
+    AssignConvertType Result =
         CheckAssignmentConstraints(AtomicTy->getValueType(), RHS, Kind);
-    if (result != AssignConvertType::Compatible)
-      return result;
+    if (!IsAssignConvertCompatible(Result))
+      return Result;
     if (Kind != CK_NoOp && ConvertRHS)
       RHS = ImpCastExprToType(RHS.get(), AtomicTy->getValueType(), Kind);
     Kind = CK_NonAtomicToAtomic;
-    return AssignConvertType::Compatible;
+    return Result;
   }
 
   // If the left-hand side is a reference type, then we are in a
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 7c982bcd63d7..10a390286ec7 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -264,10 +264,11 @@ Sema::DeclGroupPtrTy
 Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
                       ModuleDeclKind MDK, ModuleIdPath Path,
                       ModuleIdPath Partition, ModuleImportState &ImportState,
-                      bool IntroducerIsFirstPPToken) {
+                      bool SeenNoTrivialPPDirective) {
   assert(getLangOpts().CPlusPlusModules &&
          "should only have module decl in standard C++ modules");
 
+  bool IsFirstDecl = ImportState == ModuleImportState::FirstDecl;
   bool SeenGMF = ImportState == ModuleImportState::GlobalFragment;
   // If any of the steps here fail, we count that as invalidating C++20
   // module state;
@@ -335,7 +336,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
 
   // In C++20, A module directive may only appear as the first preprocessing
   // tokens in a file (excluding the global module fragment.).
-  if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) {
+  if (getLangOpts().CPlusPlusModules &&
+      (!IsFirstDecl || SeenNoTrivialPPDirective) && !SeenGMF) {
     Diag(ModuleLoc, diag::err_module_decl_not_at_start);
     SourceLocation BeginLoc = PP.getMainFileFirstPPTokenLoc();
     Diag(BeginLoc, diag::note_global_module_introducer_missing)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index f3baf0c3ef3b..1b54628c5e56 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -245,7 +245,6 @@ void StandardConversionSequence::setAsIdentityConversion() {
   IsLvalueReference = true;
   BindsToFunctionLvalue = false;
   BindsToRvalue = false;
-  IsImplicitObjectArgumentQualificationConversion = false;
   BindsImplicitObjectArgumentWithoutRefQualifier = false;
   ObjCLifetimeConversionBinding = false;
   FromBracedInitList = false;
@@ -5318,7 +5317,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.Standard.DirectBinding = BindsDirectly;
     ICS.Standard.IsLvalueReference = !isRValRef;
     ICS.Standard.BindsToFunctionLvalue = T2->isFunctionType();
-    ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false;
     ICS.Standard.BindsToRvalue = InitCategory.isRValue();
     ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.Standard.ObjCLifetimeConversionBinding =
@@ -5498,7 +5496,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.Standard.IsLvalueReference = !isRValRef;
     ICS.Standard.BindsToFunctionLvalue = false;
     ICS.Standard.BindsToRvalue = true;
-    ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false;
     ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.Standard.ObjCLifetimeConversionBinding = false;
   } else if (ICS.isUserDefined()) {
@@ -5521,8 +5518,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.UserDefined.After.IsLvalueReference = !isRValRef;
     ICS.UserDefined.After.BindsToFunctionLvalue = false;
     ICS.UserDefined.After.BindsToRvalue = !LValRefType;
-    ICS.UserDefined.After.IsImplicitObjectArgumentQualificationConversion =
-        false;
     ICS.UserDefined.After.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.UserDefined.After.ObjCLifetimeConversionBinding = false;
     ICS.UserDefined.After.FromBracedInitList = false;
@@ -5807,7 +5802,6 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType,
       StandardConversionSequence &SCS = Result.isStandard() ? Result.Standard :
                                             Result.UserDefined.After;
       SCS.ReferenceBinding = true;
-      SCS.IsImplicitObjectArgumentQualificationConversion = false;
       SCS.IsLvalueReference = ToType->isLValueReferenceType();
       SCS.BindsToRvalue = true;
       SCS.BindsToFunctionLvalue = false;
@@ -6005,12 +5999,8 @@ static ImplicitConversionSequence TryObjectArgumentInitialization(
   // affects the conversion rank.
   QualType ClassTypeCanon = S.Context.getCanonicalType(ClassType);
   ImplicitConversionKind SecondKind;
-  bool IsQualificationConversion = false;
-  if (ImplicitParamType.getCanonicalType() == FromTypeCanon) {
+  if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) {
     SecondKind = ICK_Identity;
-  } else if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) {
-    SecondKind = ICK_Identity;
-    IsQualificationConversion = true;
   } else if (S.IsDerivedFrom(Loc, FromType, ClassType)) {
     SecondKind = ICK_Derived_To_Base;
   } else if (!Method->isExplicitObjectMemberFunction()) {
@@ -6051,8 +6041,6 @@ static ImplicitConversionSequence TryObjectArgumentInitialization(
   ICS.Standard.setFromType(FromType);
   ICS.Standard.setAllToTypes(ImplicitParamType);
   ICS.Standard.ReferenceBinding = true;
-  ICS.Standard.IsImplicitObjectArgumentQualificationConversion =
-      IsQualificationConversion;
   ICS.Standard.DirectBinding = true;
   ICS.Standard.IsLvalueReference = Method->getRefQualifier() != RQ_RValue;
   ICS.Standard.BindsToFunctionLvalue = false;
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index f85826aecadf..f46be75bda20 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -2287,7 +2287,11 @@ StmtResult Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
           // we can diagnose if we don't see any variable declarations. This
           // covers a case like declaring a typedef, function, or structure
           // type rather than a variable.
-          NonVarSeen = DI;
+          //
+          // Note, _Static_assert is acceptable because it does not declare an
+          // identifier at all, so "for object having" does not apply.
+          if (!isa<StaticAssertDecl>(DI))
+            NonVarSeen = DI;
         }
       }
       // Diagnose if we saw a non-variable declaration but no variable
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index b76619fc5026..f067aedf1cc9 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4749,8 +4749,6 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
   EnterExpressionEvaluationContext EECtx{
       *this, ExpressionEvaluationContext::Unevaluated, CSD};
 
-  ContextRAII CurContext(*this, CSD->getDeclContext(),
-                         /*NewThisContext=*/false);
   if (!AreArgsDependent &&
       CheckConstraintSatisfaction(
           NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()),
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index d09a72b71b80..ce78ecc2d4a2 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5525,6 +5525,15 @@ static TemplateDeductionResult CheckDeductionConsistency(
   // FIXME: A substitution can be incomplete on a non-structural part of the
   // type. Use the canonical type for now, until the TemplateInstantiator can
   // deal with that.
+
+  // Workaround: Implicit deduction guides use InjectedClassNameTypes, whereas
+  // the explicit guides don't. The substitution doesn't transform these types,
+  // so let it transform their specializations instead.
+  bool IsDeductionGuide = isa<CXXDeductionGuideDecl>(FTD->getTemplatedDecl());
+  if (IsDeductionGuide) {
+    if (auto *Injected = P->getAs<InjectedClassNameType>())
+      P = Injected->getInjectedSpecializationType();
+  }
   QualType InstP = S.SubstType(P.getCanonicalType(), MLTAL, FTD->getLocation(),
                                FTD->getDeclName(), &IsIncompleteSubstitution);
   if (InstP.isNull() && !IsIncompleteSubstitution)
@@ -5539,9 +5548,15 @@ static TemplateDeductionResult CheckDeductionConsistency(
   if (auto *PA = dyn_cast<PackExpansionType>(A);
       PA && !isa<PackExpansionType>(InstP))
     A = PA->getPattern();
-  if (!S.Context.hasSameType(
-          S.Context.getUnqualifiedArrayType(InstP.getNonReferenceType()),
-          S.Context.getUnqualifiedArrayType(A.getNonReferenceType())))
+  auto T1 = S.Context.getUnqualifiedArrayType(InstP.getNonReferenceType());
+  auto T2 = S.Context.getUnqualifiedArrayType(A.getNonReferenceType());
+  if (IsDeductionGuide) {
+    if (auto *Injected = T1->getAs<InjectedClassNameType>())
+      T1 = Injected->getInjectedSpecializationType();
+    if (auto *Injected = T2->getAs<InjectedClassNameType>())
+      T2 = Injected->getInjectedSpecializationType();
+  }
+  if (!S.Context.hasSameType(T1, T2))
     return TemplateDeductionResult::NonDeducedMismatch;
   return TemplateDeductionResult::Success;
 }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e2c3cdcd536b..d2b87f2702a9 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -5685,7 +5685,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
   };
   Function->setDeclarationNameLoc(NameLocPointsToPattern());
 
-  EnterExpressionEvaluationContext EvalContext(
+  EnterExpressionEvaluationContextForFunction EvalContext(
       *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
 
   Qualifiers ThisTypeQuals;
diff --git a/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
index 1e3adb4f266c..789c7772d123 100644
--- a/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
@@ -45,7 +45,6 @@ void AssumeModelingChecker::checkPostStmt(const AttributedStmt *A,
       continue;
 
     const auto *Assumption = AssumptionVal.getAsInteger();
-    assert(Assumption && "We should know the exact outcome of an assume expr");
     if (Assumption && Assumption->isZero()) {
       C.addSink();
     }
diff --git a/clang/test/ASTMerge/enum/Inputs/enum3.c b/clang/test/ASTMerge/enum/Inputs/enum3.c
new file mode 100644
index 000000000000..32ad5366434a
--- /dev/null
+++ b/clang/test/ASTMerge/enum/Inputs/enum3.c
@@ -0,0 +1,14 @@
+// [C23] missing underlying types
+enum E1 : int {
+  E1Enumerator1
+};
+
+enum E2 {
+  E2Enumerator1
+};
+
+// [C23] Incompatible underlying types
+enum E3 : long {
+  E3Enumerator1
+};
+
diff --git a/clang/test/ASTMerge/enum/Inputs/enum4.c b/clang/test/ASTMerge/enum/Inputs/enum4.c
new file mode 100644
index 000000000000..15f5c603c7ab
--- /dev/null
+++ b/clang/test/ASTMerge/enum/Inputs/enum4.c
@@ -0,0 +1,14 @@
+// [C23] missing underlying types
+enum E1 {
+  E1Enumerator1
+};
+
+enum E2 : int {
+  E2Enumerator1
+};
+
+// [C23] Incompatible underlying types
+enum E3 : short {
+  E3Enumerator1
+};
+
diff --git a/clang/test/ASTMerge/enum/test2.c b/clang/test/ASTMerge/enum/test2.c
new file mode 100644
index 000000000000..bdd8b13ee4c2
--- /dev/null
+++ b/clang/test/ASTMerge/enum/test2.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c23 -emit-pch -o %t.1.ast %S/Inputs/enum3.c
+// RUN: %clang_cc1 -std=c23 -emit-pch -o %t.2.ast %S/Inputs/enum4.c
+// RUN: %clang_cc1 -std=c23 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
+
+// CHECK: enum3.c:2:6: warning: type 'enum E1' has incompatible definitions in different translation units
+// CHECK: enum4.c:2:6: note: enumeration 'E1' missing fixed underlying type here
+// CHECK: enum3.c:2:6: note: enumeration 'E1' has fixed underlying type here
+// CHECK: enum3.c:6:6: warning: type 'enum E2' has incompatible definitions in different translation units
+// CHECK: enum4.c:6:6: note: enumeration 'E2' has fixed underlying type here
+// CHECK: enum3.c:6:6: note: enumeration 'E2' missing fixed underlying type here
+// CHECK: enum3.c:11:6: warning: type 'enum E3' has incompatible definitions in different translation units
+// CHECK: enum3.c:11:6: note: enumeration 'E3' declared with incompatible fixed underlying types ('long' vs. 'short')
+// CHECK: 3 warnings generated
+
diff --git a/clang/test/Analysis/cxx23-assume-attribute.cpp b/clang/test/Analysis/cxx23-assume-attribute.cpp
index 86e7662cd2af..4cc16446572d 100644
--- a/clang/test/Analysis/cxx23-assume-attribute.cpp
+++ b/clang/test/Analysis/cxx23-assume-attribute.cpp
@@ -69,3 +69,9 @@ int assume_and_fallthrough_at_the_same_attrstmt(int a, int b) {
 
   return 0;
 }
+
+void assume_opaque_gh151854_no_crash() {
+  extern bool opaque();
+  [[assume(opaque())]]; // no-crash
+  // expected-warning@-1 {{assumption is ignored because it contains (potential) side-effects}}
+}
diff --git a/clang/test/C/C23/n3037.c b/clang/test/C/C23/n3037.c
index ce6f4c4ea7ac..d5699e2df142 100644
--- a/clang/test/C/C23/n3037.c
+++ b/clang/test/C/C23/n3037.c
@@ -401,3 +401,77 @@ _Static_assert(0 == _Generic(inner_anon_tagged.untagged, struct { int i; } : 1,
 // unions and structures are both RecordDecl objects, whereas EnumDecl is not).
 enum { E_Untagged1 } nontag_enum; // both-note {{previous definition is here}}
 _Static_assert(0 == _Generic(nontag_enum, enum { E_Untagged1 } : 1, default : 0)); // both-error {{redefinition of enumerator 'E_Untagged1'}}
+
+// Test that enumerations are compatible with their underlying type, but still
+// diagnose when "same type" is required rather than merely "compatible type".
+enum E1 : int { e1 }; // Fixed underlying type
+enum E2 { e2 };       // Unfixed underlying type, defaults to int or unsigned int
+
+struct GH149965_1 { int h; };
+// This typeof trick is used to get the underlying type of the enumeration in a
+// platform-agnostic way.
+struct GH149965_2 { __typeof__(+(enum E2){}) h; };
+void gh149965(void) {
+  extern struct GH149965_1 x1; // c17-note {{previous declaration is here}}
+  extern struct GH149965_2 x2; // c17-note {{previous declaration is here}}
+
+  // In C23, both the structure and the variable declarations are fine: the
+  // structures are declared in different scopes, so only a compatible type is
+  // required, not the same type. (C17 rejects the variable redeclarations.)
+  struct GH149965_1 { enum E1 h; };
+  struct GH149965_2 { enum E2 h; };
+
+  extern struct GH149965_1 x1; // c17-error {{redeclaration of 'x1'}}
+  extern struct GH149965_2 x2; // c17-error {{redeclaration of 'x2'}}
+
+  // However, in the same scope, the same type is required, not just compatible
+  // types.
+  // FIXME: this should be an error in both C17 and C23 mode.
+  struct GH149965_3 { int h; };     // c17-note {{previous definition is here}}
+  struct GH149965_3 { enum E1 h; }; // c17-error {{redefinition of 'GH149965_3'}}
+
+  // For Clang, the composite type after declaration merging is the enumeration
+  // type rather than an integer type.
+  enum E1 *eptr;
+  [[maybe_unused]] __typeof__(x1.h) *ptr = eptr;
+  enum E2 *eptr2;
+  [[maybe_unused]] __typeof__(x2.h) *ptr2 = eptr2;
+}
+
+// Test that enumerations with mixed underlying types are properly handled.
+enum GH150594_E1 : int { GH150594_Val1 };
+enum GH150594_E2 : int { GH150594_Val2 };
+enum GH150594_E3 { GH150594_Val3 };
+enum GH150594_E4 : int { GH150594_Val4 };
+void GH150594(void) {
+  extern enum GH150594_E1 Fn1(void); // both-note {{previous declaration is here}}
+  extern enum GH150594_E2 Fn2(void); // c17-note {{previous declaration is here}}
+  extern enum GH150594_E3 Fn3(void); // both-note {{previous declaration is here}}
+  extern enum GH150594_E4 Fn4(void); // both-note {{previous declaration is here}}
+  enum GH150594_E1 { GH150594_Val1 };
+  enum GH150594_E2 : int { GH150594_Val2 };
+  enum GH150594_E3 : int { GH150594_Val3 };
+  enum GH150594_E4 : short { GH150594_Val4 };
+  extern enum GH150594_E1 Fn1(void); // both-error {{conflicting types for 'Fn1'}}
+  extern enum GH150594_E2 Fn2(void); // c17-error {{conflicting types for 'Fn2'}}
+  extern enum GH150594_E3 Fn3(void); // both-error {{conflicting types for 'Fn3'}}
+  extern enum GH150594_E4 Fn4(void); // both-error {{conflicting types for 'Fn4'}}
+
+  // Show that two declarations in the same scope give expected diagnostics.
+  enum E1 { e1 };       // both-note {{previous declaration is here}}
+  enum E1 : int { e1 }; // both-error {{enumeration previously declared with nonfixed underlying type}}
+
+  enum E2 : int { e2 }; // both-note {{previous declaration is here}}
+  enum E2 { e2 };       // both-error {{enumeration previously declared with fixed underlying type}}
+
+  enum E3 : int { e3 };   // both-note {{previous declaration is here}}
+  enum E3 : short { e3 }; // both-error {{enumeration redeclared with different underlying type 'short' (was 'int')}}
+
+  typedef short foo;
+  enum E4 : foo { e4 };   // c17-note 2 {{previous definition is here}}
+  enum E4 : short { e4 }; // c17-error {{redefinition of 'E4'}} \
+                             c17-error {{redefinition of enumerator 'e4'}}
+
+  enum E5 : foo { e5 }; // both-note {{previous declaration is here}}
+  enum E5 : int { e5 }; // both-error {{enumeration redeclared with different underlying type 'int' (was 'foo' (aka 'short'))}}
+}
diff --git a/clang/test/C/C2y/n3353.c b/clang/test/C/C2y/n3353.c
index cd61cbf03906..a2e08cf6344d 100644
--- a/clang/test/C/C2y/n3353.c
+++ b/clang/test/C/C2y/n3353.c
@@ -44,7 +44,12 @@ static const void *ptr = 0o0;  /* ext-warning {{octal integer literals are a C2y
 #endif
 
 // 0 by itself is not deprecated, of course.
-int k = 0;
+int k1                = 0;
+unsigned int k2       = 0u;
+long k3               = 0l;
+unsigned long k4      = 0ul;
+long long k5          = 0ll;
+unsigned long long k6 = 0ull;
 
 // Test a preprocessor use of 0 by itself, which is also not deprecated.
 #if 0
@@ -65,7 +70,6 @@ static_assert(__extension__ _Generic(typeof(l), const int : 1, default : 0)); //
 
 // Note that 0o by itself is an invalid literal.
 int m = 0o; /* expected-error {{invalid suffix 'o' on integer constant}}
-               c2y-warning {{octal literals without a '0o' prefix are deprecated}}
              */
 
 // Ensure negation works as expected.
@@ -83,13 +87,11 @@ int n = 0o18; /* expected-error {{invalid digit '8' in octal constant}}
                  cpp-warning {{octal integer literals are a Clang extension}}
                */
 int o1 = 0o8; /* expected-error {{invalid suffix 'o8' on integer constant}}
-                 c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                */
 // FIXME: however, it matches the behavior for hex literals in terms of the
 // error reported. Unfortunately, we then go on to think 0 is an octal literal
 // without a prefix, which is again a bit confusing.
 int o2 = 0xG; /* expected-error {{invalid suffix 'xG' on integer constant}}
-                 c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                */
 
 // Show that floating-point suffixes on octal literals are rejected.
@@ -130,7 +132,6 @@ constexpr int p = 0o0'1'2'3'4'5'6'7; /* compat-warning {{octal integer literals
                                       */
 static_assert(p == 01234567); // c2y-warning {{octal literals without a '0o' prefix are deprecated}}
 int q = 0o'0'1; /* expected-error {{invalid suffix 'o'0'1' on integer constant}}
-                   c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                  */
 
 #define M 0o123
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index 58080014abc5..286c9d40d2da 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -226,17 +226,6 @@ add_custom_target(clang-test)
 add_dependencies(clang-test check-clang)
 set_target_properties(clang-test PROPERTIES FOLDER "Clang/Tests")
 
-# Allow running Clang Python binding tests separately from CI.
-add_lit_testsuite(check-clang-python "Running the Clang Python tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  #LIT ${LLVM_LIT}
-  PARAMS ${CLANG_TEST_PARAMS}
-  DEPENDS ${CLANG_TEST_DEPS}
-  ARGS ${CLANG_TEST_EXTRA_ARGS} --filter=bindings.sh
-  # Avoid running tests twice.
-  EXCLUDE_FROM_CHECK_ALL
-  )
-
 # FIXME: This logic can be removed once all buildbots have moved
 # debuginfo-test from clang/test to llvm/projects or monorepo.
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)
diff --git a/clang/test/CXX/module/cpp.pre/module_decl.cpp b/clang/test/CXX/module/cpp.pre/module_decl.cpp
index 6238347c167a..5c29aeff1b63 100644
--- a/clang/test/CXX/module/cpp.pre/module_decl.cpp
+++ b/clang/test/CXX/module/cpp.pre/module_decl.cpp
@@ -1,8 +1,147 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -verify -o %t/M.pcm
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/line.cpp -verify -o %t/line.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/gnu_line_marker.cpp -verify -o %t/gnu_line_marker.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/include.cpp -verify -o %t/include.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/ident.cpp -verify -o %t/ident.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_comment.cpp -verify -o %t/pragma_comment.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_mark.cpp -verify -o %t/pragma_mark.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_detect_mismatch.cpp -verify -o %t/pragma_detect_mismatch.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_debug.cpp -verify -o %t/pragma_clang_debug.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_message.cpp -verify -o %t/pragma_message.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_warn.cpp -verify -o %t/pragma_gcc_warn.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_error.cpp -verify -o %t/pragma_gcc_error.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_push_pop.cpp -verify -o %t/pragma_diag_push_pop.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_ignore.cpp -verify -o %t/pragma_diag_ignore.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_opencl_ext.cpp -verify -o %t/pragma_opencl_ext.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_push_pop.cpp -verify -o %t/pragma_push_pop.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_exec_charset.cpp -verify -o %t/pragma_exec_charset.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_assume_nonnull.cpp -verify -o %t/pragma_clang_assume_nonnull.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/marco_expand.cpp -DMACRO="" -verify -o %t/marco_expand.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/define.cpp -verify -o %t/define.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/undef.cpp -verify -o %t/undef.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/defined.cpp -verify -o %t/defined.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_embed.cpp -verify -o %t/has_embed.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_include.cpp -verify -o %t/has_include.pcm
 
+//--- header.h
+#ifndef HEADER_H
+#define HEADER_H
+
+#endif // HEADER_H
+
+//--- line.cpp
+// expected-no-diagnostics
+#line 3
+export module M;
+
+//--- gnu_line_marker.cpp
+// expected-no-diagnostics
+# 1 __FILE__ 1 3
+export module M;
+
+//--- include.cpp
+#include "header.h" // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- ident.cpp
+// expected-no-diagnostics
+#ident "$Header:$"
+export module M;
+
+//--- pragma_comment.cpp
+// expected-no-diagnostics
+#pragma comment(lib, "msvcrt.lib")
+export module M;
+
+//--- pragma_mark.cpp
+// expected-no-diagnostics
+#pragma mark LLVM's world
+export module M;
+
+//--- pragma_detect_mismatch.cpp
+// expected-no-diagnostics
+#pragma detect_mismatch("test", "1")
+export module M;
+
+//--- pragma_clang_debug.cpp
+// expected-no-diagnostics
+#pragma clang __debug dump Test
+export module M;
+
+//--- pragma_message.cpp
+#pragma message "test" // expected-warning {{test}}
+export module M;
+
+//--- pragma_gcc_warn.cpp
+#pragma GCC warning "Foo" // expected-warning {{Foo}}
+export module M;
+
+//--- pragma_gcc_error.cpp
+#pragma GCC error "Foo" // expected-error {{Foo}}
+export module M;
+
+//--- pragma_diag_push_pop.cpp
+// expected-no-diagnostics
+#pragma gcc diagnostic push
+#pragma gcc diagnostic pop
+export module M;
+
+//--- pragma_diag_ignore.cpp
+// expected-no-diagnostics
+#pragma GCC diagnostic ignored "-Wframe-larger-than"
+export module M;
+
+//--- pragma_opencl_ext.cpp
+// expected-no-diagnostics
+#pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable
+export module M;
+
+//--- pragma_push_pop.cpp
+// expected-no-diagnostics
+#pragma warning(push)
+#pragma warning(pop)
+export module M;
+
+//--- pragma_exec_charset.cpp
+// expected-no-diagnostics
+#pragma execution_character_set(push, "UTF-8")
+#pragma execution_character_set(pop)
+export module M;
+
+//--- pragma_clang_assume_nonnull.cpp
+// expected-no-diagnostics
+#pragma clang assume_nonnull begin
+#pragma clang assume_nonnull end
+export module M;
+
+//--- marco_expand.cpp
+MACRO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- define.cpp
 // This is a comment
 #define I32 int // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
 export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
 export I32 i32;
+
+//--- undef.cpp
+#undef FOO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- defined.cpp
+#if defined(FOO) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- has_embed.cpp
+#if __has_embed(__FILE__ ext::token(0xB055)) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- has_include.cpp
+#if __has_include(<stdio.h>) || __has_include_next(<stdlib.h>) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} \
+                                                               // expected-warning {{#include_next in primary source file; will search from start of include path}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
diff --git a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
index abb42447d3e0..05830de9891f 100644
--- a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
+++ b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
@@ -239,5 +239,21 @@ void f2() {
 
 }
 
+namespace GH153884 {
+  bool f1() {
+    auto f = [](auto) { return true; };
+    if constexpr (0)
+      return f(1);
+    return false;
+  }
+  bool f2() {
+    auto f = [](auto x) { if (x) return 1.5; else return "wat"; };
+    // expected-error@-1 {{'auto' in return type deduced as 'const char *' here but deduced as 'double' in earlier return statement}}
+    if constexpr (0)
+      return f(1);
+    // expected-note@-1 {{in instantiation of function template specialization 'GH153884::f2()}}
+    return false;
+  }
+}
 
 #endif
diff --git a/clang/test/CodeGen/LoongArch/targetattr-lasx.c b/clang/test/CodeGen/LoongArch/targetattr-lasx.c
new file mode 100644
index 000000000000..56fd6573ed34
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/targetattr-lasx.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple loongarch64 -target-feature -lsx -emit-llvm %s -o - | FileCheck %s
+
+__attribute__((target("lasx")))
+// CHECK: #[[ATTR0:[0-9]+]] {
+void testlasx() {}
+
+// CHECK: attributes #[[ATTR0]] = { {{.*}}"target-features"="+64bit,+lasx,+lsx"{{.*}} }
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
index 26e0d124c828..d143188ee0f3 100644
--- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -187,12 +187,12 @@ __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -208,12 +208,12 @@ __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -229,12 +229,12 @@ __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -250,12 +250,12 @@ __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -271,12 +271,12 @@ __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -292,10 +292,10 @@ __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
index 936be27da61d..b4b12c953194 100644
--- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -264,11 +264,11 @@ __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -278,11 +278,11 @@ __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -292,11 +292,11 @@ __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -306,11 +306,11 @@ __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -320,11 +320,11 @@ __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -334,11 +334,11 @@ __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -348,11 +348,11 @@ __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -362,11 +362,11 @@ __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -376,11 +376,11 @@ __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -390,11 +390,11 @@ __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -404,11 +404,11 @@ __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -418,9 +418,9 @@ __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/ptrauth-qualifier-blocks.c b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
index 62da59cf327f..f460da205cac 100644
--- a/clang/test/CodeGen/ptrauth-qualifier-blocks.c
+++ b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
@@ -82,9 +82,15 @@ void test_block_address_byref_capture() {
   // CHECK: store i32 33554432,
   // CHECK: store i32 48,
   // CHECK: [[COPY_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 4
-  // CHECK: store ptr @__Block_byref_object_copy_, ptr [[COPY_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[COPY_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_copy_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[COPY_HELPER_FIELD]], align
   // CHECK: [[DISPOSE_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 5
-  // CHECK: store ptr @__Block_byref_object_dispose_, ptr [[DISPOSE_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[DISPOSE_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_dispose_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[DISPOSE_HELPER_FIELD]], align
   //   flags - copy/dispose required
   // CHECK: store i32 1107296256, ptr
   __block struct A * __ptrauth(1, 1, 60) ptr = createA();
diff --git a/clang/test/CodeGenCXX/debug-info-class.cpp b/clang/test/CodeGenCXX/debug-info-class.cpp
index 0bc4fdaa565c..aa24a63c58cb 100644
--- a/clang/test/CodeGenCXX/debug-info-class.cpp
+++ b/clang/test/CodeGenCXX/debug-info-class.cpp
@@ -99,12 +99,12 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKCOFF %s
+// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKCOFF %s
+// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKELF %s
 
 // CHECK98: invoke {{.+}} @_ZN1BD1Ev(ptr {{[^,]*}} %b)
 // CHECK98-NEXT: unwind label %{{.+}}, !dbg ![[EXCEPTLOC:.*]]
@@ -122,6 +122,14 @@ int main(int argc, char **argv) {
 // CHECK-SAME:             ){{$}}
 
 // CHECK:      ![[INT:[0-9]+]] = !DIBasicType(name: "int"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_class_type, name: "B"
+// CHECKCOFF-NOT:              DIFlagFwdDecl
+// CHECKCOFF-SAME:             ){{$}}
+// CHECKCOFF: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
+// CHECKCOFF-SAME:           DIFlagArtificial
 
 // CHECK: [[C:![0-9]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "C",
 // CHECK-NOT:                              DIFlagFwdDecl
@@ -137,19 +145,19 @@ int main(int argc, char **argv) {
 // CHECK-SAME:                     DIFlagStaticMember
 // CHECK: [[C_DTOR]] = !DISubprogram(name: "~C"
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
-// CHECK-SAME:             identifier: "_ZTS1K"
-// CHECK-SAME:             ){{$}}
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
+// CHECKELF-SAME:             identifier: "_ZTS1K"
+// CHECKELF-SAME:             ){{$}}
 
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "B"
-// CHECK-NOT:              DIFlagFwdDecl
-// CHECK-SAME:             ){{$}}
-// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
-// CHECK-SAME:           DIFlagArtificial
+// CHECKELF: !DICompositeType(tag: DW_TAG_class_type, name: "B"
+// CHECKELF-NOT:              DIFlagFwdDecl
+// CHECKELF-SAME:             ){{$}}
+// CHECKELF: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
+// CHECKELF-SAME:           DIFlagArtificial
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
-// CHECK: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
+// CHECKELF: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
+// CHECKELF: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
 
 // CHECK: [[D:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "D"
 // CHECK-SAME:             size:
@@ -162,6 +170,10 @@ int main(int argc, char **argv) {
 // CHECK-NOT:              identifier:
 // CHECK-SAME:             ){{$}}
 
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
+// CHECKCOFF-SAME:             identifier: "_ZTS1K"
+// CHECKCOFF-SAME:             ){{$}}
+
 // CHECK: [[L:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "L"
 // CHECK-SAME:             ){{$}}
 // CHECK: [[L_FUNC_DECL:![0-9]*]] = !DISubprogram(name: "func",{{.*}} scope: [[L]]
diff --git a/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp b/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
index 9a8ce1997a7f..19c2a9bd0497 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fvisibility=hidden -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fapple-kext -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fno-assume-unique-vtables -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
+// RUN: %clang_cc1 -I%S %s -triple arm64e-apple-darwin10 -O1 -fptrauth-calls -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 
 struct A { virtual ~A(); };
 struct B final : A { };
diff --git a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
index 1b103719fbe4..e33525c1ec0f 100644
--- a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
@@ -1,31 +1,31 @@
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
@@ -78,6 +78,27 @@ struct authenticated(default_key, default_address_discrimination, custom_discrim
   virtual void g();
 };
 
+// CHECK: @_ZTVN5test19ConstEvalE = external unnamed_addr constant { [3 x ptr] }, align 8
+// CHECK: @_ZN5test12ceE = global %{{.*}} { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test19ConstEvalE, i32 0, i32 0, i32 2), i32 2, i64 0, ptr @_ZN5test12ceE) }, align 8
+// CHECK: @_ZTVN5test116ConstEvalDerivedE = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTIN5test116ConstEvalDerivedE, ptr ptrauth (ptr @_ZN5test19ConstEval1fEv, i32 0, i64 26259, ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5test116ConstEvalDerivedE, i32 0, i32 0, i32 2))] },{{.*}}align 8
+// CHECK: @_ZN5test13cedE = global { ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test116ConstEvalDerivedE, i32 0, i32 0, i32 2), i32 2, i64 0, ptr @_ZN5test13cedE) }, align 8
+
+struct authenticated(default_key, address_discrimination, no_extra_discrimination) ConstEval {
+  consteval ConstEval() {}
+  virtual void f();
+};
+
+// clang used to bail out with error message "could not emit constant value abstractly".
+ConstEval ce;
+
+struct ConstEvalDerived : public ConstEval {
+public:
+  consteval ConstEvalDerived() {}
+};
+
+// clang used to emit an undef initializer.
+ConstEvalDerived ced;
+
 template <typename T>
 struct SubClass : T {
   virtual void g();
diff --git a/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp b/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
index 249586f5991f..b24ece159832 100644
--- a/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
+++ b/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
@@ -1,5 +1,3 @@
-// REQUIRES: target={{x86_64.*-linux.*}}
-
 // Simple inheritance case:
 // For CBase and CDerived we check:
 // - Generation of their vtables (including attributes).
@@ -30,13 +28,20 @@ int main() {
   return 0;
 }
 
-// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -S -g %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | FileCheck %s --check-prefix=COFF
 
 // CHECK: $_ZTVN3NSP5CBaseE = comdat any
 // CHECK: $_ZTV8CDerived = comdat any
 
 // CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]]
 // CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]]
+// COFF: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8
+// COFF-NOT: !dbg
+// COFF-SAME: {{$}}
+// COFF: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8
+// COFF-NOT: !dbg
+// COFF-SAME: {{$}}
 
 // CHECK: [[BASE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[BASE_VTABLE:![0-9]*]], expr: !DIExpression())
 // CHECK-NEXT: [[BASE_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN3NSP5CBaseE"
diff --git a/clang/test/CodeGenCoroutines/coro-gro.cpp b/clang/test/CodeGenCoroutines/coro-gro.cpp
index b62134317cef..037fd03349e7 100644
--- a/clang/test/CodeGenCoroutines/coro-gro.cpp
+++ b/clang/test/CodeGenCoroutines/coro-gro.cpp
@@ -106,4 +106,31 @@ invoker g() {
   // CHECK: call void @_ZN7invoker15invoker_promise17get_return_objectEv({{.*}} %[[AggRes]]
   co_return;
 }
-// CHECK: ![[OutFrameMetadata]] = !{}
\ No newline at end of file
+
+namespace gh148953 {
+
+struct Task {
+  struct promise_type {
+    Task get_return_object();
+    std::suspend_always initial_suspend() { return {}; }
+    std::suspend_always final_suspend() noexcept { return {}; }
+    void return_void() {}
+    void unhandled_exception() {}
+  };
+  Task() {}
+  // Different from `invoker`, this Task is copy constructible.
+  Task(const Task&) {};
+};
+
+// NRVO on const qualified return type should work: get_return_object must be emitted directly into the sret slot.
+// CHECK: define{{.*}} void @_ZN8gh1489537exampleEv({{.*}} sret(%"struct.gh148953::Task") align 1 %[[NrvoRes:.+]])
+const Task example() {
+  // CHECK: %[[ResultPtr:.+]] = alloca ptr
+  // CHECK: store ptr %[[NrvoRes]], ptr %[[ResultPtr]]
+  // CHECK: coro.init:
+  // CHECK: call void @_ZN8gh1489534Task12promise_type17get_return_objectEv({{.*}} %[[NrvoRes]], {{.*}})
+  co_return;
+}
+
+} // namespace gh148953
+// CHECK: ![[OutFrameMetadata]] = !{}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
new file mode 100644
index 000000000000..b51670fd6459
--- /dev/null
+++ b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -fptrauth-block-descriptor-pointers -triple arm64e-apple-ios  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -triple arm64e-apple-ios -DNO_BLOCK_DESC_AUTH -emit-llvm -o - %s | FileCheck %s --check-prefix=NODESCRIPTORAUTH
+
+#ifndef NO_BLOCK_DESC_AUTH
+_Static_assert(__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should set ptrauth_signed_block_descriptors");
+#else
+_Static_assert(!__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should not be enabled by default");
+#endif
+
+void a() {
+  // Test out a global block.
+  void (^blk)(void) = ^{};
+}
+
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr [[BLOCK_DESCRIPTOR_NAME]], i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
+
+// NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// NODESCRIPTORAUTH: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
+
+
+void b(int p) {
+  // CHECK-LABEL: define void @b
+
+  // Test out a stack block.
+  void (^blk)(void) = ^{(void)p;};
+
+  // CHECK: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+  // CHECK: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+  // CHECK: [[BLOCK_DESCRIPTOR_REF_INT:%.*]] = ptrtoint ptr [[BLOCK_DESCRIPTOR_REF]] to i64
+  // CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[BLOCK_DESCRIPTOR_REF_INT]], i64 49339)
+  // CHECK: [[SIGNED_REF:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @"__block_descriptor_36_e5_v8\01?0l" to i64), i32 2, i64 [[BLENDED]])
+  // CHECK: [[SIGNED_REF_PTR:%.*]] = inttoptr i64 [[SIGNED_REF]] to ptr
+  // CHECK: store ptr [[SIGNED_REF_PTR]], ptr [[BLOCK_DESCRIPTOR_REF]]
+
+  // NODESCRIPTORAUTH: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+  // NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+  // NODESCRIPTORAUTH: store ptr @"__block_descriptor_36_e5_v8\01?0l", ptr [[BLOCK_DESCRIPTOR_REF]]
+}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-isa.m b/clang/test/CodeGenObjC/ptrauth-block-isa.m
index c1e98c6fd9d3..248e57769ba1 100644
--- a/clang/test/CodeGenObjC/ptrauth-block-isa.m
+++ b/clang/test/CodeGenObjC/ptrauth-block-isa.m
@@ -1,7 +1,8 @@
-// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s  -o - | FileCheck %s
+// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s -o - | FileCheck %s
 
 void (^globalblock)(void) = ^{};
-// CHECK: [[GLOBAL_BLOCK:@.*]] = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr [[GLOBAL_BLOCK]]), i32 1342177280, i32 0, ptr @globalblock_block_invoke, ptr @"__block_descriptor_32_e5_v8\01?0l" }, align 8 #0
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }, comdat, align 8
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr @__block_literal_global), i32 1342177280, i32 0, ptr ptrauth (ptr @globalblock_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
 
 @interface A
 - (int) count;
diff --git a/clang/test/DebugInfo/KeyInstructions/asm.c b/clang/test/DebugInfo/KeyInstructions/asm.c
new file mode 100644
index 000000000000..2b3301660f7b
--- /dev/null
+++ b/clang/test/DebugInfo/KeyInstructions/asm.c
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O0 -emit-llvm -x c %s -o - -gkey-instructions -debug-info-kind=line-tables-only -gno-column-info | FileCheck %s
+// Partially copied from clang/test/CodeGen/AArch64/ls64-inline-asm.c
+
+// Check the inline asm call and result store are Key and distinct atoms.
+
+struct foo { unsigned long long x[8]; };
+// CHECK-LABEL: define dso_local void @load(
+// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG9:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG9]]
+// CHECK-NEXT:    [[TMP2:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1:[0-9]+]], !dbg [[DBG10:![0-9]+]], !srcloc [[META11:![0-9]+]]
+// CHECK-NEXT:    store i512 [[TMP2]], ptr [[TMP0]], align 8, !dbg [[DBG12:![0-9]+]]
+// CHECK-NEXT:    ret void, !dbg [[DBG13:![0-9]+]]
+//
+void load(struct foo *output, void *addr) {
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+
+// CHECK-LABEL: define dso_local void @load2(
+// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0]] !dbg [[DBG14:![0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG15:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG15]]
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1]], !dbg [[DBG16:![0-9]+]], !srcloc [[META17:![0-9]+]]
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP0]], align 4, !dbg [[DBG18:![0-9]+]]
+// CHECK-NEXT:    ret void, !dbg [[DBG19:![0-9]+]]
+//
+void load2(int *output, void *addr) {
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+//.
+// CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+// CHECK: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
+// CHECK: [[DBG5]] = distinct !DISubprogram(name: "load", scope: [[META6:![0-9]+]], file: [[META6]], line: 21, type: [[META7:![0-9]+]], scopeLine: 21, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true)
+// CHECK: [[META6]] = !DIFile(filename: "{{.*}}asm.c", directory: {{.*}})
+// CHECK: [[META7]] = !DISubroutineType(types: [[META8:![0-9]+]])
+// CHECK: [[META8]] = !{}
+// CHECK: [[DBG9]] = !DILocation(line: 22, scope: [[DBG5]])
+// CHECK: [[DBG10]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 1, atomRank: 1)
+// CHECK: [[META11]] = !{i64 1458}
+// CHECK: [[DBG12]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 2, atomRank: 1)
+// CHECK: [[DBG13]] = !DILocation(line: 23, scope: [[DBG5]], atomGroup: 3, atomRank: 1)
+// CHECK: [[DBG14]] = distinct !DISubprogram(name: "load2", scope: [[META6]], file: [[META6]], line: 38, type: [[META7]], scopeLine: 38, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true)
+// CHECK: [[DBG15]] = !DILocation(line: 39, scope: [[DBG14]])
+// CHECK: [[DBG16]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 1, atomRank: 1)
+// CHECK: [[META17]] = !{i64 2501}
+// CHECK: [[DBG18]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 2, atomRank: 1)
+// CHECK: [[DBG19]] = !DILocation(line: 40, scope: [[DBG14]], atomGroup: 3, atomRank: 1)
+//.
diff --git a/clang/test/DebugInfo/KeyInstructions/goto.c b/clang/test/DebugInfo/KeyInstructions/goto.c
new file mode 100644
index 000000000000..ead92e600ca5
--- /dev/null
+++ b/clang/test/DebugInfo/KeyInstructions/goto.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c++ -std=c++17 %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \
+// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \
+// RUN: | FileCheck %s
+
+// Check the goto branches get Key Instructions metadata.
+void ext();
+void test_goto(void) {
+// CHECK: br label %dst1, !dbg [[G1R1:!.*]]
+  goto dst1;
+dst1:
+  ext();
+
+  void *ptr = &&dst2;
+// CHECK: br label %indirectgoto, !dbg [[G3R1:!.*]]
+  goto *ptr;
+dst2:
+  ext();
+
+// CHECK: br label %dst3, !dbg [[G4R1:!.*]]
+  goto *&&dst3;
+dst3:
+  ext();
+
+  return;
+}
+
+// CHECK: [[G1R1]] = !DILocation(line: 10, scope: ![[#]], atomGroup: 1, atomRank: 1)
+// CHECK: [[G3R1]] = !DILocation(line: 16, scope: ![[#]], atomGroup: 3, atomRank: 1)
+// CHECK: [[G4R1]] = !DILocation(line: 21, scope: ![[#]], atomGroup: 4, atomRank: 1)
diff --git a/clang/test/Driver/DTLTO/dtlto.c b/clang/test/Driver/DTLTO/dtlto.c
new file mode 100644
index 000000000000..96795d9a4e6a
--- /dev/null
+++ b/clang/test/Driver/DTLTO/dtlto.c
@@ -0,0 +1,48 @@
+// REQUIRES: lld
+
+/// Check DTLTO options are forwarded to the linker.
+
+/// Check that options are forwarded as expected with -fthinlto-distributor=.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \
+// RUN:   -fthinlto-distributor=d.exe -Werror 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=FORWARD
+
+// FORWARD: ld.lld
+// FORWARD-SAME: "--thinlto-distributor=d.exe"
+// FORWARD-SAME: "--thinlto-remote-compiler={{[^"]+}}"
+// FORWARD-SAME: "--thinlto-distributor-arg=a1"
+// FORWARD-SAME: "--thinlto-distributor-arg=a2"
+// FORWARD-SAME: "--thinlto-distributor-arg=a3"
+
+/// Check that options are not added without -fthinlto-distributor= and
+/// that a warning is issued for unused -Xthinlto-distributor options.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a1'
+// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a2,a3'
+// NODIST: ld.lld
+
+/// Check the expected arguments are forwarded by default with only
+/// -fthinlto-distributor=.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -fthinlto-distributor=d.exe -Werror 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=DEFAULT --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// DEFAULT: ld.lld
+// DEFAULT-SAME: "--thinlto-distributor=d.exe"
+// DEFAULT-SAME: "--thinlto-remote-compiler={{.*}}clang{{[^\"]*}}"
+
+/// Check that nothing is forwarded when the compiler is not in LTO mode, and that
+/// appropriate unused option warnings are issued.
+// RUN: %clang %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -fthinlto-distributor=d.exe 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=NOFLTO --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// NOFLTO: warning: argument unused during compilation: '-fthinlto-distributor=d.exe'
+// NOFLTO: ld.lld
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index cfad4b8eb682..512b5a883cd5 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -11,7 +11,7 @@
 // LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// LLD-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -30,7 +30,7 @@
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // C-AARCH64-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// C-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// C-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -47,7 +47,7 @@
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64linux" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64elf" "-EL"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -67,7 +67,7 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // CXX-AARCH64-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -86,7 +86,7 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -105,7 +105,7 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -122,7 +122,7 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
diff --git a/clang/test/Driver/arch-arm64e.c b/clang/test/Driver/arch-arm64e.c
index 0fb12d4dcc5e..39006d203989 100644
--- a/clang/test/Driver/arch-arm64e.c
+++ b/clang/test/Driver/arch-arm64e.c
@@ -2,11 +2,20 @@
 
 // RUN: %clang -target arm64-apple-darwin -c %s -### 2>&1 | FileCheck %s --check-prefix NONE
 // NONE: "-cc1"
-// NONE-NOT: "-fptrauth-intrinsics"
+
 // NONE-NOT: "-fptrauth-calls"
 // NONE-NOT: "-fptrauth-returns"
+// NONE-NOT: "-fptrauth-intrinsics"
 // NONE-NOT: "-fptrauth-indirect-gotos"
 // NONE-NOT: "-fptrauth-auth-traps"
+// NONE-NOT: "-fptrauth-vtable-pointer-address-discrimination"
+// NONE-NOT: "-fptrauth-vtable-pointer-type-discrimination"
+// NONE-NOT: "-fptrauth-objc-isa"
+// NONE-NOT: "-fptrauth-objc-class-ro"
+// NONE-NOT: "-fptrauth-objc-interface-sel"
+
+// Final catch-all in case any new ptrauth flags are added
+// NONE-NOT: "-fptrauth"
 
 // RUN: %clang -target arm64-apple-darwin -fptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix CALL
 // CALL: "-cc1"{{.*}} {{.*}} "-fptrauth-calls"
@@ -23,39 +32,39 @@
 // RUN: %clang -target arm64-apple-darwin -fptrauth-auth-traps -c %s -### 2>&1 | FileCheck %s --check-prefix TRAPS
 // TRAPS: "-cc1"{{.*}} {{.*}} "-fptrauth-auth-traps"
 
-
 // Check the arm64e defaults.
 
 // RUN: %clang -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
+// RUN: %clang -target arm64e-apple-macos -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULTMAC
+// RUN: %if system-darwin && target={{.*}}-{{darwin|macos}}{{.*}} %{ %clang -target arm64e-apple-macos -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULTARCH %}
 // RUN: %clang -mkernel -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
 // RUN: %clang -fapple-kext -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
-// DEFAULT: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULT: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULTMAC: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-m1"{{.*}}
+// DEFAULTARCH: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel"
 
 // RUN: %clang -target arm64e-apple-none-macho -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-MACHO
-// DEFAULT-MACHO: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULT-MACHO: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"{{.*}}
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
-// RUN: %clang -mkernel -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
-// RUN: %clang -fapple-kext -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
 // DEFAULT-NOCALL-NOT: "-fptrauth-calls"
-// DEFAULT-NOCALL: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"
+// DEFAULT-NOCALL: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-returns -c %s -### 2>&1 | FileCheck %s --check-prefix NORET
 
 // NORET-NOT: "-fptrauth-returns"
-// NORET: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"
+// NORET: "-fptrauth-calls" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-intrinsics -c %s -### 2>&1 | FileCheck %s --check-prefix NOINTRIN
 
-// NOINTRIN: "-fptrauth-returns"
 // NOINTRIN-NOT: "-fptrauth-intrinsics"
-// NOINTRIN: "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// NOINTRIN: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-auth-traps -c %s -### 2>&1 | FileCheck %s --check-prefix NOTRAP
-// NOTRAP: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" {{.*}}"-target-cpu" "apple-a12"
+// NOTRAP: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // Check the CPU defaults and overrides.
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index c367594b0a75..9005992f2b75 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -10,7 +10,7 @@
 // LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// LLD-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -29,7 +29,7 @@
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // C-ARM-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// C-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// C-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -46,7 +46,7 @@
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf" "-EL"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -67,7 +67,7 @@
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // CXX-ARM-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -87,7 +87,7 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -106,7 +106,7 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // CXX-ARM-BAREMETAL-LIBCXX: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -123,7 +123,7 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 4dc320191317..8b5ab4355087 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -17,7 +17,7 @@
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-V6M-C-NEXT: ld{{(.exe)?}}"
 // CHECK-V6M-C-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-C-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-C-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
@@ -43,7 +43,7 @@
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}"
-// CHECK-V6M-TREE-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-TREE-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME "{{.*}}.o"
@@ -60,7 +60,7 @@
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}"
 // CHECK-ARMV7M-PER-TARGET: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7M-PER-TARGET: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-ARMV7M-PER-TARGET: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
@@ -73,7 +73,7 @@
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}"
 // CHECK-V6M-DEFAULTCXX: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-DEFAULTCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-DEFAULTCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
@@ -90,7 +90,7 @@
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX: ld{{(.exe)?}}"
 // CHECK-V6M-LIBCXX-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-LIBCXX-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-LIBCXX-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
@@ -108,7 +108,7 @@
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
 // CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}"
 // CHECK-V6M-LIBSTDCXX-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-LIBSTDCXX-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-LIBSTDCXX-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
@@ -123,7 +123,7 @@
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-NDL: ld{{(.exe)?}}"
 // CHECK-V6M-NDL: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-NDL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-NDL: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -171,7 +171,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
 // CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ARMV7EB: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7EB: "-Bstatic" "-m" "armelfb_linux_eabi" "--be8" "-EB"
+// CHECK-ARMV7EB: "-Bstatic" "-m" "armelfb" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -183,7 +183,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
 // CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ARMV7EL: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7EL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-ARMV7EL: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -196,7 +196,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
 // CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-AARCH64BE: sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-AARCH64BE: "-Bstatic" "-m" "aarch64linuxb" "-EB"
+// CHECK-AARCH64BE: "-Bstatic" "-m" "aarch64elfb" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -209,7 +209,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
 // CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-AARCH64LE: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-AARCH64LE: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CHECK-AARCH64LE: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -257,7 +257,7 @@
 // CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
-// CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
+// CHECK-RV64-SAME: "-o" "{{.*}}.tmp.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -271,7 +271,7 @@
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
-// CHECK-RV64-DEFAULTCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-DEFAULTCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -288,7 +288,7 @@
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
-// CHECK-RV64-LIBCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-LIBCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -305,7 +305,7 @@
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV64-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-LIBSTDCXX-SAME: "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -L some/directory/user/asked/for \
@@ -325,7 +325,7 @@
 // CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
-// CHECK-RV32-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -339,7 +339,7 @@
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
-// CHECK-RV32-DEFAULTCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-DEFAULTCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -355,7 +355,7 @@
 // CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -372,7 +372,7 @@
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
diff --git a/clang/test/Driver/mingw-msvcrt.c b/clang/test/Driver/mingw-msvcrt.c
index 340ce1f57b0f..e1648630476a 100644
--- a/clang/test/Driver/mingw-msvcrt.c
+++ b/clang/test/Driver/mingw-msvcrt.c
@@ -7,10 +7,10 @@
 // CHECK_DEFAULT: "-lmingwex" "-lmsvcrt" "-ladvapi32"
 // CHECK_DEFAULT-SAME: "-lmsvcrt" "-lkernel32" "{{.*}}crtend.o"
 // CHECK_MSVCR120: "-lmsvcr120"
-// CHECK_MSVCR120-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_MSVCR120-SAME: "-lmingwex" "-lmsvcr120" "-ladvapi32"
 // CHECK_UCRTBASE: "-lucrtbase"
-// CHECK_UCRTBASE-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_UCRTBASE-SAME: "-lmingwex" "-lucrtbase" "-ladvapi32"
 // CHECK_UCRT: "-lucrt"
-// CHECK_UCRT-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_UCRT-SAME: "-lmingwex" "-lucrt" "-ladvapi32"
 // CHECK_CRTDLL: "-lcrtdll"
-// CHECK_CRTDLL-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_CRTDLL-SAME: "-lmingwex" "-lcrtdll" "-ladvapi32"
diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c
index 6639e9d2d9d6..1f12cfca9488 100644
--- a/clang/test/Driver/openbsd.c
+++ b/clang/test/Driver/openbsd.c
@@ -127,9 +127,12 @@
 // UNWIND-TABLES: "-funwind-tables=2"
 // NO-UNWIND-TABLES-NOT: "-funwind-tables=2"
 
-// Check that the -X and --no-relax flags are passed to the linker on riscv64
+// Check that the -X and --no-relax flags are passed to the linker
+// RUN: %clang --target=loongarch64-unknown-openbsd -mno-relax -### %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=LA64-FLAGS %s
 // RUN: %clang --target=riscv64-unknown-openbsd -mno-relax -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=RISCV64-FLAGS %s
+// LA64-FLAGS: "-X" "--no-relax"
 // RISCV64-FLAGS: "-X" "--no-relax"
 
 // Check passing LTO flags to the linker
diff --git a/clang/test/Driver/sparc-target-features.c b/clang/test/Driver/sparc-target-features.c
index a839604ff1bc..bd17da112bbd 100644
--- a/clang/test/Driver/sparc-target-features.c
+++ b/clang/test/Driver/sparc-target-features.c
@@ -20,6 +20,11 @@
 
 // RUN: %clang --target=sparc -mvis2 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
 // RUN: %clang --target=sparc -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
+/// Solaris/SPARC defaults to -mvis2
+// RUN: %clang --target=sparc-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
+// RUN: %clang --target=sparcv9-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
+// RUN: %clang --target=sparcv9-sun-solaris2.11 -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
 // VIS2: "-target-feature" "+vis2"
 // NO-VIS2: "-target-feature" "-vis2"
 
@@ -34,4 +39,8 @@
 // SOFT-QUAD-FLOAT: "-target-feature" "-hard-quad-float"
 
 // RUN: %clang --target=sparc -mv8plus %s -### 2>&1 | FileCheck -check-prefix=V8PLUS %s
+/// 32-bit Solaris/SPARC defaults to -mv8plus
+// RUN: %clang --target=sparc-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=V8PLUS %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -mno-v8plus %s -### 2>&1 | FileCheck -check-prefix=NO-V8PLUS %s
 // V8PLUS: "-target-feature" "+v8plus"
+// NO-V8PLUS-NOT: "-target-feature" "+v8plus"
diff --git a/clang/test/Modules/ExtDebugInfo.cpp b/clang/test/Modules/ExtDebugInfo.cpp
index 184973bc1783..3e74e2291d5e 100644
--- a/clang/test/Modules/ExtDebugInfo.cpp
+++ b/clang/test/Modules/ExtDebugInfo.cpp
@@ -8,7 +8,7 @@
 // RUN:     -fmodule-format=obj -fimplicit-module-maps -DMODULES \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t-mod.ll
-// RUN: cat %t-mod.ll |  FileCheck %s
+// RUN: cat %t-mod.ll |  FileCheck %s --check-prefix=CHECK %if target={{.*-(win|mingw|cyg).*}} %{--check-prefix=CHECKCOFF%} %else %{--check-prefix=CHECKELF%}
 
 // PCH:
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodule-format=obj -emit-pch -I%S/Inputs \
@@ -18,7 +18,7 @@
 // RUN:     -dwarf-ext-refs -fmodule-format=obj \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -include-pch %t.pch %s -emit-llvm -o %t-pch.ll
-// RUN: cat %t-pch.ll |  FileCheck %s
+// RUN: cat %t-pch.ll |  FileCheck %s --check-prefix=CHECK %if target={{.*-(win|mingw|cyg).*}} %{--check-prefix=CHECKCOFF%} %else %{--check-prefix=CHECKELF%}
 // RUN: cat %t-pch.ll |  FileCheck %s --check-prefix=CHECK-PCH
 
 #ifdef MODULES
@@ -208,9 +208,9 @@ void foo() {
 // CHECK-SAME:              name: "InAnonymousNamespace", {{.*}}DIFlagFwdDecl)
 
 // There is a full definition of the type available in the module.
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
-// CHECK-SAME:             DIFlagFwdDecl
-// CHECK-SAME:             identifier: "_ZTS7Virtual")
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
+// CHECKELF-SAME:             DIFlagFwdDecl
+// CHECKELF-SAME:             identifier: "_ZTS7Virtual")
 
 // CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !{{[0-9]+}}, entity: ![[STRUCT]], file: ![[CPP]], line: 50)
 
@@ -222,3 +222,8 @@ void foo() {
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A",
 // CHECK-SAME:             DIFlagFwdDecl
+
+// There is a full definition of the type available in the module.
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
+// CHECKCOFF-SAME:             DIFlagFwdDecl
+// CHECKCOFF-SAME:             identifier: "_ZTS7Virtual")
diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c
index 4fead33bd826..125872a001ba 100644
--- a/clang/test/Preprocessor/init-mips.c
+++ b/clang/test/Preprocessor/init-mips.c
@@ -80,10 +80,10 @@
 // MIPS32BE:#define __INTMAX_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INTMAX_TYPE__ long long int
 // MIPS32BE:#define __INTMAX_WIDTH__ 64
-// MIPS32BE:#define __INTPTR_FMTd__ "ld"
-// MIPS32BE:#define __INTPTR_FMTi__ "li"
-// MIPS32BE:#define __INTPTR_MAX__ 2147483647L
-// MIPS32BE:#define __INTPTR_TYPE__ long int
+// MIPS32BE:#define __INTPTR_FMTd__ "d"
+// MIPS32BE:#define __INTPTR_FMTi__ "i"
+// MIPS32BE:#define __INTPTR_MAX__ 2147483647
+// MIPS32BE:#define __INTPTR_TYPE__ int
 // MIPS32BE:#define __INTPTR_WIDTH__ 32
 // MIPS32BE:#define __INT_FAST16_FMTd__ "hd"
 // MIPS32BE:#define __INT_FAST16_FMTi__ "hi"
@@ -185,8 +185,8 @@
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINTMAX_WIDTH__ 64
-// MIPS32BE:#define __UINTPTR_MAX__ 4294967295UL
-// MIPS32BE:#define __UINTPTR_TYPE__ long unsigned int
+// MIPS32BE:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32BE:#define __UINTPTR_TYPE__ unsigned int
 // MIPS32BE:#define __UINTPTR_WIDTH__ 32
 // MIPS32BE:#define __UINT_FAST16_MAX__ 65535
 // MIPS32BE:#define __UINT_FAST16_TYPE__ unsigned short
@@ -300,10 +300,10 @@
 // MIPS32EL:#define __INTMAX_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INTMAX_TYPE__ long long int
 // MIPS32EL:#define __INTMAX_WIDTH__ 64
-// MIPS32EL:#define __INTPTR_FMTd__ "ld"
-// MIPS32EL:#define __INTPTR_FMTi__ "li"
-// MIPS32EL:#define __INTPTR_MAX__ 2147483647L
-// MIPS32EL:#define __INTPTR_TYPE__ long int
+// MIPS32EL:#define __INTPTR_FMTd__ "d"
+// MIPS32EL:#define __INTPTR_FMTi__ "i"
+// MIPS32EL:#define __INTPTR_MAX__ 2147483647
+// MIPS32EL:#define __INTPTR_TYPE__ int
 // MIPS32EL:#define __INTPTR_WIDTH__ 32
 // MIPS32EL:#define __INT_FAST16_FMTd__ "hd"
 // MIPS32EL:#define __INT_FAST16_FMTi__ "hi"
@@ -402,8 +402,8 @@
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINTMAX_WIDTH__ 64
-// MIPS32EL:#define __UINTPTR_MAX__ 4294967295UL
-// MIPS32EL:#define __UINTPTR_TYPE__ long unsigned int
+// MIPS32EL:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32EL:#define __UINTPTR_TYPE__ unsigned int
 // MIPS32EL:#define __UINTPTR_WIDTH__ 32
 // MIPS32EL:#define __UINT_FAST16_MAX__ 65535
 // MIPS32EL:#define __UINT_FAST16_TYPE__ unsigned short
@@ -547,10 +547,10 @@
 // MIPSN32BE: #define __INTMAX_MAX__ 9223372036854775807LL
 // MIPSN32BE: #define __INTMAX_TYPE__ long long int
 // MIPSN32BE: #define __INTMAX_WIDTH__ 64
-// MIPSN32BE: #define __INTPTR_FMTd__ "ld"
-// MIPSN32BE: #define __INTPTR_FMTi__ "li"
-// MIPSN32BE: #define __INTPTR_MAX__ 2147483647L
-// MIPSN32BE: #define __INTPTR_TYPE__ long int
+// MIPSN32BE: #define __INTPTR_FMTd__ "d"
+// MIPSN32BE: #define __INTPTR_FMTi__ "i"
+// MIPSN32BE: #define __INTPTR_MAX__ 2147483647
+// MIPSN32BE: #define __INTPTR_TYPE__ int
 // MIPSN32BE: #define __INTPTR_WIDTH__ 32
 // MIPSN32BE: #define __INT_FAST16_FMTd__ "hd"
 // MIPSN32BE: #define __INT_FAST16_FMTi__ "hi"
@@ -684,12 +684,12 @@
 // MIPSN32BE: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPSN32BE: #define __UINTMAX_TYPE__ long long unsigned int
 // MIPSN32BE: #define __UINTMAX_WIDTH__ 64
-// MIPSN32BE: #define __UINTPTR_FMTX__ "lX"
-// MIPSN32BE: #define __UINTPTR_FMTo__ "lo"
-// MIPSN32BE: #define __UINTPTR_FMTu__ "lu"
-// MIPSN32BE: #define __UINTPTR_FMTx__ "lx"
-// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295UL
-// MIPSN32BE: #define __UINTPTR_TYPE__ long unsigned int
+// MIPSN32BE: #define __UINTPTR_FMTX__ "X"
+// MIPSN32BE: #define __UINTPTR_FMTo__ "o"
+// MIPSN32BE: #define __UINTPTR_FMTu__ "u"
+// MIPSN32BE: #define __UINTPTR_FMTx__ "x"
+// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295U
+// MIPSN32BE: #define __UINTPTR_TYPE__ unsigned int
 // MIPSN32BE: #define __UINTPTR_WIDTH__ 32
 // MIPSN32BE: #define __UINT_FAST16_FMTX__ "hX"
 // MIPSN32BE: #define __UINT_FAST16_FMTo__ "ho"
@@ -864,10 +864,10 @@
 // MIPSN32EL: #define __INTMAX_MAX__ 9223372036854775807LL
 // MIPSN32EL: #define __INTMAX_TYPE__ long long int
 // MIPSN32EL: #define __INTMAX_WIDTH__ 64
-// MIPSN32EL: #define __INTPTR_FMTd__ "ld"
-// MIPSN32EL: #define __INTPTR_FMTi__ "li"
-// MIPSN32EL: #define __INTPTR_MAX__ 2147483647L
-// MIPSN32EL: #define __INTPTR_TYPE__ long int
+// MIPSN32EL: #define __INTPTR_FMTd__ "d"
+// MIPSN32EL: #define __INTPTR_FMTi__ "i"
+// MIPSN32EL: #define __INTPTR_MAX__ 2147483647
+// MIPSN32EL: #define __INTPTR_TYPE__ int
 // MIPSN32EL: #define __INTPTR_WIDTH__ 32
 // MIPSN32EL: #define __INT_FAST16_FMTd__ "hd"
 // MIPSN32EL: #define __INT_FAST16_FMTi__ "hi"
@@ -1001,12 +1001,12 @@
 // MIPSN32EL: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPSN32EL: #define __UINTMAX_TYPE__ long long unsigned int
 // MIPSN32EL: #define __UINTMAX_WIDTH__ 64
-// MIPSN32EL: #define __UINTPTR_FMTX__ "lX"
-// MIPSN32EL: #define __UINTPTR_FMTo__ "lo"
-// MIPSN32EL: #define __UINTPTR_FMTu__ "lu"
-// MIPSN32EL: #define __UINTPTR_FMTx__ "lx"
-// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295UL
-// MIPSN32EL: #define __UINTPTR_TYPE__ long unsigned int
+// MIPSN32EL: #define __UINTPTR_FMTX__ "X"
+// MIPSN32EL: #define __UINTPTR_FMTo__ "o"
+// MIPSN32EL: #define __UINTPTR_FMTu__ "u"
+// MIPSN32EL: #define __UINTPTR_FMTx__ "x"
+// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295U
+// MIPSN32EL: #define __UINTPTR_TYPE__ unsigned int
 // MIPSN32EL: #define __UINTPTR_WIDTH__ 32
 // MIPSN32EL: #define __UINT_FAST16_FMTX__ "hX"
 // MIPSN32EL: #define __UINT_FAST16_FMTo__ "ho"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index bed39dc3e34d..7e0df9614136 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,6 +1622,14 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
+// RUN: %clang_cc1 -x c -std=c11 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// RUN: %clang_cc1 -x c -std=gnu11 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// RUN: %clang_cc1 -x c -std=c17 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// DRAGONFLY-STDC:#define __STDC_NO_THREADS__ 1
+//
+// RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC-N %s
+// DRAGONFLY-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
+//
 // RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
 // RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
 // RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
diff --git a/clang/test/Preprocessor/ptrauth_extension.c b/clang/test/Preprocessor/ptrauth_extension.c
index d6b79187ba62..3267b0786c28 100644
--- a/clang/test/Preprocessor/ptrauth_extension.c
+++ b/clang/test/Preprocessor/ptrauth_extension.c
@@ -4,10 +4,32 @@
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \
 // RUN:   FileCheck %s --check-prefixes=NOINTRIN
 
-#if __has_extension(ptrauth_qualifier)
-// INTRIN: has_ptrauth_qualifier
-void has_ptrauth_qualifier() {}
-#else
+// RUN: %clang_cc1 -E %s -DIS_DARWIN -triple=arm64e-apple-darwin -fptrauth-intrinsics | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,INTRIN_MAC
+
+// RUN: %clang_cc1 -E %s -DIS_DARWIN -triple=arm64e-apple-darwin -fptrauth-calls | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN
+
+#if defined(IS_DARWIN) && __has_extension(ptrauth_qualifier)
+// INTRIN_MAC: has_ptrauth_qualifier1
+void has_ptrauth_qualifier1() {}
+#ifndef __PTRAUTH__
+#error ptrauth_qualifier extension present without predefined test macro
+#endif
+#endif
+#if defined(IS_DARWIN) && __has_feature(ptrauth_qualifier)
+// INTRIN_MAC: has_ptrauth_qualifier2
+void has_ptrauth_qualifier2() {}
+#ifndef __PTRAUTH__
+#error ptrauth_qualifier extension present without predefined test macro
+#endif
+#endif
+#if defined(__PTRAUTH__)
+// INTRIN: has_ptrauth_qualifier3
+void has_ptrauth_qualifier3() {}
+#endif
+
+#if !defined(__PTRAUTH__) && !__has_feature(ptrauth_qualifier) && !__has_extension(ptrauth_qualifier)
 // NOINTRIN: no_ptrauth_qualifier
 void no_ptrauth_qualifier() {}
 #endif
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index a440791d6cc6..cebea4188415 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -34,7 +34,7 @@
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-elf-got | \
 // RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,ELFGOT
 
-#if __has_feature(ptrauth_intrinsics)
+#if defined(__PTRAUTH__)
 // INTRIN: has_ptrauth_intrinsics
 void has_ptrauth_intrinsics() {}
 #else
@@ -130,3 +130,11 @@ void has_ptrauth_elf_got() {}
 // NOELFGOT: no_ptrauth_elf_got
 void no_ptrauth_elf_got() {}
 #endif
+
+#if __has_feature(ptrauth_objc_signable_class)
+// INTRIN: has_ptrauth_objc_signable_class
+void has_ptrauth_objc_signable_class(){}
+#else
+// NOINTRIN: no_ptrauth_objc_signable_class
+void no_ptrauth_objc_signable_class(){}
+#endif
diff --git a/clang/test/Preprocessor/stdint.c b/clang/test/Preprocessor/stdint.c
index 899ff59bf0b6..9f982a3a94fd 100644
--- a/clang/test/Preprocessor/stdint.c
+++ b/clang/test/Preprocessor/stdint.c
@@ -350,8 +350,8 @@
 // MIPS:typedef int8_t int_fast8_t;
 // MIPS:typedef uint8_t uint_fast8_t;
 //
-// MIPS:typedef long int intptr_t;
-// MIPS:typedef long unsigned int uintptr_t;
+// MIPS:typedef int intptr_t;
+// MIPS:typedef unsigned int uintptr_t;
 //
 // MIPS:typedef long long int intmax_t;
 // MIPS:typedef long long unsigned int uintmax_t;
@@ -396,9 +396,9 @@
 // MIPS:INT_FAST64_MAX_ 9223372036854775807LL
 // MIPS:UINT_FAST64_MAX_ 18446744073709551615ULL
 //
-// MIPS:INTPTR_MIN_ (-2147483647L -1)
-// MIPS:INTPTR_MAX_ 2147483647L
-// MIPS:UINTPTR_MAX_ 4294967295UL
+// MIPS:INTPTR_MIN_ (-2147483647 -1)
+// MIPS:INTPTR_MAX_ 2147483647
+// MIPS:UINTPTR_MAX_ 4294967295U
 // MIPS:PTRDIFF_MIN_ (-2147483647 -1)
 // MIPS:PTRDIFF_MAX_ 2147483647
 // MIPS:SIZE_MAX_ 4294967295U
diff --git a/clang/test/Sema/attr-nonstring.c b/clang/test/Sema/attr-nonstring.c
index 3838aa3bbee1..fe7b6d259dd7 100644
--- a/clang/test/Sema/attr-nonstring.c
+++ b/clang/test/Sema/attr-nonstring.c
@@ -229,3 +229,11 @@ struct Outer o2[] = {
     }
   }
 };
+
+// The attribute also works with a pointer type, not just an array type.
+__attribute__((nonstring)) char *ptr1;
+__attribute__((nonstring)) const unsigned char *ptr2;
+struct GH150951 {
+  __attribute__((nonstring)) char *ptr1;
+  __attribute__((nonstring)) const unsigned char *ptr2;
+};
diff --git a/clang/test/Sema/for.c b/clang/test/Sema/for.c
index e16169aac0c4..35c4720ef330 100644
--- a/clang/test/Sema/for.c
+++ b/clang/test/Sema/for.c
@@ -26,6 +26,5 @@ void b11 (void) { for (static _Thread_local struct { int i; } s;s.i;); } /* c11-
 #endif
 
 void b12(void) {
-  for(_Static_assert(1, "");;) {} /* c11-warning {{non-variable declaration in 'for' loop is a C23 extension}}
-                                     c23-warning {{non-variable declaration in 'for' loop is incompatible with C standards before C23}} */
+  for(_Static_assert(1, "");;) {} /* okay, _Static_assert declares *no* identifiers */
 }
diff --git a/clang/test/Sema/implicit-void-ptr-cast.c b/clang/test/Sema/implicit-void-ptr-cast.c
index 3c3e153d1dbd..d5e949719144 100644
--- a/clang/test/Sema/implicit-void-ptr-cast.c
+++ b/clang/test/Sema/implicit-void-ptr-cast.c
@@ -82,3 +82,15 @@ void more(void) {
   ptr3 = SOMETHING_THAT_IS_NOT_NULL; // c-warning {{implicit conversion when assigning to 'char *' from type 'void *' is not permitted in C++}} \
                                         cxx-error {{assigning to 'char *' from incompatible type 'void *'}}
 }
+
+void gh154157(void) {
+  #define ATOMIC_VAR_INIT(value) (value)
+
+  typedef const struct T * T_Ref;
+  static T_Ref _Atomic x = ATOMIC_VAR_INIT((void*)NULL); // c-warning {{implicit conversion when initializing '_Atomic(T_Ref)' with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type '_Atomic(T_Ref)' with an rvalue of type 'void *'}}
+  static T_Ref const y = ATOMIC_VAR_INIT((void*)NULL);   // c-warning {{implicit conversion when initializing 'const T_Ref' (aka 'const struct T *const') with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type 'const T_Ref' (aka 'const T *const') with an rvalue of type 'void *'}}
+  static T_Ref z = ATOMIC_VAR_INIT((void*)NULL);         // c-warning {{implicit conversion when initializing 'T_Ref' (aka 'const struct T *') with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type 'T_Ref' (aka 'const T *') with an rvalue of type 'void *'}}
+}
diff --git a/clang/test/Sema/ptrauth-qualifier.c b/clang/test/Sema/ptrauth-qualifier.c
index 5d932b724f07..3e568ce9f37e 100644
--- a/clang/test/Sema/ptrauth-qualifier.c
+++ b/clang/test/Sema/ptrauth-qualifier.c
@@ -1,13 +1,25 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -DIS_DARWIN -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
 
-#if !__has_extension(ptrauth_qualifier)
+#if defined(IS_DARWIN) && !__has_extension(ptrauth_qualifier)
 // This error means that the __ptrauth qualifier availability test says  that it
 // is not available. This error is not expected in the output, if it is seen
 // there is a feature detection regression.
 #error __ptrauth qualifier not enabled
 #endif
 
+#if defined(IS_DARWIN) && !__has_feature(ptrauth_qualifier)
+// This error means that the __has_feature test for ptrauth_qualifier has
+// failed, despite it being expected on darwin.
+#error __ptrauth qualifier not enabled
+#elif !defined(IS_DARWIN) && (__has_feature(ptrauth_qualifier) || __has_extension(ptrauth_qualifier))
+#error ptrauth_qualifier labeled a feature on a non-darwin platform
+#endif
+
+#if !defined (__PTRAUTH__)
+#error __PTRAUTH__ test macro not defined when ptrauth is enabled
+#endif
+
 #if __aarch64__
 #define VALID_CODE_KEY 0
 #define VALID_DATA_KEY 2
diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
index 38dfdb98f08f..a956386ae933 100644
--- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
+++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-lifetime-safety -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s
 // REQUIRES: asserts
 
 struct MyObj {
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index c390fee1c38d..935409131e18 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -2615,3 +2615,19 @@ namespace DoubleCapture {
     };
   }
 }
+
+namespace GH154567 {
+  struct T {
+    int i;
+  };
+
+  struct S {
+    struct { // expected-warning {{GNU extension}}
+      T val;
+    };
+    constexpr S() : val() {}
+  };
+
+  constexpr S s{};
+  static_assert(s.val.i == 0, "");
+}
diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index e16a69df3830..e93b98c185a8 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1321,3 +1321,18 @@ constexpr bool check = different_in_loop();
   // expected-error@-1 {{}} expected-note@-1 {{in call}}
 
 }
+
+namespace comparison_dead_variable {
+  constexpr bool f() {
+    int *p1 = 0, *p2 = 0;
+    {
+        int x = 0; p1 = &x;
+    }
+    {
+        int x = 0; p2 = &x;
+    }
+    return p1 != p2;
+  }
+  // FIXME: This should fail.
+  static_assert(f(),"");
+}
diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
index dffb386f530f..312a77830420 100644
--- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp
+++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
@@ -319,7 +319,7 @@ namespace casting {
 }
 
 namespace pointer_comparisons {
-  extern int &extern_n; // interpreter-note 2 {{declared here}}
+  extern int &extern_n; // interpreter-note 4 {{declared here}}
   extern int &extern_n2;
   constexpr int f1(bool b, int& n) {
     if (b) {
@@ -330,14 +330,70 @@ namespace pointer_comparisons {
   // FIXME: interpreter incorrectly rejects; both sides are the same constexpr-unknown value.
   static_assert(f1(false, extern_n)); // interpreter-error {{static assertion expression is not an integral constant expression}} \
                                       // interpreter-note {{initializer of 'extern_n' is unknown}}
-  // FIXME: We should diagnose this: we don't know if the references bind
-  // to the same object.
-  static_assert(&extern_n != &extern_n2); // interpreter-error {{static assertion expression is not an integral constant expression}} \
+  static_assert(&extern_n != &extern_n2); // expected-error {{static assertion expression is not an integral constant expression}} \
+                                          // nointerpreter-note {{comparison between pointers to unrelated objects '&extern_n' and '&extern_n2' has unspecified value}} \
                                           // interpreter-note {{initializer of 'extern_n' is unknown}}
   void f2(const int &n) {
-    // FIXME: We should not diagnose this: the two objects provably have
-    // different addresses because the lifetime of "n" extends across
-    // the initialization.
-    constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}}
+    constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}} \
+                                // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&n' has unspecified value}}
+    // Distinct variables are not equal, even if they're local variables.
+    constexpr int y = &x == &y;
+    static_assert(!y);
   }
+  constexpr int f3() {
+    int x;
+    return &x == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&extern_n' has unspecified value}} \
+                            // interpreter-note {{initializer of 'extern_n' is unknown}}
+  }
+  static_assert(!f3()); // expected-error {{static assertion expression is not an integral constant expression}} \
+                        // expected-note {{in call to 'f3()'}}
+  constexpr int f4() {
+    int *p = new int;
+    bool b = p == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&{*new int#0}' and '&extern_n' has unspecified value}} \
+                             // interpreter-note {{initializer of 'extern_n' is unknown}}
+    delete p;
+    return b;
+  }
+  static_assert(!f4()); // expected-error {{static assertion expression is not an integral constant expression}} \
+                        // expected-note {{in call to 'f4()'}}
+}
+
+namespace GH149188 {
+namespace enable_if_1 {
+  template <__SIZE_TYPE__ N>
+  constexpr void foo(const char (&Str)[N])
+  __attribute((enable_if(__builtin_strlen(Str), ""))) {}
+
+  void x() {
+      foo("1234");
+  }
+}
+
+namespace enable_if_2 {
+  constexpr const char (&f())[];
+  extern const char (&Str)[];
+  constexpr int foo()
+  __attribute((enable_if(__builtin_strlen(Str), "")))
+  {return __builtin_strlen(Str);}
+
+  constexpr const char (&f())[] {return "a";}
+  constexpr const char (&Str)[] = f();
+  void x() {
+      constexpr int x = foo();
+  }
+}
+}
+
+namespace GH150015 {
+  extern int (& c)[8]; // interpreter-note {{declared here}}
+  constexpr int x = c <= c+8; // interpreter-error {{constexpr variable 'x' must be initialized by a constant expression}} \
+                              // interpreter-note {{initializer of 'c' is unknown}}
+
+  struct X {};
+  struct Y {};
+  struct Z : X, Y {};
+  extern Z &z; // interpreter-note{{declared here}}
+  constexpr int bases = (void*)(X*)&z <= (Y*)&z; // expected-error {{constexpr variable 'bases' must be initialized by a constant expression}} \
+                                                 // nointerpreter-note {{comparison of addresses of subobjects of different base classes has unspecified value}} \
+                                                 // interpreter-note {{initializer of 'z' is unknown}}
 }
diff --git a/clang/test/SemaCXX/constexpr-never-constant.cpp b/clang/test/SemaCXX/constexpr-never-constant.cpp
index 307810ee263d..5756bb647ce8 100644
--- a/clang/test/SemaCXX/constexpr-never-constant.cpp
+++ b/clang/test/SemaCXX/constexpr-never-constant.cpp
@@ -24,3 +24,10 @@ constexpr void other_func() {
 
   throw 12;
 }
+
+namespace GH149041 {
+  // Make sure these don't trigger the diagnostic.
+  extern const bool& b;
+  constexpr bool fun1() { return b; }
+  constexpr bool fun2(const bool& b) { return b; }
+}
diff --git a/clang/test/SemaCXX/noreturn-weverything.c b/clang/test/SemaCXX/noreturn-weverything.c
new file mode 100644
index 000000000000..92a587d39563
--- /dev/null
+++ b/clang/test/SemaCXX/noreturn-weverything.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Weverything
+
+void free(void *);
+typedef void (*set_free_func)(void *);
+struct Method {
+  int nparams;
+  int *param;
+};
+void selelem_free_method(struct Method* method, void* data) {
+    set_free_func free_func = 0;
+    for (int i = 0; i < method->nparams; ++i)
+        free(&method->param[i]);
+    if (data && free_func)
+        free_func(data);
+}
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 135865c8450f..46c367084852 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -283,31 +283,3 @@ void f() {
 }
 
 #endif
-
-namespace GH147374 {
-
-struct String {};
-template <typename T> void operator+(T, String &&) = delete;
-
-struct Bar {
-    void operator+(String) const; // expected-note {{candidate function}}
-    friend void operator+(Bar, String) {};  // expected-note {{candidate function}}
-};
-
-struct Baz {
-    void operator+(String); // expected-note {{candidate function}}
-    friend void operator+(Baz, String) {}; // expected-note {{candidate function}}
-};
-
-void test() {
-    Bar a;
-    String b;
-    a + b;
-    //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Bar' and 'String')}}
-
-    Baz z;
-    z + b;
-    //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Baz' and 'String')}}
-}
-
-}
diff --git a/clang/test/SemaCXX/warn-thread-safety-negative.cpp b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
index 9eabd67e4fc7..0caf6d6139e5 100644
--- a/clang/test/SemaCXX/warn-thread-safety-negative.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
@@ -21,6 +21,15 @@ class LOCKABLE Mutex {
   void AssertReaderHeld() ASSERT_SHARED_LOCK();
 };
 
+class LOCKABLE REENTRANT_CAPABILITY ReentrantMutex {
+public:
+  void Lock() EXCLUSIVE_LOCK_FUNCTION();
+  void Unlock() UNLOCK_FUNCTION();
+
+  // for negative capabilities
+  const ReentrantMutex& operator!() const { return *this; }
+};
+
 class SCOPED_LOCKABLE MutexLock {
 public:
   MutexLock(Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu);
@@ -89,6 +98,29 @@ class Foo {
   }
 };
 
+class Reentrant {
+  ReentrantMutex mu;
+
+public:
+  void acquire() {
+    mu.Lock();   // no warning -- reentrant mutex
+    mu.Unlock();
+  }
+
+  void requireNegative() EXCLUSIVE_LOCKS_REQUIRED(!mu) { // warning?
+    mu.Lock();
+    mu.Unlock();
+  }
+
+  void callRequireNegative() {
+    requireNegative(); // expected-warning{{calling function 'requireNegative' requires negative capability '!mu'}}
+  }
+
+  void callHaveNegative() EXCLUSIVE_LOCKS_REQUIRED(!mu) {
+    requireNegative();
+  }
+};
+
 }  // end namespace SimpleTest
 
 Mutex globalMutex;
diff --git a/clang/test/SemaCXX/wreturn-always-throws.cpp b/clang/test/SemaCXX/wreturn-always-throws.cpp
index addcadd1183d..df7689f7063c 100644
--- a/clang/test/SemaCXX/wreturn-always-throws.cpp
+++ b/clang/test/SemaCXX/wreturn-always-throws.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type -Winvalid-noreturn -verify %s
 // expected-no-diagnostics
 
 namespace std {
@@ -44,3 +44,22 @@ void testTemplates() {
   throwErrorTemplate("ERROR");
   (void)ensureZeroTemplate(42);
 }
+
+// Ensure that explicit specialization of a member function does not inherit
+// the warning from the primary template.
+
+template<typename T>
+struct S {
+  void f();
+  void g();
+};
+
+template<typename T>
+void S<T>::f() { throw 0; } 
+template<>
+void S<int>::f() {}
+
+template<typename T> 
+void S<T>::g() {}  
+template<> 
+void S<int>::g() { throw 0; }
diff --git a/clang/test/SemaObjC/ptrauth-qualifier.m b/clang/test/SemaObjC/ptrauth-qualifier.m
index 74bbe6f09899..67a73bbe4577 100644
--- a/clang/test/SemaObjC/ptrauth-qualifier.m
+++ b/clang/test/SemaObjC/ptrauth-qualifier.m
@@ -1,13 +1,25 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -DIS_DARWIN -fsyntax-only -verify -fptrauth-intrinsics %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics %s
 
-#if !__has_extension(ptrauth_qualifier)
+#if defined(IS_DARWIN) && !__has_extension(ptrauth_qualifier)
 // This error means that the __ptrauth qualifier availability test says  that it
 // is not available. This error is not expected in the output, if it is seen
 // there is a feature detection regression.
 #error __ptrauth qualifier not enabled
 #endif
 
+#if defined(IS_DARWIN) && !__has_feature(ptrauth_qualifier)
+// This error means that the __has_feature test for ptrauth_qualifier has
+// failed, despite it being expected on darwin.
+#error __ptrauth qualifier not enabled
+#elif !defined(IS_DARWIN) && (__has_feature(ptrauth_qualifier) || __has_extension(ptrauth_qualifier))
+#error ptrauth_qualifier labeled a feature on a non-darwin platform
+#endif
+
+#if !defined (__PTRAUTH__)
+#error __PTRAUTH__ test macro not defined when ptrauth is enabled
+#endif
+
 @interface Foo
 // expected-warning@-1 {{class 'Foo' defined without specifying a base class}}
 // expected-note@-2 {{add a super class to fix this problem}}
diff --git a/clang/test/SemaObjCXX/discarded-block-type-inference.mm b/clang/test/SemaObjCXX/discarded-block-type-inference.mm
new file mode 100644
index 000000000000..8e2587724a7f
--- /dev/null
+++ b/clang/test/SemaObjCXX/discarded-block-type-inference.mm
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fobjc-arc -fblocks %s
+
+void  block_receiver(int (^)() );
+
+int f1() {
+  if constexpr (0)
+    (block_receiver)(^{ return 2; });
+  return 1;
+}
+
+int f2() {
+  if constexpr (0)
+    return (^{ return 2; })();
+  return 1;
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 62a4f95d79c7..6d6052791507 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1228,25 +1228,25 @@ template <KnownKind T> struct KnownType {
 
 }
 
-namespace GH115838 {
+namespace CWG2369_Regression_2 {
 
-template<typename T> concept has_x = requires(T t) {{ t.x };};
-
-class Publ { public:    int x = 0; };
-class Priv { private:   int x = 0; };
-class Prot { protected: int x = 0; };
-class Same { protected: int x = 0; };
-
-template<typename T> class D;
-template<typename T> requires ( has_x<T>) class D<T>: public T { public: static constexpr bool has = 1; };
-template<typename T> requires (!has_x<T>) class D<T>: public T { public: static constexpr bool has = 0; };
+template <typename T>
+concept HasFastPropertyForAttribute =
+    requires(T element, int name) { element.propertyForAttribute(name); };
+
+template <typename OwnerType>
+struct SVGPropertyOwnerRegistry {
+  static int fastAnimatedPropertyLookup() {
+    static_assert (HasFastPropertyForAttribute<OwnerType>);
+    return 1;
+  }
+};
 
-// "Same" is identical to "Prot" but queried before used.
-static_assert(!has_x<Same>,  "Protected should be invisible.");
-static_assert(!D<Same>::has, "Protected should be invisible.");
+class SVGCircleElement {
+  friend SVGPropertyOwnerRegistry<SVGCircleElement>;
+  void propertyForAttribute(int);
+};
 
-static_assert( D<Publ>::has, "Public should be visible.");
-static_assert(!D<Priv>::has, "Private should be invisible.");
-static_assert(!D<Prot>::has, "Protected should be invisible.");
+int i = SVGPropertyOwnerRegistry<SVGCircleElement>::fastAnimatedPropertyLookup();
 
 }
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 0953f647426f..f6bc6ee3673c 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -966,3 +966,19 @@ Expand<Type, Invocable<>> _{};
 // CHECK-NEXT:   | `-ParmVarDecl {{.+}} 'T...' pack
 
 }
+
+namespace GH134613 {
+template <typename R> struct Foo {
+  using value_type = R;
+
+  Foo() = default;
+  Foo(Foo<Foo<R>> &&rhs) {}
+};
+
+void main() {
+  auto r1 = Foo(Foo<Foo<int>>{});
+
+  static_assert(__is_same(decltype(r1)::value_type, int));
+}
+
+}
diff --git a/clang/test/bindings/python/bindings.sh b/clang/test/bindings/python/bindings.sh
deleted file mode 100755
index 3f7a51ef9ca4..000000000000
--- a/clang/test/bindings/python/bindings.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/sh
-
-# UNSUPPORTED: !libclang-loadable
-
-# Tests fail on Windows, and need someone knowledgeable to fix.
-# It's not clear whether it's a test or a valid binding problem.
-# XFAIL: target={{.*windows.*}}
-
-# The Python FFI interface is broken on AIX: https://bugs.python.org/issue38628.
-# XFAIL: target={{.*-aix.*}}
-
-# Hexagon has known test failures that need to be addressed.
-# https://reviews.llvm.org/D52840#1265716
-# XFAIL: target={{hexagon-.*}}
-# python SEGVs on Linux/sparc64 when loading libclang.so.  Seems to be an FFI
-# issue, too.
-# XFAIL: target={{sparc.*-.*-linux.*}}
-
-# Tests will fail if cross-compiling for a different target, as tests will try
-# to use the host Python3_EXECUTABLE and make FFI calls to functions in target
-# libraries.
-#
-# FIXME: Consider a solution that allows better control over these tests in
-# a crosscompiling scenario. e.g. registering them with lit to allow them to
-# be explicitly skipped via appropriate LIT_ARGS, or adding a mechanism to
-# allow specifying a python interpreter compiled for the target that could
-# be executed using qemu-user.
-# REQUIRES: native
-
-# SystemZ has broken Python/FFI interface
-# according to https://reviews.llvm.org/D52840#1265716
-# This leads to failures only when Clang is built with GCC apparently, see:
-# https://github.com/llvm/llvm-project/pull/146844#issuecomment-3048291798
-# REQUIRES: !target={{s390x-.*}}
-
-# RUN: env PYTHONPATH=%S/../../../bindings/python \
-# RUN:   CLANG_LIBRARY_PATH=%libdir \
-# RUN:   %python -m unittest discover -s %S/tests
diff --git a/clang/test/bindings/python/lit.local.cfg b/clang/test/bindings/python/lit.local.cfg
deleted file mode 100644
index cc3bdf8ba97d..000000000000
--- a/clang/test/bindings/python/lit.local.cfg
+++ /dev/null
@@ -1,41 +0,0 @@
-def is_libclang_loadable():
-    # Do not try to run if libclang was built with sanitizers because
-    # the sanitizer library will likely be loaded too late to perform
-    # interception and will then fail.
-    # We could use LD_PRELOAD/DYLD_INSERT_LIBRARIES but this isn't
-    # portable so its easier just to not run the tests when building
-    # with ASan.
-    if config.llvm_use_sanitizer != "":
-        return False
-    try:
-        sys.path.append(os.path.join(config.clang_src_dir, "bindings/python"))
-        from clang.cindex import Config
-        conf = Config()
-        Config.set_library_path(config.clang_lib_dir)
-        conf.lib
-        return True
-    except Exception as e:
-        # Expected failure modes are considered benign when nothing can be
-        # done about them.
-        #
-        # Cannot load a 32-bit libclang.so into a 64-bit python.
-        if "wrong ELF class: ELFCLASS32" in str(e):
-            return False
-        # If libclang.so is missing, it must have been disabled intentionally,
-        # e.g. by building with LLVM_ENABLE_PIC=OFF.
-        elif "No such file or directory" in str(e):
-            return False
-        # Unexpected failure modes need to be investigated to either fix an
-        # underlying bug or accept the failure, so return True.  This causes
-        # tests to run and FAIL, drawing developer attention.
-        else:
-            print("warning: unhandled failure in is_libclang_loadable: "
-                  + str(e), file=sys.stderr)
-            return True
-
-if is_libclang_loadable():
-    config.available_features.add("libclang-loadable")
-
-config.substitutions.append(('%libdir', config.clang_lib_dir))
-
-config.suffixes = ['.sh']
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index b6662b66206b..2b1e266f0739 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SOURCES
   Indexing.cpp
   FatalErrorHandler.cpp
   Rewrite.cpp
+  Obsolete.cpp
 
   ADDITIONAL_HEADERS
   CIndexDiagnostic.h
diff --git a/clang/tools/libclang/Obsolete.cpp b/clang/tools/libclang/Obsolete.cpp
new file mode 100644
index 000000000000..3596f76e1be6
--- /dev/null
+++ b/clang/tools/libclang/Obsolete.cpp
@@ -0,0 +1,48 @@
+//===- Obsolete.cpp - Obsolete libclang functions and types -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--------------------------------------------------------------------===//
+//
+// This file contains libclang symbols whose underlying functionality has been
+// removed from Clang, but which need to be kept around so as to retain ABI
+// compatibility.
+//
+//===--------------------------------------------------------------------===//
+
+#include "clang-c/CXString.h"
+#include "clang-c/Index.h"
+#include "clang-c/Platform.h"
+#include "llvm/Support/raw_ostream.h"
+
+extern "C" {
+
+// The ARCMigrate C API was removed from Clang. As builds without arcmt support
+// already did, each stub prints an error (newline-terminated so later stderr
+// output starts on its own line) and returns an empty result where it has one.
+CXRemapping clang_getRemappings(const char *) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang\n";
+  return nullptr;
+}
+
+CXRemapping clang_getRemappingsFromFileList(const char **, unsigned) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang\n";
+  return nullptr;
+}
+
+unsigned clang_remap_getNumFiles(CXRemapping) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang\n";
+  return 0;
+}
+
+void clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang\n";
+}
+
+void clang_remap_dispose(CXRemapping) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang\n";
+}
+
+} // extern "C"
diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map
index d140a71e771a..3d9d2e268a61 100644
--- a/clang/tools/libclang/libclang.map
+++ b/clang/tools/libclang/libclang.map
@@ -327,6 +327,8 @@ LLVM_13 {
     clang_getRange;
     clang_getRangeEnd;
     clang_getRangeStart;
+    clang_getRemappings;
+    clang_getRemappingsFromFileList;
     clang_getResultType;
     clang_getSkippedRanges;
     clang_getSpecializedCursorTemplate;
@@ -387,6 +389,9 @@ LLVM_13 {
     clang_parseTranslationUnit;
     clang_parseTranslationUnit2;
     clang_parseTranslationUnit2FullArgv;
+    clang_remap_dispose;
+    clang_remap_getFilenames;
+    clang_remap_getNumFiles;
     clang_reparseTranslationUnit;
     clang_saveTranslationUnit;
     clang_sortCodeCompletionResults;
@@ -435,12 +440,12 @@ LLVM_20 {
     clang_getTypePrettyPrinted;
     clang_isBeforeInTranslationUnit;
     clang_visitCXXBaseClasses;
-    clang_visitCXXMethods;
 };
 
 LLVM_21 {
   global:
     clang_getFullyQualifiedName;
+    clang_visitCXXMethods;
     clang_Cursor_getGCCAssemblyTemplate;
     clang_Cursor_isGCCAssemblyHasGoto;
     clang_Cursor_getGCCAssemblyNumOutputs;
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index d17109aebc0f..2b17c36f6aa8 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -249,6 +249,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions,
                           AfterFunctionDefinitionName);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterIfMacros);
+  CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterNot);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterOverloadedOperator);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterPlacementOperator);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, BeforeNonEmptyParentheses);
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 0bc1c6d45656..95682f2d8cfd 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -4050,6 +4050,10 @@ TEST_F(FormatTest, FormatsBitfields) {
                "  uchar : 8;\n"
                "  uchar other;\n"
                "};");
+  verifyFormat("struct foo {\n"
+               "  uint8_t i_am_a_bit_field_this_long\n"
+               "      : struct_with_constexpr::i_am_a_constexpr_lengthhhhh;\n"
+               "};");
   FormatStyle Style = getLLVMStyle();
   Style.BitFieldColonSpacing = FormatStyle::BFCS_None;
   verifyFormat("struct Bitfields {\n"
@@ -8571,10 +8575,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
                "operator<<(const SomeLooooooooooooooooooooooooogType &other);");
   verifyGoogleFormat(
       "SomeLoooooooooooooooooooooooooooooogType operator>>(\n"
-      "    const SomeLooooooooogType &a, const SomeLooooooooogType &b);");
+      "    const SomeLooooooooogType& a, const SomeLooooooooogType& b);");
   verifyGoogleFormat(
       "SomeLoooooooooooooooooooooooooooooogType operator<<(\n"
-      "    const SomeLooooooooogType &a, const SomeLooooooooogType &b);");
+      "    const SomeLooooooooogType& a, const SomeLooooooooogType& b);");
 
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa = 1);");
@@ -8583,7 +8587,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyGoogleFormat(
       "typename aaaaaaaaaa<aaaaaa>::aaaaaaaaaaa\n"
       "aaaaaaaaaa<aaaaaa>::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
-      "    bool *aaaaaaaaaaaaaaaaaa, bool *aa) {}");
+      "    bool* aaaaaaaaaaaaaaaaaa, bool* aa) {}");
   verifyGoogleFormat("template <typename T>\n"
                      "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
                      "aaaaaaaaaaaaaaaaaaaaaaa<T>::aaaaaaaaaaaaa(\n"
@@ -8592,7 +8596,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyFormat("extern \"C\" //\n"
                "    void f();");
 
-  FormatStyle Style = getLLVMStyle();
+  auto Style = getLLVMStyle();
   Style.PointerAlignment = FormatStyle::PAS_Left;
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaa* const aaaaaaaaaaaa) {}",
@@ -8600,6 +8604,14 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyFormat("void aaaaaaa(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa*\n"
                "                 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {}",
                Style);
+
+  Style = getLLVMStyleWithColumns(45);
+  Style.PenaltyReturnTypeOnItsOwnLine = 400;
+  verifyFormat("template <bool abool, // a comment\n"
+               "          bool anotherbool>\n"
+               "static inline std::pair<size_t, MyCustomType>\n"
+               "myfunc(const char *buf, const char *&err);",
+               Style);
 }
 
 TEST_F(FormatTest, DontBreakBeforeQualifiedOperator) {
@@ -12891,27 +12903,31 @@ TEST_F(FormatTest, UnderstandsEllipsis) {
 }
 
 TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
+  auto Style = getGoogleStyle();
+  EXPECT_FALSE(Style.DerivePointerAlignment);
+  Style.DerivePointerAlignment = true;
+
   verifyFormat("int *a;\n"
                "int *a;\n"
                "int *a;",
                "int *a;\n"
                "int* a;\n"
                "int *a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("int* a;\n"
                "int* a;\n"
                "int* a;",
                "int* a;\n"
                "int* a;\n"
                "int *a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("int *a;\n"
                "int *a;\n"
                "int *a;",
                "int *a;\n"
                "int * a;\n"
                "int *  a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("auto x = [] {\n"
                "  int *a;\n"
                "  int *a;\n"
@@ -12920,7 +12936,7 @@ TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
                "auto x=[]{int *a;\n"
                "int * a;\n"
                "int *  a;};",
-               getGoogleStyle());
+               Style);
 }
 
 TEST_F(FormatTest, UnderstandsRvalueReferences) {
@@ -13056,7 +13072,7 @@ TEST_F(FormatTest, FormatsCasts) {
   verifyFormat("virtual void foo(char &) const;");
   verifyFormat("virtual void foo(int *a, char *) const;");
   verifyFormat("int a = sizeof(int *) + b;");
-  verifyGoogleFormat("int a = alignof(int *) + b;");
+  verifyGoogleFormat("int a = alignof(int*) + b;");
   verifyFormat("bool b = f(g<int>) && c;");
   verifyFormat("typedef void (*f)(int i) func;");
   verifyFormat("void operator++(int) noexcept;");
@@ -17652,6 +17668,12 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) {
   verifyFormat("int x = int (y);", SomeSpace2);
   verifyFormat("auto lambda = []() { return 0; };", SomeSpace2);
 
+  auto Style = getLLVMStyle();
+  Style.SpaceBeforeParens = FormatStyle::SBPO_Custom;
+  EXPECT_FALSE(Style.SpaceBeforeParensOptions.AfterNot);
+  Style.SpaceBeforeParensOptions.AfterNot = true;
+  verifyFormat("return not (a || b);", Style);
+
   FormatStyle SpaceAfterOverloadedOperator = getLLVMStyle();
   SpaceAfterOverloadedOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom;
   SpaceAfterOverloadedOperator.SpaceBeforeParensOptions
@@ -25419,7 +25441,7 @@ TEST_F(FormatTest, AtomicQualifier) {
   verifyFormat("struct foo {\n"
                "  int a1;\n"
                "  _Atomic(a) a2;\n"
-               "  _Atomic(_Atomic(int) *const) a3;\n"
+               "  _Atomic(_Atomic(int)* const) a3;\n"
                "};",
                Google);
   verifyFormat("_Atomic(uint64_t) a;");
diff --git a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
index b1e42e924e05..67b9cc903790 100644
--- a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
+++ b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
@@ -83,6 +83,9 @@ TEST_F(IntegerLiteralSeparatorTest, SingleQuoteAsSeparator) {
                "d = 5678_km;\n"
                "h = 0xDEF_u16;",
                Style);
+
+  Style.Standard = FormatStyle::LS_Cpp11;
+  verifyFormat("ld = 1234L;", Style);
 }
 
 TEST_F(IntegerLiteralSeparatorTest, UnderscoreAsSeparator) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index e281a4945a86..af94841c820a 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -390,6 +390,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) {
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace);
   EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator);
 
+  Tokens = annotate("bool foo = requires { static_cast<Foo &&>(1); };");
+  ASSERT_EQ(Tokens.size(), 17u) << Tokens;
+  EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference);
+
   Tokens = annotate("return s.operator int *();");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   // Not TT_FunctionDeclarationName.
@@ -614,6 +618,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace);
 
+  Tokens = annotate("class Outer {\n"
+                    "  struct Inner final : Base {};\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 14u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_Unknown); // Not TT_StartOfName
+  EXPECT_TOKEN(Tokens[6], tok::colon, TT_InheritanceColon);
+
   constexpr StringRef Code{"struct EXPORT StructName {};"};
 
   Tokens = annotate(Code);
diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt
index 96ca6dda9cd8..fa5e58f5a893 100644
--- a/clang/unittests/Lex/CMakeLists.txt
+++ b/clang/unittests/Lex/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_unittest(LexTests
   LexerTest.cpp
   LexHLSLRootSignatureTest.cpp
   ModuleDeclStateTest.cpp
+  NoTrivialPPDirectiveTracerTest.cpp
   PPCallbacksTest.cpp
   PPConditionalDirectiveRecordTest.cpp
   PPDependencyDirectivesTest.cpp
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 86df872f6b7d..bb6404c43448 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -796,7 +796,7 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
                                     PP->getSourceManager(), PP->getLangOpts(),
                                     /*IgnoreWhiteSpace=*/false));
-    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation());
     EXPECT_TRUE(Tok.is(tok::hash));
   }
 
@@ -812,7 +812,7 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
                                     PP->getSourceManager(), PP->getLangOpts(),
                                     /*IgnoreWhiteSpace=*/false));
-    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation());
     EXPECT_TRUE(Tok.is(tok::raw_identifier));
     EXPECT_TRUE(Tok.getRawIdentifier() == "FOO");
   }
diff --git a/clang/unittests/Lex/ModuleDeclStateTest.cpp b/clang/unittests/Lex/ModuleDeclStateTest.cpp
index 6ecba4de3187..0c03cfd6d0f8 100644
--- a/clang/unittests/Lex/ModuleDeclStateTest.cpp
+++ b/clang/unittests/Lex/ModuleDeclStateTest.cpp
@@ -61,14 +61,15 @@ class ModuleDeclStateTest : public ::testing::Test {
     Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts);
   }
 
-  std::unique_ptr<Preprocessor>
-  getPreprocessor(const char *source, Language Lang) {
+  std::unique_ptr<Preprocessor> getPreprocessor(const char *source,
+                                                Language Lang) {
     std::unique_ptr<llvm::MemoryBuffer> Buf =
         llvm::MemoryBuffer::getMemBuffer(source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
 
     std::vector<std::string> Includes;
-    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes, LangStandard::lang_cxx20);
+    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes,
+                                 LangStandard::lang_cxx20);
     LangOpts.CPlusPlusModules = true;
     if (Lang != Language::CXX) {
       LangOpts.Modules = true;
@@ -113,12 +114,11 @@ export module foo;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -132,12 +132,11 @@ module foo;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_TRUE(PP->isInImplementationUnit());
@@ -151,12 +150,11 @@ module foo:part;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -170,12 +168,11 @@ export module foo:part;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -189,12 +186,11 @@ export module foo.dot:part.dot;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -208,12 +204,11 @@ TEST_F(ModuleDeclStateTest, NotModule) {
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -234,12 +229,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -261,12 +255,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -287,12 +280,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -305,12 +297,11 @@ TEST_F(ModuleDeclStateTest, ImportAClangNamedModule) {
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::ObjCXX);
 
   std::initializer_list<bool> ImportKinds = {false};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -327,12 +318,11 @@ import M2;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::ObjCXX);
 
   std::initializer_list<bool> ImportKinds = {false, true, false, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)4);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)4);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
diff --git a/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp
new file mode 100644
index 000000000000..8b546fe2a40e
--- /dev/null
+++ b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp
@@ -0,0 +1,196 @@
+//===- unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for Preprocessor::hasSeenNoTrivialPPDirective(): each test lexes the
+// first token of a small main file and checks whether the directives in front
+// of that token were reported as non-trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HeaderSearchOptions.h"
+#include "clang/Lex/ModuleLoader.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "gtest/gtest.h"
+#include <cassert>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+using namespace clang;
+
+namespace {
+// Fixture that owns everything a Preprocessor needs, with a FileManager that
+// is backed by an in-memory file system.
+class NoTrivialPPDirectiveTracerTest : public ::testing::Test {
+protected:
+  NoTrivialPPDirectiveTracerTest()
+      : VFS(llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>()),
+        FileMgr(FileMgrOpts, VFS), DiagID(new DiagnosticIDs()),
+        Diags(DiagID, DiagOpts, new IgnoringDiagConsumer()),
+        SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions) {
+    TargetOpts->Triple = "x86_64-unknown-linux-gnu";
+    Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts);
+  }
+
+  // Adds a regular file named Filename with the contents of source to the
+  // in-memory file system, e.g. so that an #include can find it.
+  void addFile(const char *source, StringRef Filename) {
+    VFS->addFile(Filename, 0, llvm::MemoryBuffer::getMemBuffer(source),
+                 /*User=*/std::nullopt,
+                 /*Group=*/std::nullopt,
+                 llvm::sys::fs::file_type::regular_file);
+  }
+
+  // Makes source the main file, applies the C++20 language defaults for Lang
+  // and returns a Preprocessor that has "." as a non-angled search path. The
+  // caller still has to call Initialize() and EnterMainSourceFile() on it.
+  std::unique_ptr<Preprocessor> getPreprocessor(const char *source,
+                                                Language Lang) {
+    std::unique_ptr<llvm::MemoryBuffer> Buf =
+        llvm::MemoryBuffer::getMemBuffer(source);
+    SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
+
+    std::vector<std::string> Includes;
+    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes,
+                                 LangStandard::lang_cxx20);
+    LangOpts.CPlusPlusModules = true;
+    if (Lang != Language::CXX) {
+      LangOpts.Modules = true;
+      LangOpts.ImplicitModules = true;
+    }
+
+    HeaderInfo.emplace(HSOpts, SourceMgr, Diags, LangOpts, Target.get());
+
+    auto DE = FileMgr.getOptionalDirectoryRef(".");
+    assert(DE);
+    auto DL = DirectoryLookup(*DE, SrcMgr::C_User, /*isFramework=*/false);
+    HeaderInfo->AddSearchPath(DL, /*isAngled=*/false);
+
+    return std::make_unique<Preprocessor>(PPOpts, Diags, LangOpts, SourceMgr,
+                                          *HeaderInfo, ModLoader,
+                                          /*IILookup=*/nullptr,
+                                          /*OwnsHeaderSearch=*/false);
+  }
+
+  // Initializes PP, installs Predefines (if given) as its predefines, enters
+  // the main file and lexes one token. Returns what
+  // PP.hasSeenNoTrivialPPDirective() reports after that token.
+  bool sawNoTrivialPPDirective(Preprocessor &PP,
+                               const char *Predefines = nullptr) {
+    PP.Initialize(*Target);
+    if (Predefines)
+      PP.setPredefines(Predefines);
+    PP.EnterMainSourceFile();
+    Token Tok;
+    PP.Lex(Tok);
+    return PP.hasSeenNoTrivialPPDirective();
+  }
+
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> VFS;
+  FileSystemOptions FileMgrOpts;
+  FileManager FileMgr;
+  IntrusiveRefCntPtr<DiagnosticIDs> DiagID;
+  DiagnosticOptions DiagOpts;
+  DiagnosticsEngine Diags;
+  SourceManager SourceMgr;
+  std::shared_ptr<TargetOptions> TargetOpts;
+  IntrusiveRefCntPtr<TargetInfo> Target;
+  LangOptions LangOpts;
+  TrivialModuleLoader ModLoader;
+  HeaderSearchOptions HSOpts;
+  std::optional<HeaderSearch> HeaderInfo;
+  PreprocessorOptions PPOpts;
+};
+
+// Line markers, #ident and the pragmas below are all expected to be trivial.
+TEST_F(NoTrivialPPDirectiveTracerTest, TrivialDirective) {
+  const char *source = R"(
+    #line 7
+    # 1 __FILE__ 1 3
+    #ident "$Header:$"
+    #pragma comment(lib, "msvcrt.lib")
+    #pragma mark LLVM's world
+    #pragma detect_mismatch("test", "1")
+    #pragma clang __debug dump Test
+    #pragma message "test"
+    #pragma GCC warning "Foo"
+    #pragma GCC error "Foo"
+    #pragma gcc diagnostic push
+    #pragma gcc diagnostic pop
+    #pragma GCC diagnostic ignored "-Wframe-larger-than"
+    #pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable
+    #pragma warning(push)
+    #pragma warning(pop)
+    #pragma execution_character_set(push, "UTF-8")
+    #pragma execution_character_set(pop)
+    #pragma clang assume_nonnull begin
+    #pragma clang assume_nonnull end
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  EXPECT_FALSE(sawNoTrivialPPDirective(*PP));
+}
+
+// An #include is expected to be non-trivial.
+TEST_F(NoTrivialPPDirectiveTracerTest, IncludeDirective) {
+  const char *source = R"(
+    #include "header.h"
+    int foo;
+  )";
+  const char *header = R"(
+    #ifndef HEADER_H
+    #define HEADER_H
+    #endif // HEADER_H
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  addFile(header, "header.h");
+  EXPECT_TRUE(sawNoTrivialPPDirective(*PP));
+}
+
+// A #define is expected to be non-trivial.
+TEST_F(NoTrivialPPDirectiveTracerTest, DefineDirective) {
+  const char *source = R"(
+    #define FOO
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  EXPECT_TRUE(sawNoTrivialPPDirective(*PP));
+}
+
+// An #undef of a predefined macro is expected to be non-trivial.
+TEST_F(NoTrivialPPDirectiveTracerTest, UnDefineDirective) {
+  const char *source = R"(
+    #undef FOO
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  EXPECT_TRUE(sawNoTrivialPPDirective(*PP, "#define FOO"));
+}
+
+// An #if that tests a predefined macro is expected to be non-trivial.
+TEST_F(NoTrivialPPDirectiveTracerTest, IfDefinedDirective) {
+  const char *source = R"(
+    #if defined(FOO)
+    #endif
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  EXPECT_TRUE(sawNoTrivialPPDirective(*PP, "#define FOO"));
+}
+
+} // namespace
diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index f14aae172f07..fe607fff96ee 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -4,12 +4,12 @@ if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 21)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
-  set(LLVM_VERSION_MINOR 0)
+  set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
   set(LLVM_VERSION_PATCH 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
+  set(LLVM_VERSION_SUFFIX)
 endif()
 
diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S
index d7c1db7243ef..a444d82892c3 100644
--- a/compiler-rt/lib/builtins/aarch64/lse.S
+++ b/compiler-rt/lib/builtins/aarch64/lse.S
@@ -264,7 +264,7 @@ END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM))
 
 NO_EXEC_STACK_DIRECTIVE
 
-// GNU property note for BTI and PAC
-GNU_PROPERTY_BTI_PAC
+// GNU property note for BTI, PAC, and GCS
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif // defined(__aarch64__) || defined(__arm64ec__)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 7c47336cfc57..d5510ac0cfa5 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -371,5 +371,5 @@ END_COMPILERRT_FUNCTION(__arm_sme_restore)
 
 NO_EXEC_STACK_DIRECTIVE
 
-// GNU property note for BTI and PAC
-GNU_PROPERTY_BTI_PAC
+// GNU property note for BTI, PAC, and GCS
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h
index 89372f18c84b..d7db7d818945 100644
--- a/compiler-rt/lib/builtins/assembly.h
+++ b/compiler-rt/lib/builtins/assembly.h
@@ -79,11 +79,12 @@
 #define FUNC_ALIGN
 #endif
 
-// BTI and PAC gnu property note
+// BTI, PAC, and GCS gnu property note
 #define NT_GNU_PROPERTY_TYPE_0 5
 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1
 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2
+#define GNU_PROPERTY_AARCH64_FEATURE_1_GCS 4
 
 #if defined(__ARM_FEATURE_BTI_DEFAULT)
 #define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI
@@ -97,6 +98,12 @@
 #define PAC_FLAG 0
 #endif
 
+#if defined(__ARM_FEATURE_GCS_DEFAULT)
+#define GCS_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+#else
+#define GCS_FLAG 0
+#endif
+
 #define GNU_PROPERTY(type, value)                                              \
   .pushsection .note.gnu.property, "a" SEPARATOR                               \
   .p2align 3 SEPARATOR                                                         \
@@ -118,11 +125,12 @@
 #define BTI_J
 #endif
 
-#if (BTI_FLAG | PAC_FLAG) != 0
-#define GNU_PROPERTY_BTI_PAC                                                   \
-  GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG)
+#if (BTI_FLAG | PAC_FLAG | GCS_FLAG) != 0
+#define GNU_PROPERTY_BTI_PAC_GCS                                               \
+  GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND,                             \
+               BTI_FLAG | PAC_FLAG | GCS_FLAG)
 #else
-#define GNU_PROPERTY_BTI_PAC
+#define GNU_PROPERTY_BTI_PAC_GCS
 #endif
 
 #if defined(__clang__) || defined(__GCC_HAVE_DWARF2_CFI_ASM)
diff --git a/compiler-rt/lib/builtins/crtbegin.c b/compiler-rt/lib/builtins/crtbegin.c
index d5f7756308b0..447474bd0b69 100644
--- a/compiler-rt/lib/builtins/crtbegin.c
+++ b/compiler-rt/lib/builtins/crtbegin.c
@@ -54,22 +54,33 @@ static void __attribute__((used)) __do_init(void) {
 }
 
 #ifdef CRT_HAS_INITFINI_ARRAY
-#if __has_feature(ptrauth_init_fini)
+# if __has_feature(ptrauth_init_fini)
 // TODO: use __ptrauth-qualified pointers when they are supported on clang side
-#if __has_feature(ptrauth_init_fini_address_discrimination)
+#  if __has_feature(ptrauth_init_fini_address_discrimination)
 __attribute__((section(".init_array"), used)) static void *__init =
     ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
                           ptrauth_blend_discriminator(
                               &__init, __ptrauth_init_fini_discriminator));
-#else
+#  else
 __attribute__((section(".init_array"), used)) static void *__init =
     ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
                           __ptrauth_init_fini_discriminator);
-#endif
-#else
+#  endif
+# elif __has_feature(ptrauth_calls)
+#  ifdef __aarch64__
+// If ptrauth_init_fini feature is not present, compiler emits raw unsigned
+// pointers in .init_array. Use inline assembly to avoid implicit signing of
+// __do_init function pointer with ptrauth_calls enabled.
+__asm__(".pushsection .init_array,\"aw\",@init_array\n\t"
+        ".xword __do_init\n\t"
+        ".popsection");
+#  else
+#   error "ptrauth_calls is only supported for AArch64"
+#  endif
+# else
 __attribute__((section(".init_array"),
                used)) static void (*__init)(void) = __do_init;
-#endif
+# endif
 #elif defined(__i386__) || defined(__x86_64__)
 __asm__(".pushsection .init,\"ax\",@progbits\n\t"
         "call __do_init\n\t"
@@ -125,22 +136,33 @@ static void __attribute__((used)) __do_fini(void) {
 }
 
 #ifdef CRT_HAS_INITFINI_ARRAY
-#if __has_feature(ptrauth_init_fini)
+# if __has_feature(ptrauth_init_fini)
 // TODO: use __ptrauth-qualified pointers when they are supported on clang side
-#if __has_feature(ptrauth_init_fini_address_discrimination)
+#  if __has_feature(ptrauth_init_fini_address_discrimination)
 __attribute__((section(".fini_array"), used)) static void *__fini =
     ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
                           ptrauth_blend_discriminator(
                               &__fini, __ptrauth_init_fini_discriminator));
-#else
+#  else
 __attribute__((section(".fini_array"), used)) static void *__fini =
     ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
                           __ptrauth_init_fini_discriminator);
-#endif
-#else
+#  endif
+# elif __has_feature(ptrauth_calls)
+#  ifdef __aarch64__
+// If ptrauth_init_fini feature is not present, compiler emits raw unsigned
+// pointers in .fini_array. Use inline assembly to avoid implicit signing of
+// __do_fini function pointer with ptrauth_calls enabled.
+__asm__(".pushsection .fini_array,\"aw\",@fini_array\n\t"
+        ".xword __do_fini\n\t"
+        ".popsection");
+#  else
+#   error "ptrauth_calls is only supported for AArch64"
+#  endif
+# else
 __attribute__((section(".fini_array"),
                used)) static void (*__fini)(void) = __do_fini;
-#endif
+# endif
 #elif defined(__i386__) || defined(__x86_64__)
 __asm__(".pushsection .fini,\"ax\",@progbits\n\t"
         "call __do_fini\n\t"
diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
index fd20825e3dac..825f41156509 100644
--- a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
+++ b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
@@ -11,4 +11,4 @@
 
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
index 0c0abb6de861..b8d98b09ada2 100644
--- a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
@@ -99,4 +99,4 @@ ASM_TRAMPOLINE_ALIAS(_setjmp, setjmp)
 // We do not need executable stack.
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
index fd060c51cd8e..be82475101c8 100644
--- a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
@@ -157,4 +157,4 @@ mismatch:
 // We do not need executable stack.
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index 08c2be47f535..673f284b6a04 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -344,12 +344,16 @@ static void ioctl_table_fill() {
   _(SOUND_PCM_WRITE_CHANNELS, WRITE, sizeof(int));
   _(SOUND_PCM_WRITE_FILTER, WRITE, sizeof(int));
   _(TCFLSH, NONE, 0);
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   _(TCGETS, WRITE, struct_termios_sz);
+#    endif
   _(TCSBRK, NONE, 0);
   _(TCSBRKP, NONE, 0);
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   _(TCSETS, READ, struct_termios_sz);
   _(TCSETSF, READ, struct_termios_sz);
   _(TCSETSW, READ, struct_termios_sz);
+#    endif
   _(TCXONC, NONE, 0);
   _(TIOCGLCKTRMIOS, WRITE, struct_termios_sz);
   _(TIOCGSOFTCAR, WRITE, sizeof(int));
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
index cdfa6f1d7f53..5066953980af 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
@@ -43,6 +43,6 @@ ASM_SIZE(vfork)
 ASM_INTERCEPTOR_TRAMPOLINE(vfork)
 ASM_TRAMPOLINE_ALIAS(vfork, vfork)
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 196c0a988478..13099fe84b0a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -482,4 +482,19 @@
 #  define SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL 0
 #endif
 
+#if SANITIZER_LINUX
+#  if SANITIZER_GLIBC
+// Workaround for
+// glibc/commit/3d3572f59059e2b19b8541ea648a6172136ec42e
+// Linux: Keep termios ioctl constants strictly internal
+#    if __GLIBC_PREREQ(2, 41)
+#      define SANITIZER_TERMIOS_IOCTL_CONSTANTS 0
+#    else
+#      define SANITIZER_TERMIOS_IOCTL_CONSTANTS 1
+#    endif
+#  else
+#    define SANITIZER_TERMIOS_IOCTL_CONSTANTS 1
+#  endif
+#endif
+
 #endif  // SANITIZER_PLATFORM_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
index 7a89bf1c7498..ea8cc306268c 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
@@ -779,16 +779,16 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
   unsigned IOCTL_SOUND_PCM_WRITE_FILTER = SOUND_PCM_WRITE_FILTER;
 #endif // SOUND_VERSION
   unsigned IOCTL_TCFLSH = TCFLSH;
-  unsigned IOCTL_TCGETA = TCGETA;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   unsigned IOCTL_TCGETS = TCGETS;
+#    endif
   unsigned IOCTL_TCSBRK = TCSBRK;
   unsigned IOCTL_TCSBRKP = TCSBRKP;
-  unsigned IOCTL_TCSETA = TCSETA;
-  unsigned IOCTL_TCSETAF = TCSETAF;
-  unsigned IOCTL_TCSETAW = TCSETAW;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   unsigned IOCTL_TCSETS = TCSETS;
   unsigned IOCTL_TCSETSF = TCSETSF;
   unsigned IOCTL_TCSETSW = TCSETSW;
+#    endif
   unsigned IOCTL_TCXONC = TCXONC;
   unsigned IOCTL_TIOCGLCKTRMIOS = TIOCGLCKTRMIOS;
   unsigned IOCTL_TIOCGSOFTCAR = TIOCGSOFTCAR;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index a2b6c37d5450..f118d53f0df8 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -102,6 +102,8 @@ const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID
                                            ? FIRST_32_SECOND_64(104, 128)
 #      if defined(_ABIN32) && _MIPS_SIM == _ABIN32
                                            : FIRST_32_SECOND_64(176, 216);
+#      elif SANITIZER_MUSL
+                                           : FIRST_32_SECOND_64(160, 208);
 #      else
                                            : FIRST_32_SECOND_64(160, 216);
 #      endif
@@ -1312,16 +1314,14 @@ extern unsigned IOCTL_SNDCTL_COPR_SENDMSG;
 extern unsigned IOCTL_SNDCTL_COPR_WCODE;
 extern unsigned IOCTL_SNDCTL_COPR_WDATA;
 extern unsigned IOCTL_TCFLSH;
-extern unsigned IOCTL_TCGETA;
-extern unsigned IOCTL_TCGETS;
 extern unsigned IOCTL_TCSBRK;
 extern unsigned IOCTL_TCSBRKP;
-extern unsigned IOCTL_TCSETA;
-extern unsigned IOCTL_TCSETAF;
-extern unsigned IOCTL_TCSETAW;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
+extern unsigned IOCTL_TCGETS;
 extern unsigned IOCTL_TCSETS;
 extern unsigned IOCTL_TCSETSF;
 extern unsigned IOCTL_TCSETSW;
+#    endif
 extern unsigned IOCTL_TCXONC;
 extern unsigned IOCTL_TIOCGLCKTRMIOS;
 extern unsigned IOCTL_TIOCGSOFTCAR;
diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
index 05065444a70c..612317b3c329 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
@@ -183,7 +183,8 @@ TEST_F(ScudoWrappersCDeathTest, Malloc) {
   // process doing free(P) is not a double free.
   EXPECT_DEATH(
       {
-        void *Ptr = malloc(Size);
+        // Note: volatile here prevents the calls from being optimized out.
+        void *volatile Ptr = malloc(Size);
         free(Ptr);
         free(Ptr);
       },
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
index 7d920bee4a2d..f1d11a3e7f54 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
@@ -222,6 +222,6 @@ ASM_SIZE(ASM_SYMBOL_INTERCEPTOR(__sigsetjmp))
 
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 877718c703ba..f5576ce0e013 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -503,7 +503,7 @@ def get_ios_commands_dir():
 # Define %arch to check for architecture-dependent output.
 config.substitutions.append(("%arch", (config.host_arch)))
 
-if config.host_os == "Windows":
+if os.name == "nt":
     # FIXME: This isn't quite right. Specifically, it will succeed if the program
     # does not crash but exits with a non-zero exit code. We ought to merge
     # KillTheDoctor and not --crash to make the latter more useful and remove the
diff --git a/cross-project-tests/CMakeLists.txt b/cross-project-tests/CMakeLists.txt
index b4b1f4762607..192db8704317 100644
--- a/cross-project-tests/CMakeLists.txt
+++ b/cross-project-tests/CMakeLists.txt
@@ -19,11 +19,12 @@ set(CROSS_PROJECT_TEST_DEPS
   FileCheck
   check-gdb-llvm-support
   count
-  llvm-dwarfdump
+  llvm-ar
   llvm-config
+  llvm-dwarfdump
   llvm-objdump
-  split-file
   not
+  split-file
   )
 
 if ("clang" IN_LIST LLVM_ENABLE_PROJECTS)
diff --git a/cross-project-tests/dtlto/ld-archive-thin.test b/cross-project-tests/dtlto/ld-archive-thin.test
new file mode 100644
index 000000000000..979da5423962
--- /dev/null
+++ b/cross-project-tests/dtlto/ld-archive-thin.test
@@ -0,0 +1,97 @@
+REQUIRES: ld.lld,llvm-ar
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: llvm-ar rcs foo.a foo.o --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/bar.a bar.o --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs dog.a %t/lib/../dog.o --thin
+## The bitcode member of cat.a will not be used in the link.
+RUN: llvm-ar rcs cat.a cat.o --thin
+RUN: llvm-ar rcs start.a start.o --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+RUN: %clang --target=x86_64-linux-gnu -flto=thin -fuse-ld=lld %t/foo.a %t/lib/bar.a ../start.a %t/cat.a \
+RUN:   -Wl,--whole-archive ../dog.a \
+RUN:   -fthinlto-distributor=%python \
+RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--save-temps -nostdlib -Werror
+
+## Check that the required output files have been created.
+RUN: ls | sort | FileCheck %s
+
+## No files are expected before.
+CHECK-NOT: {{.}}
+
+## JSON jobs description.
+CHECK: {{^}}a.[[PID:[a-zA-Z0-9_]+]].dist-file.json{{$}}
+
+## Native output object files and individual summary index files.
+CHECK: {{^}}bar.3.[[PID]].native.o{{$}}
+CHECK: {{^}}bar.3.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## No files are expected after.
+CHECK-NOT: {{.}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurred.
+
+RUN: llvm-dis start.4.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis dog.1.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,DOG,START
+
+RUN: llvm-dis foo.2.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis bar.3.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+FOO-DAG:   foo.o
+BAR-DAG:   bar.o
+DOG-DAG:   dog.o
+START-DAG: start.o
+
+
+#--- foo.c
+extern int bar(int), _start(int);
+__attribute__((retain)) int foo(int x) { return x + bar(x) + _start(x); }
+
+#--- bar.c
+extern int foo(int), _start(int);
+__attribute__((retain)) int bar(int x) { return x + foo(x) + _start(x); }
+
+#--- dog.c
+extern int foo(int), bar(int), _start(int);
+__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + _start(x); }
+
+#--- cat.c
+__attribute__((retain)) void cat(int x) {}
+
+#--- start.c
+extern int foo(int), bar(int);
+__attribute__((retain)) int _start(int x) { return x + foo(x) + bar(x); }
diff --git a/cross-project-tests/dtlto/ld-dtlto.c b/cross-project-tests/dtlto/ld-dtlto.c
index 3ee962346bd4..7dffe2e015bc 100644
--- a/cross-project-tests/dtlto/ld-dtlto.c
+++ b/cross-project-tests/dtlto/ld-dtlto.c
@@ -5,13 +5,11 @@
 
 // RUN: rm -rf %t && mkdir %t && cd %t
 
-// RUN: %clang --target=x86_64-linux-gnu -c -flto=thin %s -o dtlto.o
-
-// RUN: ld.lld dtlto.o \
-// RUN:   --thinlto-distributor=%python \
-// RUN:   --thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
-// RUN:   --thinlto-remote-compiler=%clang \
-// RUN:   --thinlto-remote-compiler-arg=--save-temps
+// RUN: %clang --target=x86_64-linux-gnu %s -flto=thin -fuse-ld=lld \
+// RUN:   -fthinlto-distributor=%python \
+// RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+// RUN:   -Wl,--thinlto-remote-compiler-arg=--save-temps \
+// RUN:   -nostdlib -Werror
 
 /// Check that the required output files have been created.
 // RUN: ls | sort | FileCheck %s
@@ -22,18 +20,15 @@
 /// Linked ELF.
 // CHECK: {{^}}a.out{{$}}
 
-/// Produced by the bitcode compilation.
-// CHECK-NEXT: {{^}}dtlto.o{{$}}
-
 /// --save-temps output for the backend compilation.
-// CHECK-NEXT: {{^}}dtlto.s{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP:[a-zA-Z0-9_]+]].s{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.0.preopt.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.1.promote.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.2.internalize.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.3.import.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.4.opt.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.5.precodegen.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.resolution.txt{{$}}
 
 /// No files are expected after.
 // CHECK-NOT: {{.}}
diff --git a/cross-project-tests/dtlto/link-archive-thin.test b/cross-project-tests/dtlto/link-archive-thin.test
new file mode 100644
index 000000000000..fbd8fd67300c
--- /dev/null
+++ b/cross-project-tests/dtlto/link-archive-thin.test
@@ -0,0 +1,93 @@
+REQUIRES: lld-link
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-pc-windows-msvc -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: lld-link /lib /llvmlibthin /out:foo.lib foo.o
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: lld-link /lib /llvmlibthin /out:lib/bar.lib bar.o
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: lld-link /lib /llvmlibthin /out:dog.lib %t/lib/../dog.o
+RUN: lld-link /lib /llvmlibthin /out:cat.lib cat.o
+RUN: lld-link /lib /llvmlibthin /out:start.lib start.o
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+RUN: lld-link /subsystem:console /machine:x64 /entry:start /out:my.exe  \
+RUN:   %t/foo.lib %t/lib/bar.lib ../start.lib %t/cat.lib \
+RUN:   /includeoptional:dog ../dog.lib \
+RUN:   -thinlto-distributor:%python \
+RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \
+RUN:   -thinlto-remote-compiler:%clang \
+RUN:   /lldsavetemps
+
+## Check that the required output files have been created.
+RUN: ls | FileCheck %s --check-prefix=OUTPUTS --implicit-check-not=cat
+
+## JSON jobs description.
+OUTPUTS-DAG: my.[[PID:[a-zA-Z0-9_]+]].dist-file.json
+
+## Individual summary index files.
+OUTPUTS-DAG: start.1.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   dog.2.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   foo.3.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   bar.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## Native output object files.
+OUTPUTS-DAG: start.1.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   dog.2.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   foo.3.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   bar.4.[[PID]].native.o{{$}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurred.
+
+RUN: llvm-dis start.1.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis dog.2.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,DOG,START
+
+RUN: llvm-dis foo.3.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis bar.4.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+FOO-DAG:   foo.o
+BAR-DAG:   bar.o
+DOG-DAG:   dog.o
+START-DAG: start.o
+
+
+#--- foo.c
+extern int bar(int), start(int);
+__attribute__((retain)) int foo(int x) { return x + bar(x) + start(x); }
+
+#--- bar.c
+extern int foo(int), start(int);
+__attribute__((retain)) int bar(int x) { return x + foo(x) + start(x); }
+
+#--- dog.c
+extern int foo(int), bar(int), start(int);
+__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + start(x); }
+
+#--- cat.c
+__attribute__((retain)) void cat(int x) {}
+
+#--- start.c
+extern int foo(int), bar(int);
+__attribute__((retain)) int start(int x) { return x + foo(x) + bar(x); }
diff --git a/cross-project-tests/dtlto/link-dtlto.c b/cross-project-tests/dtlto/link-dtlto.c
new file mode 100644
index 000000000000..0ab4ec57f115
--- /dev/null
+++ b/cross-project-tests/dtlto/link-dtlto.c
@@ -0,0 +1,41 @@
+// REQUIRES: lld-link
+
+/// Simple test that DTLTO works with a single input bitcode file and that
+/// --save-temps can be applied to the remote compilation.
+
+// RUN: rm -rf %t && mkdir %t && cd %t
+
+// RUN: %clang --target=x86_64-pc-windows-msvc -c -flto=thin %s -o dtlto.obj
+
+// RUN: lld-link /subsystem:console /entry:_start dtlto.obj \
+// RUN:   -thinlto-distributor:%python \
+// RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \
+// RUN:   -thinlto-remote-compiler:%clang \
+// RUN:   -thinlto-remote-compiler-arg:--save-temps
+
+/// Check that the required output files have been created.
+// RUN: ls | sort | FileCheck %s
+
+/// No files are expected before.
+// CHECK-NOT: {{.}}
+
+/// Linked PE/COFF executable.
+// CHECK: {{^}}dtlto.exe{{$}}
+
+/// Produced by the bitcode compilation.
+// CHECK-NEXT: {{^}}dtlto.obj{{$}}
+
+/// --save-temps output for the backend compilation.
+// CHECK-NEXT: {{^}}dtlto.s{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}}
+
+/// No files are expected after.
+// CHECK-NOT: {{.}}
+
+int _start() { return 0; }
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index b35c643ac898..ac2775347264 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -19,7 +19,7 @@
 config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = [".c", ".cl", ".cpp", ".m"]
+config.suffixes = [".c", ".cl", ".cpp", ".m", ".test"]
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
@@ -107,6 +107,8 @@ def get_required_attr(config, attr_name):
 if lldb_path is not None:
     config.available_features.add("lldb")
 
+if llvm_config.use_llvm_tool("llvm-ar"):
+    config.available_features.add("llvm-ar")
 
 def configure_dexter_substitutions():
     """Configure substitutions for host platform and return list of dependencies"""
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
index e51590fdae3d..58541609829c 100644
--- a/flang-rt/cmake/modules/AddFlangRT.cmake
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -286,27 +286,6 @@ function (add_flangrt_library name)
       target_compile_options(${tgtname} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS)
     endif ()
 
-    # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI
-    # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt
-    # functions in some cases like 128-bit integer math (__udivti3, __modti3,
-    # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a
-    # dependency to Compiler-RT's builtin library where these are implemented.
-    if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      if (FLANG_RT_BUILTINS_LIBRARY)
-        target_compile_options(${tgtname} PRIVATE "$<$<COMPILE_LANGUAGE:CXX,C>:-Xclang>" "$<$<COMPILE_LANGUAGE:CXX,C>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
-      endif ()
-    endif ()
-    if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-      if (FLANG_RT_BUILTINS_LIBRARY)
-        target_compile_options(${tgtname} PRIVATE "$<$<COMPILE_LANGUAGE:Fortran>:-Xflang>" "$<$<COMPILE_LANGUAGE:Fortran>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
-      else ()
-        message(WARNING "Did not find libclang_rt.builtins.lib.
-          LLVM may emit builtins that are not implemented in msvcrt/ucrt and
-          instead falls back to builtins from Compiler-RT. Linking with ${tgtname}
-          may result in a linker error.")
-      endif ()
-    endif ()
-
     # Non-GTest unittests depend on LLVMSupport
     if (ARG_LINK_TO_LLVM)
       if (LLVM_LINK_LLVM_DYLIB)
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 332c0872e065..dc2db1d9902c 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -251,19 +251,33 @@ else()
   add_win_flangrt_runtime(STATIC dynamic     MultiThreadedDLL      INSTALL_WITH_TOOLCHAIN)
   add_win_flangrt_runtime(STATIC dynamic_dbg MultiThreadedDebugDLL INSTALL_WITH_TOOLCHAIN)
 
-  # Unittests link against LLVMSupport which is using CMake's default runtime
-  # library selection, which is either MultiThreadedDLL or MultiThreadedDebugDLL
-  # depending on the configuration. They have to match or linking will fail.
+  # Unittests link against LLVMSupport. If CMAKE_MSVC_RUNTIME_LIBRARY is set,
+  # that will have been used for LLVMSupport so it must also be used here.
+  # Otherwise this will use CMake's default runtime library selection, which
+  # is either MultiThreadedDLL or MultiThreadedDebugDLL depending on the configuration.
+  # They have to match or linking will fail.
   if (GENERATOR_IS_MULTI_CONFIG)
     # We cannot select an ALIAS library because it may be different
     # per configuration. Fallback to CMake's default.
     add_win_flangrt_runtime(STATIC unittest "" EXCLUDE_FROM_ALL)
   else ()
-    string(TOLOWER ${CMAKE_BUILD_TYPE} build_type)
-    if (build_type STREQUAL "debug")
-      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
-    else ()
-      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
-    endif ()
+    # Check if CMAKE_MSVC_RUNTIME_LIBRARY was set.
+    if (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreaded")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDLL")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebug")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static_dbg)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebugDLL")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
+    else()
+      # Default based on the build type.
+      string(TOLOWER ${CMAKE_BUILD_TYPE} build_type)
+      if (build_type STREQUAL "debug")
+          add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
+      else ()
+          add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
+      endif ()
+    endif()
   endif ()
 endif()
diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt
index 528219617413..fd63ad11dcf4 100644
--- a/flang-rt/unittests/CMakeLists.txt
+++ b/flang-rt/unittests/CMakeLists.txt
@@ -60,6 +60,27 @@ function(add_flangrt_unittest_offload_properties target)
   endif()
 endfunction()
 
+# flang-rt on Windows requires compiler-rt for some symbols. For binaries built
+# with flang this dependency is added by the flang driver, but since the unit
+# tests are built with clang we need to add the dependency manually.
+function(add_flangrt_dependent_libs target)
+  if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CXX,C>:-Xclang>" "$<$<COMPILE_LANGUAGE:CXX,C>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    endif ()
+  endif ()
+  if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:Fortran>:-Xflang>" "$<$<COMPILE_LANGUAGE:Fortran>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    else ()
+      message(WARNING "Did not find libclang_rt.builtins.lib.
+        LLVM may emit builtins that are not implemented in msvcrt/ucrt and
+        instead falls back to builtins from Compiler-RT. Linking with ${target}
+        may result in a linker error.")
+    endif ()
+  endif ()
+endfunction()
+
 
 function(add_flangrt_unittest test_dirname)
   cmake_parse_arguments(ARG
@@ -72,14 +93,7 @@ function(add_flangrt_unittest test_dirname)
 
   target_link_libraries(${test_dirname} PRIVATE ${ARG_LINK_LIBS})
   add_flangrt_unittest_offload_properties(${test_dirname})
-
-  # Required because LLVMSupport is compiled with this option.
-  # FIXME: According to CMake documentation, this is the default. Why is it
-  #        needed? LLVM's add_unittest doesn't set it either.
-  set_target_properties(${test_dirname}
-      PROPERTIES
-        MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL"
-    )
+  add_flangrt_dependent_libs(${test_dirname})
 endfunction()
 
 function(add_flangrt_nongtest_unittest test_name)
@@ -99,6 +113,7 @@ function(add_flangrt_nongtest_unittest test_name)
   set_target_properties(${test_name}${suffix} PROPERTIES FOLDER "Flang-RT/Tests/Unit")
 
   target_link_libraries(${test_name}${suffix} PRIVATE NonGTestTesting ${ARG_LINK_LIBS})
+  add_flangrt_dependent_libs(${test_name}${suffix})
 
   if(NOT ARG_SLOW_TEST)
     add_dependencies(FlangRTUnitTests ${test_name}${suffix})
diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index c9f19c37fd7f..81f5f9f6dee5 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -41,7 +41,7 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | target construct                                           | P      | device clause not supported |
 | target update construct                                    | P      | device clause not supported |
 | declare target directive                                   | P      | |
-| teams construct                                            | P      | reduction clause not supported |
+| teams construct                                            | Y      | |
 | distribute construct                                       | P      | dist_schedule clause not supported |
 | distribute simd construct                                  | P      | dist_schedule and linear clauses are not supported |
 | distribute parallel loop construct                         | P      | dist_schedule clause not supported |
@@ -51,15 +51,15 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | atomic construct extensions                                | Y      | |
 | cancel construct                                           | Y      | |
 | cancellation point construct                               | Y      | |
-| parallel do simd construct                                 | P      | linear clause is not supported |
-| target teams construct                                     | P      | device and reduction clauses are not supported |
-| teams distribute construct                                 | P      | reduction and dist_schedule clauses not supported |
-| teams distribute simd construct                            | P      | reduction, dist_schedule and linear clauses are not supported |
-| target teams distribute construct                          | P      | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop construct                   | P      | reduction and dist_schedule clauses are not supported |
-| target teams distribute parallel loop construct            | P      | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop simd construct              | P      | reduction, dist_schedule, and linear clauses are not supported |
-| target teams distribute parallel loop simd construct       | P      | device, reduction, dist_schedule and linear clauses are not supported |
+| parallel do simd construct                                 | P      | linear clause not supported |
+| target teams construct                                     | P      | device clause not supported |
+| teams distribute construct                                 | P      | dist_schedule clause not supported |
+| teams distribute simd construct                            | P      | dist_schedule and linear clauses are not supported |
+| target teams distribute construct                          | P      | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop construct                   | P      | dist_schedule clause not supported |
+| target teams distribute parallel loop construct            | P      | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop simd construct              | P      | dist_schedule and linear clauses are not supported |
+| target teams distribute parallel loop simd construct       | P      | device, dist_schedule and linear clauses are not supported |
 
 ## Extensions
 ### ATOMIC construct
diff --git a/flang/include/flang/Lower/ConvertType.h b/flang/include/flang/Lower/ConvertType.h
index 179a68258404..3c726595c0f7 100644
--- a/flang/include/flang/Lower/ConvertType.h
+++ b/flang/include/flang/Lower/ConvertType.h
@@ -118,6 +118,9 @@ class ComponentReverseIterator {
   /// Advance iterator to the last components of the current type parent.
   const Fortran::semantics::DerivedTypeSpec &advanceToParentType();
 
+  /// Get the parent component symbol for the current type.
+  const Fortran::semantics::Symbol *getParentComponent() const;
+
 private:
   void setCurrentType(const Fortran::semantics::DerivedTypeSpec &derived);
   const Fortran::semantics::DerivedTypeSpec *currentParentType = nullptr;
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index 72d8a0096f51..0b49049d43ed 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -124,7 +124,7 @@ class ReductionProcessor {
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
   template <typename OpType, typename RedOperatorListTy>
-  static void processReductionArguments(
+  static bool processReductionArguments(
       mlir::Location currentLocation, lower::AbstractConverter &converter,
       const RedOperatorListTy &redOperatorList,
       llvm::SmallVectorImpl<mlir::Value> &reductionVars,
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 33c1f1e7a3c3..2105fc14b0a6 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2123,9 +2123,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     llvm::SmallVector<mlir::Value> reduceVars;
     Fortran::lower::omp::ReductionProcessor rp;
-    rp.processReductionArguments<fir::DeclareReductionOp>(
+    bool result = rp.processReductionArguments<fir::DeclareReductionOp>(
         toLocation(), *this, info.reduceOperatorList, reduceVars,
         reduceVarByRef, reductionDeclSymbols, info.reduceSymList);
+    assert(result && "Failed to process `do concurrent` reductions");
 
     doConcurrentLoopOp.getReduceVarsMutable().assign(reduceVars);
     doConcurrentLoopOp.setReduceSymsAttr(
@@ -5504,10 +5505,34 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   void genFIR(const Fortran::parser::AssignStmt &stmt) {
     const Fortran::semantics::Symbol &symbol =
         *std::get<Fortran::parser::Name>(stmt.t).symbol;
+
     mlir::Location loc = toLocation();
+    mlir::Type symbolType = genType(symbol);
+    mlir::Value addr = getSymbolAddress(symbol);
+
+    // Handle the case where the assigned variable is declared as a pointer
+    if (auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(symbolType)) {
+      if (auto ptrType = mlir::dyn_cast<fir::PointerType>(eleTy)) {
+        symbolType = ptrType.getEleTy();
+      } else {
+        symbolType = eleTy;
+      }
+    } else if (auto ptrType = mlir::dyn_cast<fir::PointerType>(symbolType)) {
+      symbolType = ptrType.getEleTy();
+    }
+
     mlir::Value labelValue = builder->createIntegerConstant(
-        loc, genType(symbol), std::get<Fortran::parser::Label>(stmt.t));
-    builder->create<fir::StoreOp>(loc, labelValue, getSymbolAddress(symbol));
+        loc, symbolType, std::get<Fortran::parser::Label>(stmt.t));
+
+    // If the address points to a boxed pointer, we need to dereference it
+    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(addr.getType())) {
+      if (auto boxType = mlir::dyn_cast<fir::BoxType>(refType.getEleTy())) {
+        mlir::Value boxValue = builder->create<fir::LoadOp>(loc, addr);
+        addr = builder->create<fir::BoxAddrOp>(loc, boxValue);
+      }
+    }
+
+    builder->create<fir::StoreOp>(loc, labelValue, addr);
   }
 
   void genFIR(const Fortran::parser::FormatStmt &) {
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 9689f920840f..906a68982188 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -1848,8 +1848,15 @@ class HlfirBuilder {
       for (Fortran::lower::ComponentReverseIterator compIterator(
                ctor.result().derivedTypeSpec());
            !compIterator.lookup(compSym.name());) {
-        const auto &parentType = compIterator.advanceToParentType();
-        llvm::StringRef parentName = toStringRef(parentType.name());
+        // Private parent components have mangled names. Get the name from the
+        // parent symbol.
+        const Fortran::semantics::Symbol *parentCompSym =
+            compIterator.getParentComponent();
+        assert(parentCompSym && "failed to get parent component symbol");
+        std::string parentName =
+            converter.getRecordTypeFieldName(*parentCompSym);
+        // Advance the iterator, but don't use its return value.
+        compIterator.advanceToParentType();
         auto baseRecTy = mlir::cast<fir::RecordType>(
             hlfir::getFortranElementType(currentParent.getType()));
         auto parentCompType = baseRecTy.getType(parentName);
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index 7a2e8e509518..0fde61465fb8 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -669,6 +669,18 @@ Fortran::lower::ComponentReverseIterator::advanceToParentType() {
   return *currentParentType;
 }
 
+const Fortran::semantics::Symbol *
+Fortran::lower::ComponentReverseIterator::getParentComponent() const {
+  // A type without a parent component has no symbol to return.
+  const auto parentName = currentTypeDetails->GetParentComponentName();
+  if (!parentName)
+    return nullptr;
+  // Look the parent component up by name in the scope of the current type.
+  const Fortran::semantics::Scope &scope = DEREF(currentParentType->GetScope());
+  auto parentComp = scope.find(*parentName);
+  return parentComp == scope.cend() ? nullptr : &*parentComp->second;
+}
+
 void Fortran::lower::ComponentReverseIterator::setCurrentType(
     const Fortran::semantics::DerivedTypeSpec &derived) {
   currentParentType = &derived;
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index 63a612d7ead6..d64449373fca 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -464,8 +464,10 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter,
       fir::BoxType boxTy =
           fir::BoxType::get(fir::PointerType::get(converter.genType(s)));
       auto descFunc = [&](fir::FirOpBuilder &b) {
+        bool couldBeInEquivalence =
+            Fortran::semantics::FindEquivalenceSet(s) != nullptr;
         auto box = Fortran::lower::genInitialDataTarget(
-            converter, loc, boxTy, *expr, /*couldBeInEquivalence=*/true);
+            converter, loc, boxTy, *expr, couldBeInEquivalence);
         b.create<fir::HasValueOp>(loc, box);
       };
       builder.createGlobalConstant(loc, boxTy, mangleName, descFunc, linkOnce);
diff --git a/flang/lib/Lower/Mangler.cpp b/flang/lib/Lower/Mangler.cpp
index 1333e3fe349d..e1ae86a1b5bb 100644
--- a/flang/lib/Lower/Mangler.cpp
+++ b/flang/lib/Lower/Mangler.cpp
@@ -224,8 +224,18 @@ std::string Fortran::lower::mangle::mangleName(
       assert(paramExpr && "derived type kind param not explicit");
       std::optional<int64_t> init =
           Fortran::evaluate::ToInt64(paramValue->GetExplicit());
-      assert(init && "derived type kind param is not constant");
-      kinds.emplace_back(*init);
+      // TODO: restore the assertion below once parameterized derived types
+      // are supported:
+      // assert(init && "derived type kind param is not constant");
+      //
+      // Evaluating the kind parameter to an integer constant requires a
+      // FoldingContext; without one `init` may be empty here and the
+      // compiler may crash (see the example in issue #127424).
+      if (!init) {
+        TODO_NOLOC("parameterized derived types");
+      } else {
+        kinds.emplace_back(*init);
+      }
     }
   }
   return fir::NameUniquer::doType(modules, procs, blockId, symbolName, kinds);
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 74087d42a8e6..40f735c33cc3 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -1116,11 +1116,12 @@ bool ClauseProcessor::processInReduction(
         collectReductionSyms(clause, inReductionSyms);
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            inReductionVars, inReduceVarByRef, inReductionDeclSymbols,
-            inReductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                inReductionVars, inReduceVarByRef, inReductionDeclSymbols,
+                inReductionSyms))
+          inReductionSyms.clear();
 
         // Copy local lists into the output.
         llvm::copy(inReductionVars, std::back_inserter(result.inReductionVars));
@@ -1461,10 +1462,12 @@ bool ClauseProcessor::processReduction(
         }
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            reductionVars, reduceVarByRef, reductionDeclSymbols, reductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                reductionVars, reduceVarByRef, reductionDeclSymbols,
+                reductionSyms))
+          reductionSyms.clear();
         // Copy local lists into the output.
         llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
         llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref));
@@ -1486,11 +1489,12 @@ bool ClauseProcessor::processTaskReduction(
         collectReductionSyms(clause, taskReductionSyms);
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols,
-            taskReductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols,
+                taskReductionSyms))
+          taskReductionSyms.clear();
         // Copy local lists into the output.
         llvm::copy(taskReductionVars,
                    std::back_inserter(result.taskReductionVars));
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index c0be1e229f82..f58a103b0347 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -39,7 +39,7 @@ namespace lower {
 namespace omp {
 
 // explicit template declarations
-template void ReductionProcessor::processReductionArguments<
+template bool ReductionProcessor::processReductionArguments<
     mlir::omp::DeclareReductionOp, omp::clause::ReductionOperatorList>(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const omp::clause::ReductionOperatorList &redOperatorList,
@@ -48,7 +48,7 @@ template void ReductionProcessor::processReductionArguments<
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
 
-template void ReductionProcessor::processReductionArguments<
+template bool ReductionProcessor::processReductionArguments<
     fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const llvm::SmallVector<fir::ReduceOperationEnum> &redOperatorList,
@@ -606,7 +606,7 @@ static bool doReductionByRef(mlir::Value reductionVar) {
 }
 
 template <typename OpType, typename RedOperatorListTy>
-void ReductionProcessor::processReductionArguments(
+bool ReductionProcessor::processReductionArguments(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const RedOperatorListTy &redOperatorList,
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
@@ -626,10 +626,10 @@ void ReductionProcessor::processReductionArguments(
               std::get_if<omp::clause::ProcedureDesignator>(&redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic)) {
-          return;
+          return false;
         }
       } else {
-        return;
+        return false;
       }
     }
   }
@@ -764,6 +764,8 @@ void ReductionProcessor::processReductionArguments(
 
   if (isDoConcurrent)
     builder.restoreInsertionPoint(dcIP);
+
+  return true;
 }
 
 const semantics::SourceName
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 2425265e196c..783aef18fb89 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -16,7 +16,6 @@
 #include "flang/Common/idioms.h"
 #include "flang/Common/indirection.h"
 #include "flang/Common/visit.h"
-#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/tools.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/char-block.h"
@@ -4117,21 +4116,26 @@ void OmpStructureChecker::CheckArraySection(
   // Detect this by looking for array accesses on character variables which are
   // not arrays.
   bool isSubstring{false};
-  evaluate::ExpressionAnalyzer ea{context_};
-  if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
-    std::optional<evaluate::Shape> shape = evaluate::GetShape(expr);
-    // Not an array: rank 0
-    if (shape && shape->size() == 0) {
-      if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
-        if (type->category() == evaluate::TypeCategory::Character) {
-          // Substrings are explicitly denied by the standard [6.0:163:9-11].
-          // This is supported as an extension. This restriction was added in
-          // OpenMP 5.2.
-          isSubstring = true;
-          context_.Say(GetContext().clauseSource,
-              "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
-        } else {
-          llvm_unreachable("Array indexing on a variable that isn't an array");
+  // Cannot analyze a base of an assumed-size array on its own. If we know
+  // this is an array (assumed-size or not) we can ignore it, since we're
+  // looking for strings.
+  if (!IsAssumedSizeArray(*name.symbol)) {
+    evaluate::ExpressionAnalyzer ea{context_};
+    if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
+      if (expr->Rank() == 0) {
+        // Not an array: rank 0
+        if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
+          if (type->category() == evaluate::TypeCategory::Character) {
+            // Substrings are explicitly denied by the standard [6.0:163:9-11].
+            // This is supported as an extension. This restriction was added in
+            // OpenMP 5.2.
+            isSubstring = true;
+            context_.Say(GetContext().clauseSource,
+                "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
+          } else {
+            llvm_unreachable(
+                "Array indexing on a variable that isn't an array");
+          }
         }
       }
     }
diff --git a/flang/test/Driver/loop-interchange.f90 b/flang/test/Driver/loop-interchange.f90
index 5d3ec71c5987..1e5a11902709 100644
--- a/flang/test/Driver/loop-interchange.f90
+++ b/flang/test/Driver/loop-interchange.f90
@@ -2,9 +2,9 @@
 ! RUN: %flang -### -S -fno-loop-interchange %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! CHECK-LOOP-INTERCHANGE: "-floop-interchange"
 ! CHECK-NO-LOOP-INTERCHANGE-NOT: "-floop-interchange"
diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90
index 5a3fd0d83800..e7da964184c8 100644
--- a/flang/test/Driver/target-cpu-features.f90
+++ b/flang/test/Driver/target-cpu-features.f90
@@ -44,6 +44,10 @@
 ! RUN: %flang --target=loongarch64-linux-gnu -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-LOONGARCH64
 
+! RUN: %flang --target=sparc64-linux-gnu -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+! RUN: %flang --target=sparc64-freebsd -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+! RUN: %flang --target=sparc64-openbsd -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+
 ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu"
 ! CHECK-A57-SAME: "-target-cpu" "cortex-a57"
 ! CHECK-A57-SAME: "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2
@@ -92,3 +96,6 @@
 
 ! CHECK-LOONGARCH64: "-fc1" "-triple" "loongarch64-unknown-linux-gnu"
 ! CHECK-LOONGARCH64-SAME: "-target-cpu" "loongarch64" "-target-feature" "+lsx" "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+ual"
+
+! CHECK-SPARC-VIS: "-fc1" "-triple" "sparc64-{{[^"]+}}"
+! CHECK-SPARC-VIS-SAME: "-target-feature" "+vis"
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90
new file mode 100644
index 000000000000..70b2ae1111a5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90
@@ -0,0 +1,25 @@
+! Tests reduction processor behavior when a reduction symbol is not supported.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+subroutine foo
+  implicit none
+  integer :: k, i
+
+  interface max
+    function max(m1,m2)
+      integer :: m1, m2
+    end function
+  end interface
+
+  !$omp do reduction (max: k)
+  do i=1,10
+  end do
+  !$omp end do
+end
+
+! Verify that unsupported reduction is ignored.
+! CHECK: omp.wsloop 
+! CHECK-SAME: private(@{{[^[:space:]]+}} %{{[^[:space:]]+}}
+! CHECK-SAME:         -> %{{[^[:space:]]+}} : !{{[^[:space:]]+}}) {
+! CHECK: }
diff --git a/flang/test/Lower/assign-statement.f90 b/flang/test/Lower/assign-statement.f90
new file mode 100644
index 000000000000..342355bf469c
--- /dev/null
+++ b/flang/test/Lower/assign-statement.f90
@@ -0,0 +1,12 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+  ! CHECK-LABEL: func @_QQmain
+  program main
+  integer :: ip
+  pointer :: ip
+
+  allocate(ip)
+  assign 10 to ip
+  ! CHECK: fir.store %c10_i32 to %11 : !fir.ptr<i32>
+  10 return
+  end program main
diff --git a/flang/test/Lower/derived-type-private.f90 b/flang/test/Lower/derived-type-private.f90
new file mode 100644
index 000000000000..8edcdeedad8b
--- /dev/null
+++ b/flang/test/Lower/derived-type-private.f90
@@ -0,0 +1,29 @@
+! Test lowering of derived type with private attribute
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+program main
+  call test02()
+  print *,"pass"
+end program main
+
+module mod2
+  type,private:: tt
+     integer :: ip = 1
+  end type tt
+  type,extends(tt):: ty1
+  ! CHECK: fir.global @_QMmod2Estr : !fir.type<_QMmod2Tty1{_QMmod2Tty1.tt:!fir.type<_QMmod2Ttt{ip:i32}>,i1:i32,i1p:!fir.type<_QMmod2Ttt{ip:i32}>,i1a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>
+     integer :: i1 = 1
+     type(tt) :: i1p = tt(2)
+     integer,allocatable :: i1a(:)
+  end type ty1
+  type(ty1) :: str
+end module mod2
+
+subroutine test02()
+  use mod2
+  integer,allocatable :: ia(:)
+  allocate(ia(10))
+  ia=2
+  str=ty1(i1a=ia)
+  if (str%i1.ne.1) print *,'ng'
+end subroutine test02
diff --git a/flang/test/Lower/equivalence-3.f b/flang/test/Lower/equivalence-3.f
new file mode 100644
index 000000000000..19f8880189f0
--- /dev/null
+++ b/flang/test/Lower/equivalence-3.f
@@ -0,0 +1,19 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+  ! CHECK-LABEL: func @_QQmain
+  program main
+  real a1,a2
+  equivalence (a1,a2)
+  ! A fir.alloca should never appear in a global constant initialization.
+  ! CHECK: fir.global linkonce @_QFEx1.desc constant : !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  ! CHECK: arith.constant 5 : index
+  ! CHECK:fir.address_of(@_QFEx1) : !fir.ref<!fir.array<5xf64>>
+  ! CHECK: fir.shape %c5 : (index) -> !fir.shape<1>
+  ! CHECK: fir.declare %0(%1) {uniq_name = "_QFEx1"} : (!fir.ref<!fir.array<5xf64>>, !fir.shape<1>) -> !fir.ref<!fir.array<5xf64>>
+  ! CHECK: fir.embox %2(%1) : (!fir.ref<!fir.array<5xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf64>>
+  ! CHECK: fir.rebox %3 : (!fir.box<!fir.array<5xf64>>) -> !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  ! CHECK: fir.has_value %4 : !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  real*8 x1(5)
+  namelist /y1/x1
+  read (5,y1)
+  end
diff --git a/flang/test/Lower/parametrized-derived-types.f90 b/flang/test/Lower/parametrized-derived-types.f90
new file mode 100644
index 000000000000..97a40c9169d2
--- /dev/null
+++ b/flang/test/Lower/parametrized-derived-types.f90
@@ -0,0 +1,19 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! XFAIL: *
+program main
+  TYPE ty(k1,k2)
+     INTEGER ,KIND::k1,k2=5
+     INTEGER::arr(k1:k2)=10
+     CHARACTER(LEN=k2)::CHARACTER
+  END TYPE ty
+  TYPE,EXTENDS(ty)::ty1(k3)
+     INTEGER,KIND ::k3=4
+     TYPE(ty(2,k3+1))::cmp_ty = ty(2,k3+1)(55,'HI')
+  END TYPE ty1
+  TYPE ty2(l1, l2)
+  !ERROR: not yet implemented: parameterized derived types
+     INTEGER,LEN ::l1,l2
+     TYPE(ty1(2,5)), ALLOCATABLE::ty1_cmp(:)
+  END TYPE ty2
+  TYPE(ty2(4,8)) ::ty2_obj
+end program main
diff --git a/flang/test/Semantics/OpenMP/assumed-size-array.f90 b/flang/test/Semantics/OpenMP/assumed-size-array.f90
new file mode 100644
index 000000000000..6e36db178dc8
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/assumed-size-array.f90
@@ -0,0 +1,25 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! This should compile without errors. Check for a symptom of a reasonable
+! output.
+
+!CHECK: omp.task depend
+
+subroutine omp_task_depend_reproducer(work, myid, shift)
+  implicit none
+  integer, intent(in) :: myid, shift
+  real, intent(inout) :: work(*)
+
+!$omp parallel shared(work, myid, shift)
+  !$omp single
+    !$omp task depend(in:work(myid+shift-1)) depend(in:work(myid-1)) depend(out:work(myid))
+      call dummy_kernel(work(myid))
+    !$omp end task
+  !$omp end single
+!$omp end parallel
+contains
+  subroutine dummy_kernel(x)
+    real :: x
+    x = x + 1.0
+  end subroutine dummy_kernel
+end subroutine omp_task_depend_reproducer
diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt
index 469266cc8155..7516157731b5 100644
--- a/flang/tools/bbc/CMakeLists.txt
+++ b/flang/tools/bbc/CMakeLists.txt
@@ -30,6 +30,11 @@ target_link_libraries(bbc PRIVATE
   flangFrontend
   flangPasses
   FlangOpenMPTransforms
+  FortranSupport
+  FortranParser
+  FortranEvaluate
+  FortranSemantics
+  FortranLower
 )
 
 mlir_target_link_libraries(bbc PRIVATE
@@ -37,9 +42,4 @@ mlir_target_link_libraries(bbc PRIVATE
   ${extension_libs}
   MLIRAffineToStandard
   MLIRSCFToControlFlow
-  FortranSupport
-  FortranParser
-  FortranEvaluate
-  FortranSemantics
-  FortranLower
 )
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 2570d1a106d2..c98e2043464d 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -84,9 +84,6 @@ else()
   endif()
 endif()
 
-# Setup the paths where libclc runtimes should be stored.
-set( LIBCLC_OUTPUT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} )
-
 if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} )
   message( WARNING "Using custom LLVM tools to build libclc: "
     "${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR}, "
@@ -164,14 +161,34 @@ endif()
 
 list( SORT LIBCLC_TARGETS_TO_BUILD )
 
+# Construct LLVM version define
+set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" )
+
 # This needs to be set before any target that needs it
 # We need to use LLVM_INCLUDE_DIRS here, because if we are linking to an
 # llvm build directory, this includes $src/llvm/include which is where all the
 # headers are not $build/include/ which is what LLVM_INCLUDE_DIR is set to.
 include_directories( ${LLVM_INCLUDE_DIRS} )
 
-# Configure prepare_builtins
-add_subdirectory( utils )
+# Setup prepare_builtins tools
+set(LLVM_LINK_COMPONENTS
+  BitReader
+  BitWriter
+  Core
+  IRReader
+  Support
+)
+if( LIBCLC_STANDALONE_BUILD )
+  add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp )
+  set( prepare_builtins_exe prepare_builtins )
+  set( prepare_builtins_target prepare_builtins )
+else()
+  add_llvm_utility( prepare_builtins utils/prepare-builtins.cpp )
+  setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target )
+endif()
+target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} )
+# These were not properly reported in early LLVM and we don't need them
+target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions )
 
 # Setup arch devices
 set( r600--_devices cedar cypress barts cayman )
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index 056706ee629c..597bb642655e 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -120,14 +120,14 @@ function(link_bc)
   endif()
 
   add_custom_command(
-    OUTPUT ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc
-    COMMAND ${llvm-link_exe} ${link_flags} -o ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc ${LINK_INPUT_ARG}
+    OUTPUT ${ARG_TARGET}.bc
+    COMMAND ${llvm-link_exe} ${link_flags} -o ${ARG_TARGET}.bc ${LINK_INPUT_ARG}
     DEPENDS ${llvm-link_target} ${ARG_DEPENDENCIES} ${ARG_INPUTS} ${RSP_FILE}
   )
 
-  add_custom_target( ${ARG_TARGET} ALL DEPENDS ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc )
+  add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc )
   set_target_properties( ${ARG_TARGET} PROPERTIES
-    TARGET_FILE ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${ARG_TARGET}.bc
     FOLDER "libclc/Device IR/Linking"
   )
 endfunction()
@@ -356,98 +356,65 @@ function(add_libclc_builtin_set)
 
   set( builtins_link_lib $<TARGET_PROPERTY:${builtins_link_lib_tgt},TARGET_FILE> )
 
-  # For SPIR-V targets we diverage at this point and generate SPIR-V using the
-  # llvm-spirv tool.
   if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
-    set( obj_suffix ${ARG_ARCH_SUFFIX}.spv )
-    set( libclc_builtins_lib ${LIBCLC_OUTPUT_LIBRARY_DIR}/${obj_suffix} )
-    add_custom_command( OUTPUT ${libclc_builtins_lib}
-      COMMAND ${llvm-spirv_exe} ${spvflags} -o ${libclc_builtins_lib} ${builtins_link_lib}
+    set( spv_suffix ${ARG_ARCH_SUFFIX}.spv )
+    add_custom_command( OUTPUT ${spv_suffix}
+      COMMAND ${llvm-spirv_exe} ${spvflags} -o ${spv_suffix} ${builtins_link_lib}
       DEPENDS ${llvm-spirv_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
     )
-  else()
-    # Non-SPIR-V targets add an extra step to optimize the bytecode
-    set( builtins_opt_lib_tgt builtins.opt.${ARG_ARCH_SUFFIX} )
+    add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
+    set_target_properties( "prepare-${spv_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
+       DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
 
-    add_custom_command( OUTPUT ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-      COMMAND ${opt_exe} ${ARG_OPT_FLAGS} -o ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-        ${builtins_link_lib}
-      DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
-    )
-    add_custom_target( ${builtins_opt_lib_tgt}
-      ALL DEPENDS ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-    )
-    set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES
-      TARGET_FILE ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-      FOLDER "libclc/Device IR/Opt"
-    )
-
-    set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
-
-    set( obj_suffix ${ARG_ARCH_SUFFIX}.bc )
-    set( libclc_builtins_lib ${LIBCLC_OUTPUT_LIBRARY_DIR}/${obj_suffix} )
-    add_custom_command( OUTPUT ${libclc_builtins_lib}
-      COMMAND ${prepare_builtins_exe} -o ${libclc_builtins_lib} ${builtins_opt_lib}
-      DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target}
-    )
+    return()
   endif()
 
-  # Add a 'prepare' target
-  add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${libclc_builtins_lib} )
-  set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
-
-  # Also add a 'prepare' target for the triple. Since a triple may have
-  # multiple devices, ensure we only try to create the triple target once. The
-  # triple's target will build all of the bytecode for its constituent devices.
-  if( NOT TARGET prepare-${ARG_TRIPLE} )
-    add_custom_target( prepare-${ARG_TRIPLE} ALL )
-  endif()
-  add_dependencies( prepare-${ARG_TRIPLE} prepare-${obj_suffix} )
+  set( builtins_opt_lib_tgt builtins.opt.${ARG_ARCH_SUFFIX} )
 
-  install(
-    FILES ${libclc_builtins_lib}
-    DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
+  # Add opt target
+  add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc
+    COMMAND ${opt_exe} ${ARG_OPT_FLAGS} -o ${builtins_opt_lib_tgt}.bc
+      ${builtins_link_lib}
+    DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
+  )
+  add_custom_target( ${builtins_opt_lib_tgt}
+    ALL DEPENDS ${builtins_opt_lib_tgt}.bc
+  )
+  set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${builtins_opt_lib_tgt}.bc
+    FOLDER "libclc/Device IR/Opt"
   )
 
-  # SPIR-V targets can exit early here
-  if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
-    return()
-  endif()
+  set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
 
-  # Add a test for whether or not the libraries contain unresolved calls which
-  # would usually indicate a build problem. Note that we don't perform this
-  # test for all libclc targets:
-  # * nvptx-- targets don't include workitem builtins
-  # * clspv targets don't include all OpenCL builtins
+  # Add prepare target
+  set( obj_suffix ${ARG_ARCH_SUFFIX}.bc )
+  add_custom_command( OUTPUT ${obj_suffix}
+    COMMAND ${prepare_builtins_exe} -o ${obj_suffix} ${builtins_opt_lib}
+    DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target} )
+  add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} )
+  set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+
+  # nvptx-- targets don't include workitem builtins, and clspv targets don't
+  # include all OpenCL builtins
   if( NOT ARG_ARCH MATCHES "^(nvptx|clspv)(64)?$" )
     add_test( NAME external-calls-${obj_suffix}
-      COMMAND ./check_external_calls.sh ${libclc_builtins_lib} ${LLVM_TOOLS_BINARY_DIR}
+      COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
       WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
   endif()
 
-  if(CMAKE_HOST_UNIX OR LLVM_USE_SYMLINKS)
-    set(LIBCLC_LINK_OR_COPY create_symlink)
-  else()
-    set(LIBCLC_LINK_OR_COPY copy)
-  endif()
-
-  foreach( a IN LISTS ARG_ALIASES )
+  install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+  foreach( a ${ARG_ALIASES} )
     set( alias_suffix "${a}-${ARG_TRIPLE}.bc" )
     add_custom_command(
-      OUTPUT ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      COMMAND ${CMAKE_COMMAND} -E ${LIBCLC_LINK_OR_COPY} ${libclc_builtins_lib} ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      DEPENDS prepare-${obj_suffix}
-    )
-    add_custom_target( alias-${alias_suffix} ALL
-      DEPENDS ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-    )
-    set_target_properties( alias-${alias_suffix}
-      PROPERTIES FOLDER "libclc/Device IR/Aliases"
-    )
-    install(
-      FILES ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
-    )
+      OUTPUT ${alias_suffix}
+      COMMAND ${CMAKE_COMMAND} -E create_symlink ${obj_suffix} ${alias_suffix}
+      DEPENDS prepare-${obj_suffix} )
+    add_custom_target( alias-${alias_suffix} ALL DEPENDS ${alias_suffix} )
+    set_target_properties( alias-${alias_suffix} PROPERTIES FOLDER "libclc/Device IR/Aliases" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix}
+             DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
   endforeach( a )
 endfunction(add_libclc_builtin_set)
 
diff --git a/libclc/utils/CMakeLists.txt b/libclc/utils/CMakeLists.txt
deleted file mode 100644
index ea1d9e9c8ef5..000000000000
--- a/libclc/utils/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# Construct LLVM version define
-set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" )
-
-# Setup prepare_builtins tools
-set( LLVM_LINK_COMPONENTS
-  BitReader
-  BitWriter
-  Core
-  IRReader
-  Support
-)
-
-if( LIBCLC_STANDALONE_BUILD )
-  add_llvm_executable( prepare_builtins prepare-builtins.cpp )
-  set( prepare_builtins_exe prepare_builtins )
-  set( prepare_builtins_target prepare_builtins )
-else()
-  add_llvm_utility( prepare_builtins prepare-builtins.cpp )
-  setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target )
-endif()
-
-target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} )
-# These were not properly reported in early LLVM and we don't need them
-target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions )
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake
new file mode 100644
index 000000000000..c843c02977a8
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake
@@ -0,0 +1,2 @@
+set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "")
+set(LIBCXX_TEST_PARAMS "assertion_semantic=observe" CACHE STRING "")
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 17808841bd9e..1cdb3605c38a 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -39,6 +39,8 @@ modes are:
 
    Enabling hardening has no impact on the ABI.
 
+.. _notes-for-users:
+
 Notes for users
 ---------------
 
@@ -72,6 +74,10 @@ to control the level by passing **one** of the following options to the compiler
    pre-built components. Most libc++ code is header-based, so a user-provided
    value for ``_LIBCPP_HARDENING_MODE`` will be mostly respected.
 
+In some cases, users might want to override the assertion semantic used by the
+library. This can be done similarly to setting the hardening mode; please refer
+to the :ref:`relevant section <assertion-semantics>`.
+
 Notes for vendors
 -----------------
 
@@ -260,6 +266,68 @@ output. This is less secure and increases the size of the binary (among other
 things, it has to store the error message strings) but makes the failure easier
 to debug. It also allows testing the error messages in our test suite.
 
+This default behavior can be customized by users via :ref:`assertion semantics
+<assertion-semantics>`; it can also be completely overridden by vendors by
+providing a :ref:`custom assertion failure handler
+<override-assertion-handler>`.
+
+.. _assertion-semantics:
+
+Assertion semantics
+-------------------
+
+.. warning::
+
+  Assertion semantics are currently an experimental feature.
+
+.. note::
+
+  Assertion semantics are not available in the C++03 mode.
+
+What happens when an assertion fails depends on the assertion semantic being
+used. Four assertion semantics are available, based on C++26 Contracts
+evaluation semantics:
+
+- ``ignore`` evaluates the assertion but has no effect if it fails (note that it
+  differs from the Contracts ``ignore`` semantic which would not evaluate
+  the assertion at all);
+- ``observe`` logs an error (indicating, if possible on the platform, that the
+  error is fatal) but continues execution;
+- ``quick-enforce`` terminates the program as fast as possible via a trap
+  instruction. It is the default semantic for the production modes (``fast`` and
+  ``extensive``);
+- ``enforce`` logs an error and then terminates the program. It is the default
+  semantic for the ``debug`` mode.
+
+Notes:
+
+- Continuing execution after a hardening check fails results in undefined
+  behavior; the ``observe`` semantic is meant to make adopting hardening easier
+  but should not be used outside of the adoption period;
+- C++26 wording for Library Hardening precludes a conforming Hardened
+  implementation from using the Contracts ``ignore`` semantic when evaluating
+  hardened preconditions in the Library. Libc++ allows using this semantic for
+  hardened preconditions, but please be aware that using ``ignore`` does not
+  produce a conforming "Hardened" implementation, unlike the other semantics
+  above.
+
+The default assertion semantics are as follows:
+
+- ``fast``: ``quick-enforce``;
+- ``extensive``: ``quick-enforce``;
+- ``debug``: ``enforce``.
+
+The default assertion semantics can be overridden by passing **one** of the
+following options to the compiler:
+
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_IGNORE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_ENFORCE``
+
+The :ref:`notes <notes-for-users>` that apply to setting
+``_LIBCPP_HARDENING_MODE`` apply equally to setting this macro.
+
 .. _override-assertion-handler:
 
 Overriding the assertion failure handler
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6f18b61284f4..1410223d56a6 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -10,7 +10,7 @@ Written by the `Libc++ Team <https://libcxx.llvm.org>`_
 
 .. warning::
 
-   These are in-progress notes for the upcoming libc++ 20.0.0 release.
+   These are in-progress notes for the upcoming libc++ 21.0.0 release.
    Release notes for previous releases can be found on
    `the Download Page <https://releases.llvm.org/download.html>`_.
 
@@ -18,7 +18,7 @@ Introduction
 ============
 
 This document contains the release notes for the libc++ C++ Standard Library,
-part of the LLVM Compiler Infrastructure, release 20.0.0. Here we describe the
+part of the LLVM Compiler Infrastructure, release 21.0.0. Here we describe the
 status of libc++ in some detail, including major improvements from the previous
 release and new feature work. For the general LLVM release notes, see `the LLVM
 documentation <https://llvm.org/docs/ReleaseNotes.html>`_. All LLVM releases may
@@ -88,6 +88,12 @@ Improvements and New Features
 
 - ``ctype::tolower`` and ``ctype::toupper`` have been optimized, resulting in a 2x performance improvement.
 
+- As an experimental feature, Hardening now supports assertion semantics that allow customizing how a hardening
+  assertion failure is handled. The four available semantics, modeled on C++26 Contracts, are ``ignore``, ``observe``,
+  ``quick-enforce`` and ``enforce``. The ``observe`` semantic is intended to make it easier to adopt Hardening in
+  production but should not be used outside of this scenario. Please refer to the :ref:`Hardening documentation
+  <hardening>` for details.
+
 Deprecations and Removals
 -------------------------
 
@@ -140,6 +146,21 @@ ABI Affecting Changes
   comparison between shared libraries, since all RTTI has the correct visibility now. There is no behaviour change on
   Clang.
 
+- LLVM 20 contained an ABI break that can result in the size of ``std::unordered_{map,set,multimap,multiset}`` and ``std::deque`` changing when used with an allocator type that is empty and contains a base class that is the same across rebound allocator instantiations (e.g. ``Allocator<int>`` and ``Allocator<char>`` are both empty and contain the same base class).
+  In addition, the layout of a user-defined type that:
+
+    - contains one of the following containers: ``std::unordered_{map,set,multimap,multiset}``, ``std::deque``, ``std::map``, ``std::set``, ``std::multimap``, ``std::multiset``, ``std::list`` or ``std::vector``, and
+    - passes an empty allocator, comparator or hasher type to that container, and
+    - has a member of that same empty allocator, comparator or hasher type inside the enclosing struct, and
+    - that member is either marked with ``[[no_unique_address]]`` or optimized out via the EBO (empty base optimization) technique
+
+  saw its size increase from LLVM 19 to LLVM 20. This was caused by the usage of ``[[no_unique_address]]`` within some of libc++'s containers in a way that allowed subtle interactions with enclosing objects. This is fixed in LLVM 21 when using the Clang compiler (returning to the LLVM 19 ABI); however, that implies an ABI break from LLVM 20 to LLVM 21.
+
+  Furthermore, fixing this causes a slight regression to constant evaluation support in ``std::unique_ptr``. Specifically, constant evaluation will now fail when the deleter relies on being value-initialized for constant-evaluation admissibility. If a default-initialized deleter can be used during constant evaluation, or if the default constructor is non-trivial, the ``unique_ptr`` is not affected by this regression. In particular, this regression does not impact any ``unique_ptr`` using the default deleter.
+
+  Note that there is currently no realistic way to fix this ABI break on GCC; therefore, GCC will remain on the ABI introduced in LLVM 19. That also means that Clang and GCC will have a slightly different ABI for the small subset of types listed above until we are able to apply to GCC the same fix we applied for Clang.
+
+  For more details see https://llvm.org/PR154146.
 
 Build System Changes
 --------------------
diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst
index 79f5908bdc6b..415a59916837 100644
--- a/libcxx/docs/UserDocumentation.rst
+++ b/libcxx/docs/UserDocumentation.rst
@@ -72,6 +72,11 @@ when ``-fexperimental-library`` is passed:
 * ``std::chrono::tzdb`` and related time zone functionality
 * ``<syncstream>``
 
+Additionally, assertion semantics are an experimental feature that can be used
+to customize the behavior of Hardening (see :ref:`assertion-semantics`).
+Assertion semantics mirror the evaluation semantics of C++26 Contracts but are
+not a standard feature.
+
 .. note::
   Experimental libraries are experimental.
     * The contents of the ``<experimental/...>`` headers and the associated static
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 4f2a8dddad92..85758c671e1e 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -535,6 +535,7 @@ set(files
   __locale_dir/time.h
   __locale_dir/wbuffer_convert.h
   __locale_dir/wstring_convert.h
+  __log_hardening_failure
   __math/abs.h
   __math/copysign.h
   __math/error_functions.h
diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 90eaa6023587..a9451daf47f2 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
-             expression) " failed: " message "\n"))
+       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(                                                      \
+             __LINE__) ": libc++ Hardening assertion " _LIBCPP_TOSTRING(expression) " failed: " message "\n"))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/include/__config b/libcxx/include/__config
index d940461c3023..fd15e07c5ef7 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -28,7 +28,7 @@
 // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
 // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is
 // defined to XXYYZZ.
-#  define _LIBCPP_VERSION 210000
+#  define _LIBCPP_VERSION 210100
 
 #  define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
 #  define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
@@ -38,11 +38,47 @@
 #    define _LIBCPP_FREESTANDING
 #  endif
 
+// NOLINTNEXTLINE(libcpp-cpp-version-check)
+#  if __cplusplus < 201103L
+#    define _LIBCPP_CXX03_LANG
+#  endif
+
+#  if __has_feature(experimental_library)
+#    ifndef _LIBCPP_ENABLE_EXPERIMENTAL
+#      define _LIBCPP_ENABLE_EXPERIMENTAL
+#    endif
+#  endif
+
+// Incomplete features get their own specific disabling flags. This makes it
+// easier to grep for target specific flags once the feature is complete.
+#  if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY)
+#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1
+#  else
+#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0
+#  endif
+
+#  define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+
 // HARDENING {
 
-// TODO: Remove in LLVM 21. We're making this an error to catch folks who might not have migrated.
-#  ifdef _LIBCPP_ENABLE_ASSERTIONS
-#    error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE instead"
+// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated.
+//       Since hardening went through several changes (many of which impacted user-facing macros),
+//       we're keeping these checks around for a bit longer than usual. Failure to properly configure
+//       hardening results in checks being dropped silently, which is a pretty big deal.
+#  if defined(_LIBCPP_ENABLE_ASSERTIONS)
+#    error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_HARDENED_MODE)
+#    error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_SAFE_MODE)
+#    error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_DEBUG_MODE)
+#    error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
 #  endif
 
 // The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values:
@@ -147,16 +183,53 @@ _LIBCPP_HARDENING_MODE_EXTENSIVE, \
 _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
+// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts:
+// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts
+//   `ignore` semantic which wouldn't evaluate the assertion at all);
+// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution;
+// - `quick-enforce` terminates the program as fast as possible (via trapping);
+// - `enforce` logs an error and then terminates the program.
+//
+// Notes:
+// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant
+//   to make adopting hardening easier but should not be used outside of this scenario;
+// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts
+//   `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for
+//   hardened preconditions, however, be aware that using `ignore` does not produce a conforming "Hardened"
+//   implementation, unlike the other semantics above.
+// clang-format off
+#  define _LIBCPP_ASSERTION_SEMANTIC_IGNORE        (1 << 1)
+#  define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE       (1 << 2)
+#  define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3)
+#  define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE       (1 << 4)
+// clang-format on
+
+// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics.
+// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use
+// `enforce` (i.e., log and abort).
+#  ifndef _LIBCPP_ASSERTION_SEMANTIC
+
+#    if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#      define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+#    else
+#      define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#    endif
+
+#  else
+#    if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#      error "Assertion semantics are an experimental feature."
+#    endif
+#    if defined(_LIBCPP_CXX03_LANG)
+#      error "Assertion semantics are not available in the C++03 mode."
+#    endif
+
+#  endif // _LIBCPP_ASSERTION_SEMANTIC
+
 // } HARDENING
 
 #  define _LIBCPP_TOSTRING2(x) #x
 #  define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x)
 
-// NOLINTNEXTLINE(libcpp-cpp-version-check)
-#  if __cplusplus < 201103L
-#    define _LIBCPP_CXX03_LANG
-#  endif
-
 #  ifndef __has_constexpr_builtin
 #    define __has_constexpr_builtin(x) 0
 #  endif
@@ -190,24 +263,6 @@ _LIBCPP_HARDENING_MODE_DEBUG
 #    define _LIBCPP_ABI_VCRUNTIME
 #  endif
 
-#  if __has_feature(experimental_library)
-#    ifndef _LIBCPP_ENABLE_EXPERIMENTAL
-#      define _LIBCPP_ENABLE_EXPERIMENTAL
-#    endif
-#  endif
-
-// Incomplete features get their own specific disabling flags. This makes it
-// easier to grep for target specific flags once the feature is complete.
-#  if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY)
-#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1
-#  else
-#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0
-#  endif
-
-#  define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-#  define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-#  define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-
 #  if defined(__MVS__)
 #    include <features.h> // for __NATIVE_ASCII_F
 #  endif
diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config
index ef47327d9635..9b88a495055e 100644
--- a/libcxx/include/__cxx03/__config
+++ b/libcxx/include/__cxx03/__config
@@ -152,6 +152,10 @@ _LIBCPP_HARDENING_MODE_EXTENSIVE, \
 _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
+#  ifdef _LIBCPP_ASSERTION_SEMANTIC
+#    error "Assertion semantics are not available in the C++03 mode."
+#  endif
+
 // } HARDENING
 
 #  define _LIBCPP_TOSTRING2(x) #x
diff --git a/libcxx/include/__log_hardening_failure b/libcxx/include/__log_hardening_failure
new file mode 100644
index 000000000000..d1805306f6b6
--- /dev/null
+++ b/libcxx/include/__log_hardening_failure
@@ -0,0 +1,42 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___LOG_HARDENING_FAILURE
+#define _LIBCPP___LOG_HARDENING_FAILURE
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+// Hardening logging is not available in the C++03 mode; moreover, it is currently only available in the experimental
+// library.
+#if _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC && !defined(_LIBCPP_CXX03_LANG)
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// This function should never be called directly from the code -- it should only be called through the
+// `_LIBCPP_LOG_HARDENING_FAILURE` macro.
+[[__gnu__::__cold__]] _LIBCPP_EXPORTED_FROM_ABI void __log_hardening_failure(const char* __message) noexcept;
+
+// _LIBCPP_LOG_HARDENING_FAILURE(message)
+//
+// This macro is used to log an error without terminating the program (as is the case for hardening failures if the
+// `observe` assertion semantic is used).
+
+#  if !defined(_LIBCPP_LOG_HARDENING_FAILURE)
+#    define _LIBCPP_LOG_HARDENING_FAILURE(__message) ::std::__log_hardening_failure(__message)
+#  endif // !defined(_LIBCPP_LOG_HARDENING_FAILURE)
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC && !defined(_LIBCPP_CXX03_LANG)
+
+#endif // _LIBCPP___LOG_HARDENING_FAILURE
diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h
index fb7b7b7afcc8..29e503931b0b 100644
--- a/libcxx/include/__memory/compressed_pair.h
+++ b/libcxx/include/__memory/compressed_pair.h
@@ -80,21 +80,45 @@ class __compressed_pair_padding {
 template <class _ToPad>
 class __compressed_pair_padding<_ToPad, true> {};
 
-#  define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                  \
-    _LIBCPP_NO_UNIQUE_ADDRESS __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;    \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _)
-
-#  define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                              \
-    _LIBCPP_NO_UNIQUE_ADDRESS                                                                                          \
-    __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                                \
-                   __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                              \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _)
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef _LIBCPP_COMPILER_GCC
+#    define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                \
+      _LIBCPP_NO_UNIQUE_ADDRESS __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;  \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _)
+
+#    define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                            \
+      _LIBCPP_NO_UNIQUE_ADDRESS                                                                                        \
+      __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                              \
+                     __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                            \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _)
+#  else
+#    define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                \
+      struct {                                                                                                         \
+        _LIBCPP_NO_UNIQUE_ADDRESS                                                                                      \
+        __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;                          \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);      \
+      }
+
+#    define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                            \
+      struct {                                                                                                         \
+        _LIBCPP_NO_UNIQUE_ADDRESS                                                                                      \
+        __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                            \
+                       __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                          \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _);      \
+      }
+#  endif
 
 #else
 #  define _LIBCPP_COMPRESSED_PAIR(T1, Name1, T2, Name2)                                                                \
diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h
index 5ff2efbe5faa..3f5626c01443 100644
--- a/libcxx/include/__type_traits/invoke.h
+++ b/libcxx/include/__type_traits/invoke.h
@@ -67,20 +67,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if __has_builtin(__builtin_invoke)
 
-template <class... _Args>
-using __invoke_result_t _LIBCPP_NODEBUG = decltype(__builtin_invoke(std::declval<_Args>()...));
-
 template <class, class... _Args>
 struct __invoke_result_impl {};
 
 template <class... _Args>
-struct __invoke_result_impl<__void_t<__invoke_result_t<_Args...> >, _Args...> {
-  using type _LIBCPP_NODEBUG = __invoke_result_t<_Args...>;
+struct __invoke_result_impl<__void_t<decltype(__builtin_invoke(std::declval<_Args>()...))>, _Args...> {
+  using type _LIBCPP_NODEBUG = decltype(__builtin_invoke(std::declval<_Args>()...));
 };
 
 template <class... _Args>
 using __invoke_result _LIBCPP_NODEBUG = __invoke_result_impl<void, _Args...>;
 
+template <class... _Args>
+using __invoke_result_t _LIBCPP_NODEBUG = typename __invoke_result<_Args...>::type;
+
 template <class... _Args>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __invoke_result_t<_Args...> __invoke(_Args&&... __args)
     _NOEXCEPT_(noexcept(__builtin_invoke(std::forward<_Args>(__args)...))) {
diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map
index d6b92204f437..46815eaffa8b 100644
--- a/libcxx/include/ext/hash_map
+++ b/libcxx/include/ext/hash_map
@@ -744,7 +744,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); }
   _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); }
+  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); }
   _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); }
   template <class _InputIterator>
   _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last);
@@ -831,7 +831,7 @@ template <class _Key, class _Tp, class _Hash, class _Pred, class _Alloc>
 template <class _InputIterator>
 inline void hash_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) {
   for (; __first != __last; ++__first)
-    __table_.__emplace_unique(*__first);
+    __table_.__emplace_multi(*__first);
 }
 
 template <class _Key, class _Tp, class _Hash, class _Pred, class _Alloc>
diff --git a/libcxx/include/ext/hash_set b/libcxx/include/ext/hash_set
index 7fd5df24ed3a..62a7a0dbcffb 100644
--- a/libcxx/include/ext/hash_set
+++ b/libcxx/include/ext/hash_set
@@ -458,7 +458,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); }
   _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); }
+  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); }
   _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); }
   template <class _InputIterator>
   _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last);
@@ -543,7 +543,7 @@ template <class _Value, class _Hash, class _Pred, class _Alloc>
 template <class _InputIterator>
 inline void hash_multiset<_Value, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) {
   for (; __first != __last; ++__first)
-    __table_.__emplace_unique(*__first);
+    __table_.__emplace_multi(*__first);
 }
 
 template <class _Value, class _Hash, class _Pred, class _Alloc>
diff --git a/libcxx/include/map b/libcxx/include/map
index 225156580147..3d88b32dd426 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -692,12 +692,12 @@ public:
 #  if _LIBCPP_STD_VER >= 14
   template <typename _K2>
   _LIBCPP_HIDE_FROM_ABI bool operator()(const _K2& __x, const _CP& __y) const {
-    return __comp_(__x, __y.__get_value().first);
+    return __comp_(__x, __y.first);
   }
 
   template <typename _K2>
   _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _K2& __y) const {
-    return __comp_(__x.__get_value().first, __y);
+    return __comp_(__x.first, __y);
   }
 #  endif
 };
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 61ba1c381b2b..53f10ab8a92a 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -2353,6 +2353,9 @@ module std [system] {
     header "__std_mbstate_t.h"
     export *
   }
+  module log_hardening_failure {
+    header "__log_hardening_failure"
+  }
   module verbose_abort {
     header "__verbose_abort"
   }
diff --git a/libcxx/include/string b/libcxx/include/string
index 514dd91c7c17..d282071abf11 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -974,7 +974,12 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string()
       _NOEXCEPT_(is_nothrow_default_constructible<allocator_type>::value)
-      : __rep_() {
+#  if _LIBCPP_STD_VER >= 20 // TODO(LLVM 23): Remove this condition; this is a workaround for https://llvm.org/PR154567
+      : __rep_(__short())
+#  else
+      : __rep_()
+#  endif
+  {
     __annotate_new(0);
   }
 
@@ -984,7 +989,12 @@ public:
 #  else
       _NOEXCEPT
 #  endif
-      : __rep_(), __alloc_(__a) {
+#  if _LIBCPP_STD_VER >= 20 // TODO(LLVM 23): Remove this condition; this is a workaround for https://llvm.org/PR154567
+      : __rep_(__short()),
+#  else
+      : __rep_(),
+#  endif
+        __alloc_(__a) {
     __annotate_new(0);
   }
 
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 97fe57a5f24f..f59fe0e08fcc 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -309,6 +309,7 @@ add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS})
 # Build the experimental static library
 set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/keep.cpp
+  experimental/log_hardening_failure.cpp
   )
 
 if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
diff --git a/libcxx/src/experimental/log_hardening_failure.cpp b/libcxx/src/experimental/log_hardening_failure.cpp
new file mode 100644
index 000000000000..f836c1545224
--- /dev/null
+++ b/libcxx/src/experimental/log_hardening_failure.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__config>
+#include <__log_hardening_failure>
+#include <cstdio>
+
+#ifdef __BIONIC__
+#  include <syslog.h>
+#endif // __BIONIC__
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+void __log_hardening_failure(const char* message) noexcept {
+  // Always log the message to `stderr` in case the platform-specific system calls fail.
+  std::fputs(message, stderr);
+
+#if defined(__BIONIC__)
+  // Show error in logcat. The latter two arguments are ignored on Android.
+  openlog("libc++", 0, 0);
+  syslog(LOG_CRIT, "%s", message);
+  closelog();
+#endif
+}
+
+_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
new file mode 100644
index 000000000000..ea80359f1fea
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multimap::insert
+
+#include <cassert>
+#include <ext/hash_map>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multimap<int, int> map;
+
+  map.insert(std::make_pair(1, 1));
+  map.insert(std::make_pair(1, 1));
+
+  assert(map.size() == 2);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  std::pair<int, int> arr[] = {std::make_pair(1, 1), std::make_pair(1, 1)};
+
+  map.insert(arr, arr + 2);
+
+  assert(map.size() == 4);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  return 0;
+}
diff --git a/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
new file mode 100644
index 000000000000..1a60cac158a4
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multiset::insert
+
+#include <cassert>
+#include <ext/hash_set>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multiset<int> set;
+
+  set.insert(1);
+  set.insert(1);
+
+  assert(set.size() == 2);
+  assert(set.equal_range(1).first == set.begin());
+  assert(set.equal_range(1).second == set.end());
+
+  int arr[] = {1, 1};
+
+  set.insert(arr, arr + 2);
+
+  assert(set.size() == 4);
+  assert(set.equal_range(1).first == set.begin());
+  assert(set.equal_range(1).second == set.end());
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp b/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp
new file mode 100644
index 000000000000..dda071b8d029
--- /dev/null
+++ b/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Basic smoke test for `__log_hardening_failure`.
+//
+// UNSUPPORTED: c++03
+// UNSUPPORTED: libcpp-has-no-experimental-hardening-observe-semantic
+
+#include <__log_hardening_failure>
+
+#include "test_macros.h"
+
+ASSERT_NOEXCEPT(std::__log_hardening_failure(""));
+
+int main(int, char**) {
+  std::__log_hardening_failure("Some message");
+  // It's difficult to properly test platform-specific logging behavior of the function; just make sure it exists and
+  // can be called at runtime.
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp
new file mode 100644
index 000000000000..e0598b4ff174
--- /dev/null
+++ b/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp
@@ -0,0 +1,168 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
+#include <cstdint>
+#include <map>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+template <class T>
+class final_small_iter_allocator final {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  final_small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  final_small_iter_allocator(final_small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; }
+  friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class T, class Alloc>
+using map_alloc = std::map<T, T, std::less<T>, Alloc>;
+
+struct user_struct {
+  map_alloc<int, common_base_allocator<std::pair<const int, int> > > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 40, "");
+static_assert(sizeof(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 6, "");
+static_assert(sizeof(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 8, "");
+
+static_assert(sizeof(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 40, "");
+static_assert(sizeof(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 6, "");
+static_assert(sizeof(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 8, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::map<int, int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::map<int, int, AlignedLess>) == 32, "");
+
+#elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 12, "");
+static_assert(sizeof(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 12, "");
+static_assert(sizeof(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 6, "");
+static_assert(sizeof(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 8, "");
+
+static_assert(sizeof(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 12, "");
+static_assert(sizeof(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 12, "");
+static_assert(sizeof(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 6, "");
+static_assert(sizeof(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 8, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::map<int, int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::map<int, int, AlignedLess>) == 32, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp
new file mode 100644
index 000000000000..14f89b6c9dea
--- /dev/null
+++ b/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp
@@ -0,0 +1,168 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
+#include <cstdint>
+#include <set>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+template <class T>
+class final_small_iter_allocator final {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  final_small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  final_small_iter_allocator(final_small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; }
+  friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class T, class Alloc>
+using set_alloc = std::set<T, std::less<T>, Alloc>;
+
+struct user_struct {
+  set_alloc<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(set_alloc<int, std::allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, min_allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, test_allocator<int> >) == 40, "");
+static_assert(sizeof(set_alloc<int, small_iter_allocator<int> >) == 6, "");
+static_assert(sizeof(set_alloc<int, final_small_iter_allocator<int> >) == 8, "");
+
+static_assert(sizeof(set_alloc<char, std::allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, min_allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, test_allocator<char> >) == 40, "");
+static_assert(sizeof(set_alloc<char, small_iter_allocator<char> >) == 6, "");
+static_assert(sizeof(set_alloc<char, final_small_iter_allocator<char> >) == 8, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<int, std::allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, min_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, test_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, final_small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<char, std::allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, final_small_iter_allocator<char> >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::set<int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::set<int, AlignedLess>) == 32, "");
+
+#elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(set_alloc<int, std::allocator<int> >) == 12, "");
+static_assert(sizeof(set_alloc<int, min_allocator<int> >) == 12, "");
+static_assert(sizeof(set_alloc<int, test_allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, small_iter_allocator<int> >) == 6, "");
+static_assert(sizeof(set_alloc<int, final_small_iter_allocator<int> >) == 8, "");
+
+static_assert(sizeof(set_alloc<char, std::allocator<char> >) == 12, "");
+static_assert(sizeof(set_alloc<char, min_allocator<char> >) == 12, "");
+static_assert(sizeof(set_alloc<char, test_allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, small_iter_allocator<char> >) == 6, "");
+static_assert(sizeof(set_alloc<char, final_small_iter_allocator<char> >) == 8, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<int, std::allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, min_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, test_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, final_small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<char, std::allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, final_small_iter_allocator<char> >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::set<int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::set<int, AlignedLess>) == 32, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
index 55d42a8d017e..60c3e5bf31b0 100644
--- a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
@@ -12,8 +12,6 @@
 // unordered containers changes when bounded unique_ptr is enabled.
 // UNSUPPORTED: libcpp-has-abi-bounded-unique_ptr
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <cstdint>
 #include <unordered_map>
 
@@ -66,10 +64,41 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 template <class T, class Alloc>
 using unordered_map_alloc = std::unordered_map<T, T, std::hash<T>, std::equal_to<T>, Alloc>;
 
+struct user_struct {
+  unordered_map_alloc<int, common_base_allocator<std::pair<const int, int> > > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 48, "");
+#  else
+static_assert(sizeof(user_struct) == 40, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(unordered_map_alloc<int, std::allocator<std::pair<const int, int> > >) == 40, "");
 static_assert(sizeof(unordered_map_alloc<int, min_allocator<std::pair<const int, int> > >) == 40, "");
@@ -98,12 +127,22 @@ static_assert(TEST_ALIGNOF(unordered_map_alloc<char, final_small_iter_allocator<
 
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
-
-// This part of the ABI has been broken between LLVM 19 and LLVM 20.
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 64, "");
+#  else
+static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 96, "");
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 32, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 24, "");
+#  else
+static_assert(sizeof(user_struct) == 20, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(unordered_map_alloc<int, std::allocator<std::pair<const int, int> > >) == 20, "");
 static_assert(sizeof(unordered_map_alloc<int, min_allocator<std::pair<const int, int> > >) == 20, "");
@@ -133,7 +172,12 @@ static_assert(TEST_ALIGNOF(unordered_map_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 64);
+#  else
+static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 96);
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 32);
 
 #else
diff --git a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
index bee2012bbea2..0216cb1213ed 100644
--- a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
@@ -12,8 +12,6 @@
 // unordered containers changes when bounded unique_ptr is enabled.
 // UNSUPPORTED: libcpp-has-abi-bounded-unique_ptr
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <cstdint>
 #include <unordered_set>
 
@@ -66,10 +64,41 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 template <class T, class Alloc>
 using unordered_set_alloc = std::unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
 
+struct user_struct {
+  unordered_set_alloc<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 48, "");
+#  else
+static_assert(sizeof(user_struct) == 40, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(unordered_set_alloc<int, std::allocator<int> >) == 40, "");
 static_assert(sizeof(unordered_set_alloc<int, min_allocator<int> >) == 40, "");
@@ -98,11 +127,22 @@ static_assert(TEST_ALIGNOF(unordered_set_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
-// This part of the ABI has been broken between LLVM 19 and LLVM 20.
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 64, "");
+#  else
+static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 96, "");
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 32, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 24, "");
+#  else
+static_assert(sizeof(user_struct) == 20, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(unordered_set_alloc<int, std::allocator<int> >) == 20, "");
 static_assert(sizeof(unordered_set_alloc<int, min_allocator<int> >) == 20, "");
@@ -131,7 +171,12 @@ static_assert(TEST_ALIGNOF(unordered_set_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 64);
+#  else
+static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 96);
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 32);
 
 #else
diff --git a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
index 30586d8b2422..7eaf64ea09d1 100644
--- a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
@@ -60,6 +60,25 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 #if __SIZE_WIDTH__ == 64
 
 static_assert(sizeof(std::deque<int>) == 48, "");
@@ -67,24 +86,38 @@ static_assert(sizeof(std::deque<int, min_allocator<int> >) == 48, "");
 static_assert(sizeof(std::deque<int, test_allocator<int> >) == 80, "");
 static_assert(sizeof(std::deque<int, small_iter_allocator<int> >) == 12, "");
 static_assert(sizeof(std::deque<int, final_small_iter_allocator<int> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 56, "");
+#  else
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 48, "");
+#  endif
 
 static_assert(sizeof(std::deque<char>) == 48, "");
 static_assert(sizeof(std::deque<char, min_allocator<char> >) == 48, "");
 static_assert(sizeof(std::deque<char, test_allocator<char> >) == 80, "");
 static_assert(sizeof(std::deque<char, small_iter_allocator<char> >) == 12, "");
 static_assert(sizeof(std::deque<char, final_small_iter_allocator<char> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 56, "");
+#  else
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 48, "");
+#  endif
 
 static_assert(TEST_ALIGNOF(std::deque<int>) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, min_allocator<int> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, test_allocator<int> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, small_iter_allocator<int> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<int, final_small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<int, common_base_allocator<int> >) == 8, "");
 
 static_assert(TEST_ALIGNOF(std::deque<char>) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, min_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, small_iter_allocator<char> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<char, final_small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<char, common_base_allocator<char> >) == 8, "");
 
 #elif __SIZE_WIDTH__ == 32
 
@@ -93,24 +126,38 @@ static_assert(sizeof(std::deque<int, min_allocator<int> >) == 24, "");
 static_assert(sizeof(std::deque<int, test_allocator<int> >) == 48, "");
 static_assert(sizeof(std::deque<int, small_iter_allocator<int> >) == 12, "");
 static_assert(sizeof(std::deque<int, final_small_iter_allocator<int> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 28, "");
+#  else
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 24, "");
+#  endif
 
 static_assert(sizeof(std::deque<char>) == 24, "");
 static_assert(sizeof(std::deque<char, min_allocator<char> >) == 24, "");
 static_assert(sizeof(std::deque<char, test_allocator<char> >) == 48, "");
 static_assert(sizeof(std::deque<char, small_iter_allocator<char> >) == 12, "");
 static_assert(sizeof(std::deque<char, final_small_iter_allocator<char> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 28, "");
+#  else
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 24, "");
+#  endif
 
 static_assert(TEST_ALIGNOF(std::deque<int>) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, min_allocator<int> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, test_allocator<int> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, small_iter_allocator<int> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<int, final_small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<int, common_base_allocator<int> >) == 4, "");
 
 static_assert(TEST_ALIGNOF(std::deque<char>) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, min_allocator<char> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, test_allocator<char> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, small_iter_allocator<char> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<char, final_small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<char, common_base_allocator<char> >) == 4, "");
 
 #else
 #  error std::size_t has an unexpected size
diff --git a/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp
new file mode 100644
index 000000000000..cb500f9c4d61
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <forward_list>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::int16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::forward_list<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+static_assert(sizeof(user_struct) == 16, "");
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(std::forward_list<int>) == 8, "");
+static_assert(sizeof(std::forward_list<int, min_allocator<int> >) == 8, "");
+static_assert(sizeof(std::forward_list<int, test_allocator<int> >) == 24, "");
+static_assert(sizeof(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(sizeof(std::forward_list<char>) == 8, "");
+static_assert(sizeof(std::forward_list<char, min_allocator<char> >) == 8, "");
+static_assert(sizeof(std::forward_list<char, test_allocator<char> >) == 24, "");
+static_assert(sizeof(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<int>) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, min_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, test_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<char>) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+#elif __SIZE_WIDTH__ == 32
+static_assert(sizeof(user_struct) == 8, "");
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(std::forward_list<int>) == 4, "");
+static_assert(sizeof(std::forward_list<int, min_allocator<int> >) == 4, "");
+static_assert(sizeof(std::forward_list<int, test_allocator<int> >) == 16, "");
+static_assert(sizeof(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(sizeof(std::forward_list<char>) == 4, "");
+static_assert(sizeof(std::forward_list<char, min_allocator<char> >) == 4, "");
+static_assert(sizeof(std::forward_list<char, test_allocator<char> >) == 16, "");
+static_assert(sizeof(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<int>) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, min_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, test_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<char>) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
index a16ae1d52792..4b9c295c46f2 100644
--- a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
 #include <cstdint>
 #include <list>
 
@@ -38,7 +40,38 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::list<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::list<int>) == 24, "");
 static_assert(sizeof(std::list<int, min_allocator<int> >) == 24, "");
@@ -61,6 +94,13 @@ static_assert(TEST_ALIGNOF(std::list<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::list<char, small_iter_allocator<char> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::list<int>) == 12, "");
 static_assert(sizeof(std::list<int, min_allocator<int> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
index cc6b0d94e7da..598d54ffb91a 100644
--- a/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
@@ -40,7 +40,37 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::vector<bool, common_base_allocator<bool> > v;
+  [[no_unique_address]] common_base_allocator<bool> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::vector<bool>) == 24, "");
 static_assert(sizeof(std::vector<bool, min_allocator<bool> >) == 24, "");
@@ -53,6 +83,13 @@ static_assert(TEST_ALIGNOF(std::vector<bool, test_allocator<bool> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::vector<bool, small_iter_allocator<bool> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::vector<bool>) == 12, "");
 static_assert(sizeof(std::vector<bool, min_allocator<bool> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
index 57684951c8e8..6dc0504991b3 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
 #include <cstdint>
 #include <vector>
 
@@ -46,7 +48,38 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::vector<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::vector<int>) == 24, "");
 static_assert(sizeof(std::vector<int, min_allocator<int> >) == 24, "");
@@ -69,6 +102,13 @@ static_assert(TEST_ALIGNOF(std::vector<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::vector<char, small_iter_allocator<char> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::vector<int>) == 12, "");
 static_assert(sizeof(std::vector<int, min_allocator<int> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp
new file mode 100644
index 000000000000..cf802e214d07
--- /dev/null
+++ b/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <string>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+public:
+  using value_type        = T;
+  using difference_type   = std::int16_t;
+  using pointer           = small_pointer;
+  using reference         = T&;
+  using iterator_category = std::random_access_iterator_tag;
+
+private:
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::int16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class Alloc>
+using string_alloc = std::basic_string<char, std::char_traits<char>, Alloc>;
+
+struct user_struct {
+  string_alloc<common_base_allocator<char> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+static_assert(sizeof(user_struct) == 32, "");
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(string_alloc<std::allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<min_allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<test_allocator<char> >) == 32, "");
+static_assert(sizeof(string_alloc<small_iter_allocator<char> >) == 6, "");
+
+static_assert(TEST_ALIGNOF(string_alloc<std::allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<small_iter_allocator<char> >) == 2, "");
+
+#elif __SIZE_WIDTH__ == 32
+static_assert(sizeof(user_struct) == 16, "");
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(string_alloc<std::allocator<char> >) == 12, "");
+static_assert(sizeof(string_alloc<min_allocator<char> >) == 12, "");
+static_assert(sizeof(string_alloc<test_allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<small_iter_allocator<char> >) == 6, "");
+
+static_assert(TEST_ALIGNOF(string_alloc<std::allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<small_iter_allocator<char> >) == 2, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
index 3cf497da233f..3d97446ffe82 100644
--- a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
+++ b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
@@ -29,3 +29,7 @@
 #if !_LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM
 #  error "-fexperimental-library should enable the syncstream header"
 #endif
+
+#if !_LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC
+#  error "-fexperimental-library should allow using the Hardening observe semantic"
+#endif
diff --git a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
index 419a603a037f..2bc4648878f8 100644
--- a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
@@ -8,6 +8,8 @@
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// Without the assertion, the test will most likely time out.
+// UNSUPPORTED: libcpp-assertion-semantic={{ignore|observe}}
 
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
diff --git a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
index e61679554a62..30d36b5f6d7b 100644
--- a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
@@ -18,6 +18,8 @@
 
 // REQUIRES: has-unix-headers
 // REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// Without the assertion, the test will most likely time out.
+// UNSUPPORTED: libcpp-assertion-semantic={{ignore|observe}}
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 #include <latch>
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
index 91a7db1d9a7c..1a2d080d10c3 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
@@ -124,8 +124,9 @@ void test() {
 #if _LIBCPP_HAS_EXCEPTIONS
   // expected-error-re@*:* {{call to deleted constructor of{{.*}}}}
   // expected-error-re@*:* {{call to deleted constructor of{{.*}}}}
-  // expected-error-re@*:* 1-2{{call to deleted constructor of{{.*}}}}
-  // expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
 #endif
+// These diagnostics can additionally be produced by static_assert (see GH150601).
+// expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
+// expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
 }
 // clang-format on
diff --git a/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
index c7ba76517896..62491e2bc443 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
@@ -33,6 +33,10 @@ int main(int, char**) {
     typedef std::map<int, double, transparent_less_not_referenceable> M;
     assert(M().count(C2Int{5}) == 0);
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    assert(M().count(C2Int{5}) == 0);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
index 75724bdb387c..57ce9339f632 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
@@ -40,6 +40,13 @@ int main(int, char**) {
     P result = example.equal_range(C2Int{5});
     assert(result.first == result.second);
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    using P = std::pair<typename M::iterator, typename M::iterator>;
+    M example;
+    P result = example.equal_range(C2Int{5});
+    assert(result.first == result.second);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
index 9825d6c5879b..3f09d5608772 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.find(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.find(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
index fe7fe381a86e..308a2ed1e3af 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.lower_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.lower_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
index 525aa673ea74..332b71a84e9f 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.upper_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.upper_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
index 233d1a11e1d6..36f0ac2647ba 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
@@ -33,6 +33,10 @@ int main(int, char**) {
     typedef std::multimap<int, double, transparent_less_not_referenceable> M;
     assert(M().count(C2Int{5}) == 0);
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    assert(M().count(C2Int{5}) == 0);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
index 0bead6c7938d..a362c03e2638 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
@@ -40,6 +40,13 @@ int main(int, char**) {
     P result = example.equal_range(C2Int{5});
     assert(result.first == result.second);
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    using P = std::pair<typename M::iterator, typename M::iterator>;
+    M example;
+    P result = example.equal_range(C2Int{5});
+    assert(result.first == result.second);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
index 701d4e314e7a..ccb0900e7683 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.find(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.find(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
index 79f994847f13..4b4853062001 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.lower_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.lower_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
index 62f52416b915..f2ae94577b6c 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.upper_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.upper_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
index 09086a4c046d..8e57e8913dcb 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
@@ -7,9 +7,6 @@
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
-// The Clang version that Android currently uses in the CI is too old.
-// XFAIL: LIBCXX-ANDROID-FIXME
-
 // type_traits
 
 // is_bounded_array<T>
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
index 9aac871f2633..bd7da40daf2b 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
@@ -8,9 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
-// The Clang version that Android currently uses in the CI is too old.
-// XFAIL: LIBCXX-ANDROID-FIXME
-
 // type_traits
 
 // has_unique_object_representations
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
index 7f4c90922d6c..e2abfd17d27c 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
@@ -82,6 +82,10 @@ TEST_CONSTEXPR_CXX23 bool test_basic() {
     p.get_deleter().set_state(5);
     assert(p.get_deleter().state() == 5);
   }
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     std::unique_ptr<ElemType, DefaultCtorDeleter<ElemType> > p;
     assert(p.get() == 0);
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
index 45017a03b95d..f5e0541684d1 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
@@ -47,6 +47,10 @@ TEST_CONSTEXPR_CXX23 void test_basic() {
     assert(p.get() == 0);
     assert(p.get_deleter().state() == 0);
   }
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     std::unique_ptr<VT, DefaultCtorDeleter<VT> > p(nullptr);
     assert(p.get() == 0);
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
index cbce5c9c74c5..e9912d4574af 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
@@ -75,6 +75,10 @@ TEST_CONSTEXPR_CXX23 void test_pointer() {
   }
   if (!TEST_IS_CONSTANT_EVALUATED)
     assert(A::count == 0);
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     A* p = newValue<ValueT>(expect_alive);
     if (!TEST_IS_CONSTANT_EVALUATED)
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
index 142da1d820d9..6111138726db 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
@@ -173,6 +173,11 @@ void test_vector_bool() {
   assert(std::get<0>(v) == true);
 }
 
+struct ConvertibleFromAny { // implicitly constructible from a value of any type
+  template <class V>
+  ConvertibleFromAny(V) {}
+};
+
 int main(int, char**) {
   test_T_ctor_basic();
   test_T_ctor_noexcept();
@@ -180,5 +185,16 @@ int main(int, char**) {
   test_no_narrowing_check_for_class_types();
   test_construction_with_repeated_types();
   test_vector_bool();
+
+  { // Check that the constraints are evaluated lazily (see https://github.com/llvm/llvm-project/issues/151328)
+    struct Matcher {
+      Matcher() {}
+      Matcher(std::variant<ConvertibleFromAny>) {}
+    };
+
+    Matcher vec;
+    [[maybe_unused]] Matcher m = std::move(vec);
+  }
+
   return 0;
 }
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index a279400d651b..8416de76cfd6 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -44,15 +44,32 @@ static constexpr const char* Marker = "###";
 using MatchResult = std::pair<bool, std::string>;
 using Matcher     = std::function<MatchResult(const std::string& /*text*/)>;
 
-MatchResult MatchAssertionMessage(const std::string& text, std::string_view expected_message) {
+// Using the marker makes matching more precise, but we cannot output the marker when the `observe` semantic is used
+// (because it doesn't allow customizing the logging function). If the marker is not available, fall back to using less
+// precise matching by just the error message.
+MatchResult MatchAssertionMessage(const std::string& text, std::string_view expected_message, bool use_marker) {
   // Extract information from the error message. This has to stay synchronized with how we format assertions in the
   // library.
-  std::regex assertion_format(".*###\\n(.*):(\\d+): assertion (.*) failed: (.*)\\n###");
+  std::string assertion_format_string = [&] {
+    if (use_marker)
+      return (".*###\\n(.*):(\\d+): libc\\+\\+ Hardening assertion (.*) failed: (.*)\\n###");
+    return ("(.*):(\\d+): libc\\+\\+ Hardening assertion (.*) failed: (.*)\\n");
+  }();
+  std::regex assertion_format(assertion_format_string);
 
   std::smatch match_result;
-  bool has_match = std::regex_match(text, match_result, assertion_format);
-  assert(has_match);
-  assert(match_result.size() == 5);
+  // If a non-terminating assertion semantic is used, more than one assertion might be triggered before the process
+  // dies, so we cannot expect the entire target string to match.
+  bool has_match = std::regex_search(text, match_result, assertion_format);
+  if (!has_match || match_result.size() != 5) {
+    std::stringstream matching_error;
+    matching_error                                              //
+        << "Failed to parse the assertion message.\n"           //
+        << "Using marker:        " << use_marker << "\n"        //
+        << "Expected message:   '" << expected_message << "'\n" // streamed as a view: may lack a terminating NUL
+        << "Stderr contents:    '" << text << "'\n";
+    return MatchResult(/*success=*/false, matching_error.str());
+  }
 
   const std::string& file = match_result[1];
   int line                = std::stoi(match_result[2]);
@@ -72,9 +89,9 @@ MatchResult MatchAssertionMessage(const std::string& text, std::string_view expe
   return MatchResult(/*success=*/true, /*maybe_error=*/"");
 }
 
-Matcher MakeAssertionMessageMatcher(std::string_view assertion_message) {
+Matcher MakeAssertionMessageMatcher(std::string_view assertion_message, bool use_marker = true) {
   return [=](const std::string& text) { //
-    return MatchAssertionMessage(text, assertion_message);
+    return MatchAssertionMessage(text, assertion_message, use_marker);
   };
 }
 
@@ -85,13 +102,17 @@ Matcher MakeAnyMatcher() {
 }
 
 enum class DeathCause {
-  // Valid causes
+  // Valid causes.
   VerboseAbort = 1,
   StdAbort,
   StdTerminate,
   Trap,
-  // Invalid causes
+  // Causes that might be invalid or might stem from undefined behavior (relevant for non-terminating assertion
+  // semantics).
   DidNotDie,
+  Segfault,
+  ArithmeticError,
+  // Always invalid causes.
   SetupFailure,
   Unknown
 };
@@ -108,6 +129,16 @@ bool IsValidCause(DeathCause cause) {
   }
 }
 
+// Returns whether `cause` reports a problem with the test itself (failed setup or an uninterpretable death cause)
+// rather than a way in which the child process terminated.
+bool IsTestSetupErrorCause(DeathCause cause) {
+  if (cause == DeathCause::SetupFailure)
+    return true;
+  if (cause == DeathCause::Unknown)
+    return true;
+  return false;
+}
+
 std::string ToString(DeathCause cause) {
   switch (cause) {
   case DeathCause::VerboseAbort:
@@ -120,10 +151,14 @@ std::string ToString(DeathCause cause) {
     return "trap";
   case DeathCause::DidNotDie:
     return "<invalid cause (child did not die)>";
+  case DeathCause::Segfault:
+    return "<invalid cause (segmentation fault)>";
+  case DeathCause::ArithmeticError:
+    return "<invalid cause (fatal arithmetic error)>";
   case DeathCause::SetupFailure:
-    return "<invalid cause (child failed to set up test environment)>";
+    return "<test setup error (child failed to set up test environment)>";
   case DeathCause::Unknown:
-    return "<invalid cause (cause unknown)>";
+    return "<test setup error (test doesn't know how to interpret the death cause)>";
   }
 
   assert(false && "Unreachable");
@@ -225,9 +260,38 @@ class DeathTest {
     return DeathTestResult(Outcome::Success, cause);
   }
 
-  void PrintFailureDetails(std::string_view failure_description, std::string_view stmt, DeathCause cause) const {
-    std::fprintf(
-        stderr, "Failure: EXPECT_DEATH( %s ) failed!\n(reason: %s)\n\n", stmt.data(), failure_description.data());
+  // Variant of `Run` for non-terminating assertion semantics: the program invokes UB after the assertion fires, so
+  // the child process might or might not crash. Only the error message is checked; any death cause other than a
+  // test setup error is accepted.
+  template <class Func>
+  DeathTestResult RunWithoutGuaranteedDeath(Func&& func, const Matcher& matcher) {
+    std::signal(SIGABRT, [](int) { StopChildProcess(DeathCause::StdAbort); });
+    std::set_terminate([] { StopChildProcess(DeathCause::StdTerminate); });
+
+    const DeathCause death_cause = Run(func);
+
+    if (IsTestSetupErrorCause(death_cause))
+      return DeathTestResult(Outcome::InvalidCause, death_cause, ToString(death_cause));
+
+    // The child's stderr is the only thing validated from here on.
+    const MatchResult matched = matcher(GetChildStdErr());
+    if (matched.first)
+      return DeathTestResult(Outcome::Success, death_cause);
+
+    std::string description = "Child produced a different error message\n";
+    description += matched.second;
+    return DeathTestResult(Outcome::UnexpectedErrorMessage, death_cause, description);
+  }
+
+  void PrintFailureDetails(std::string_view invocation,
+                           std::string_view failure_description,
+                           std::string_view stmt,
+                           DeathCause cause) const {
+    std::fprintf(stderr,
+                 "Failure: %s( %s ) failed!\n(reason: %s)\n\n",
+                 invocation.data(),
+                 stmt.data(),
+                 failure_description.data());
 
     if (cause != DeathCause::Unknown) {
       std::fprintf(stderr, "child exit code: %d\n", GetChildExitCode());
@@ -311,10 +375,16 @@ class DeathTest {
 
     if (WIFSIGNALED(status_value)) {
       exit_code_ = WTERMSIG(status_value);
-      // `__builtin_trap` generqtes `SIGILL` on x86 and `SIGTRAP` on ARM.
+      // `__builtin_trap` generates `SIGILL` on x86 and `SIGTRAP` on ARM.
       if (exit_code_ == SIGILL || exit_code_ == SIGTRAP) {
         return DeathCause::Trap;
       }
+      if (exit_code_ == SIGSEGV) {
+        return DeathCause::Segfault;
+      }
+      if (exit_code_ == SIGFPE) {
+        return DeathCause::ArithmeticError;
+      }
     }
 
     return DeathCause::Unknown;
@@ -357,7 +427,7 @@ bool ExpectDeath(
   DeathTest test_case;
   DeathTestResult test_result = test_case.Run(expected_causes, func, matcher);
   if (!test_result.success()) {
-    test_case.PrintFailureDetails(test_result.failure_description(), stmt, test_result.cause());
+    test_case.PrintFailureDetails("EXPECT_DEATH", test_result.failure_description(), stmt, test_result.cause());
   }
 
   return test_result.success();
@@ -378,6 +448,22 @@ bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) {
   return ExpectDeath(std::array<DeathCause, 1>{expected_cause}, stmt, func, MakeAnyMatcher());
 }
 
+// Runs `func` without requiring the child to die; succeeds when the child's stderr satisfies `matcher`.
+template <class Func>
+bool ExpectLog(const char* stmt, Func&& func, const Matcher& matcher) {
+  DeathTest runner;
+  DeathTestResult outcome = runner.RunWithoutGuaranteedDeath(func, matcher);
+  if (outcome.success())
+    return true;
+  runner.PrintFailureDetails("EXPECT_LOG", outcome.failure_description(), stmt, outcome.cause());
+  return false;
+}
+
+template <class Func>
+bool ExpectLog(const char* stmt, Func&& func) {
+  return ExpectLog(stmt, func, MakeAnyMatcher());
+}
+
 // clang-format off
 
 /// Assert that the specified expression aborts with the expected cause and, optionally, error message.
@@ -392,13 +478,28 @@ bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) {
 #define EXPECT_STD_TERMINATE(...)                 \
     assert(  ExpectDeath(DeathCause::StdTerminate, #__VA_ARGS__, __VA_ARGS__)  )
 
-#if defined(_LIBCPP_HARDENING_MODE) && _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#if defined(_LIBCPP_ASSERTION_SEMANTIC)
+
+#if _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
 #define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
     assert(( ExpectDeath(DeathCause::VerboseAbort, #expr, [&]() { (void)(expr); }, MakeAssertionMessageMatcher(message)) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectDeath(DeathCause::Trap,         #expr, [&]() { (void)(expr); }) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectLog(#expr, [&]() { (void)(expr); }, MakeAssertionMessageMatcher(message, /*use_marker=*/false)) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectLog(#expr, [&]() { (void)(expr); }) ))
+#else
+#error "Unknown value for _LIBCPP_ASSERTION_SEMANTIC"
+#endif // _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+
 #else
 #define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
     assert(( ExpectDeath(DeathCause::Trap,         #expr, [&]() { (void)(expr); }) ))
-#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#endif // defined(_LIBCPP_ASSERTION_SEMANTIC)
 
 // clang-format on
 
diff --git a/libcxx/test/support/is_transparent.h b/libcxx/test/support/is_transparent.h
index 700c894a8b60..4b2a458f574a 100644
--- a/libcxx/test/support/is_transparent.h
+++ b/libcxx/test/support/is_transparent.h
@@ -36,6 +36,17 @@ struct transparent_less_not_referenceable
     using is_transparent = void () const &;  // it's a type; a weird one, but a type
 };
 
+// A transparent comparator that is deliberately non-empty, guarding against regressions in code paths where the
+// empty base class optimization is not suitable. See https://github.com/llvm/llvm-project/issues/152543.
+struct transparent_less_nonempty {
+  template <class T, class U>
+  constexpr bool operator()(T&& t, U&& u) const {
+    return std::forward<T>(t) < std::forward<U>(u);
+  }
+  struct is_transparent {
+  } pad_; // making this comparator non-empty
+};
+
 struct transparent_less_no_type
 {
     template <class T, class U>
diff --git a/libcxx/test/support/test.support/test_check_assertion.pass.cpp b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
index 4dfc5319aaf9..78e47b32cdd2 100644
--- a/libcxx/test/support/test.support/test_check_assertion.pass.cpp
+++ b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
@@ -53,7 +53,7 @@ bool TestDeathTest(
   }
 
   if (!maybe_failure_description.empty()) {
-    test_case.PrintFailureDetails(maybe_failure_description, stmt, test_result.cause());
+    test_case.PrintFailureDetails("EXPECT_DEATH", maybe_failure_description, stmt, test_result.cause());
     return false;
   }
 
@@ -76,9 +76,9 @@ DeathCause assertion_death_cause = DeathCause::Trap;
 #endif
 
 int main(int, char**) {
-  auto fail_assert     = [] { _LIBCPP_ASSERT(false, "Some message"); };
-  Matcher good_matcher = MakeAssertionMessageMatcher("Some message");
-  Matcher bad_matcher  = MakeAssertionMessageMatcher("Bad expected message");
+  [[maybe_unused]] auto fail_assert = [] { _LIBCPP_ASSERT(false, "Some message"); };
+  Matcher good_matcher              = MakeAssertionMessageMatcher("Some message");
+  Matcher bad_matcher               = MakeAssertionMessageMatcher("Bad expected message");
 
   // Test the implementation of death tests. We're bypassing the assertions added by the actual `EXPECT_DEATH` macros
   // which allows us to test failure cases (where the assertion would fail) as well.
@@ -89,16 +89,22 @@ int main(int, char**) {
     // Success -- trapping.
     TEST_DEATH_TEST(Outcome::Success, DeathCause::Trap, __builtin_trap());
 
+    // `_LIBCPP_ASSERT` does not terminate the program if the `observe` semantic is used, so these tests would fail with
+    // `DidNotDie` cause.
+#if _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+
     // Success -- assertion failure with any matcher.
     TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, MakeAnyMatcher(), fail_assert());
 
     // Success -- assertion failure with a specific matcher.
     TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, good_matcher, fail_assert());
 
-#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
     // Failure -- error message doesn't match.
     TEST_DEATH_TEST_MATCHES(Outcome::UnexpectedErrorMessage, assertion_death_cause, bad_matcher, fail_assert());
-#endif
+#  endif
+
+#endif // _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
 
     // Invalid cause -- child did not die.
     TEST_DEATH_TEST(Outcome::InvalidCause, DeathCause::DidNotDie, ((void)0));
@@ -125,7 +131,9 @@ int main(int, char**) {
     EXPECT_DEATH_MATCHES(simple_matcher, invoke_verbose_abort());
     EXPECT_STD_ABORT(invoke_abort());
     EXPECT_STD_TERMINATE([] { std::terminate(); });
+#if _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
     TEST_LIBCPP_ASSERT_FAILURE(fail_assert(), "Some message");
+#endif
   }
 
   return 0;
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index d8b23be9a032..57ecf1e49dbf 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -442,6 +442,12 @@ generic-hardening-mode-extensive)
     check-runtimes
     check-abi-list
 ;;
+generic-hardening-mode-extensive-observe-semantic)
+    clean
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake"
+    check-runtimes
+    check-abi-list
+;;
 generic-hardening-mode-debug)
     clean
     generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardening-mode-debug.cmake"
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index adfb2a9f6950..81c613421a46 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -361,6 +361,7 @@ def getSuitableClangTidy(cfg):
             AddFeature("libcpp-has-no-incomplete-pstl"),
             AddFeature("libcpp-has-no-experimental-tzdb"),
             AddFeature("libcpp-has-no-experimental-syncstream"),
+            AddFeature("libcpp-has-no-experimental-hardening-observe-semantic"),
         ],
     ),
     # TODO: This can be improved once we use a version of GoogleBenchmark that supports the dry-run mode.
@@ -454,5 +455,24 @@ def getSuitableClangTidy(cfg):
         help="Whether to test the main or C++03-specific headers. Only changes behaviour when std=c++03.",
         actions=lambda enabled: [] if not enabled else [AddFlag("-D_LIBCPP_USE_FROZEN_CXX03_HEADERS"), AddFeature("FROZEN-CXX03-HEADERS-FIXME")],
     ),
+    Parameter(
+        name='assertion_semantic',
+        choices=["ignore", "observe", "quick_enforce", "enforce", "undefined"],
+        type=str,
+        default="undefined",
+        help="Whether to override the assertion semantic used by hardening. This is only meaningful when running the "
+        "tests against libc++ with hardening enabled. By default, no assertion semantic is specified explicitly, so "
+        "the default one will be used (depending on the hardening mode).",
+        actions=lambda assertion_semantic: filter(
+            None,
+            [
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_IGNORE")        if assertion_semantic == "ignore" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE")       if assertion_semantic == "observe" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE") if assertion_semantic == "quick_enforce" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_ENFORCE")       if assertion_semantic == "enforce" else None,
+                AddFeature("libcpp-assertion-semantic={}".format(assertion_semantic))                   if assertion_semantic != "undefined" else None,
+            ],
+        ),
+    ),
 ]
 # fmt: on
diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in
index f115658f9f3c..d352405e905b 100644
--- a/libcxx/vendor/llvm/default_assertion_handler.in
+++ b/libcxx/vendor/llvm/default_assertion_handler.in
@@ -16,6 +16,7 @@
 #  include <__cxx03/__verbose_trap>
 #else
 #  include <__config>
+#  include <__log_hardening_failure>
 #  include <__verbose_abort>
 #  include <__verbose_trap>
 #endif
@@ -24,14 +25,40 @@
 #  pragma GCC system_header
 #endif
 
-#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
-#  define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+// Keep the old implementation that doesn't support assertion semantics for backward compatibility with the frozen C++03
+// mode.
+#  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+#  else
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+#  endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 
 #else
 
-#  define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+#  if _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
+#    define _LIBCPP_ASSERTION_HANDLER(message) ((void)0)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_LOG_HARDENING_FAILURE(message)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+
+#  else
+
+#    error _LIBCPP_ASSERTION_SEMANTIC must be set to one of the following values: \
+_LIBCPP_ASSERTION_SEMANTIC_IGNORE, \
+_LIBCPP_ASSERTION_SEMANTIC_OBSERVE, \
+_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE, \
+_LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+
+#  endif // _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
 
-#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP___ASSERTION_HANDLER
diff --git a/libcxxabi/src/demangle/DemangleConfig.h b/libcxxabi/src/demangle/DemangleConfig.h
index 06fd223f5553..7904e9d1eb13 100644
--- a/libcxxabi/src/demangle/DemangleConfig.h
+++ b/libcxxabi/src/demangle/DemangleConfig.h
@@ -19,6 +19,14 @@
 #include "../abort_message.h"
 #endif
 
+#ifndef _LIBCPP_LOG_HARDENING_FAILURE
+// Libc++abi does not have any functionality to log and continue, so we drop
+// error messages when the demangler is built with the `observe` assertion
+// semantic. Once the layering with libc++ is improved, this could use the
+// libc++ functionality to log hardening failures.
+#define _LIBCPP_LOG_HARDENING_FAILURE(message) ((void)0)
+#endif
+
 #include <version>
 
 #ifdef _MSC_VER
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 55db035e6204..9a1afd3721f5 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -173,7 +173,8 @@ bool DwarfFDECache<A>::_registeredForDyldUnloads = false;
 #endif
 
 template <typename A>
-typename A::pint_t DwarfFDECache<A>::findFDE(pint_t mh, pint_t pc) {
+typename DwarfFDECache<A>::pint_t DwarfFDECache<A>::findFDE(pint_t mh,
+                                                            pint_t pc) {
   pint_t result = 0;
   _LIBUNWIND_LOG_IF_FALSE(_lock.lock_shared());
   for (entry *p = _buffer; p < _bufferUsed; ++p) {
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 79b63e5b7236..91b6e632fa7e 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -192,6 +192,18 @@ struct Configuration {
   // Used for /lldltocachepolicy=policy
   llvm::CachePruningPolicy ltoCachePolicy;
 
+  // Used for /thinlto-distributor:<path>
+  StringRef dtltoDistributor;
+
+  // Used for /thinlto-distributor-arg:<arg>
+  llvm::SmallVector<llvm::StringRef, 0> dtltoDistributorArgs;
+
+  // Used for /thinlto-remote-compiler:<path>
+  StringRef dtltoCompiler;
+
+  // Used for /thinlto-remote-compiler-arg:<arg>
+  llvm::SmallVector<llvm::StringRef, 0> dtltoCompilerArgs;
+
   // Used for /opt:[no]ltodebugpassmanager
   bool ltoDebugPassManager = false;
 
@@ -307,7 +319,7 @@ struct Configuration {
   bool warnDebugInfoUnusable = true;
   bool warnLongSectionNames = true;
   bool warnStdcallFixup = true;
-  bool warnExportedDllMain = true;
+  bool warnImportedDllMain = true;
   bool incremental = true;
   bool integrityCheck = false;
   bool killAt = false;
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index c327da28ce13..3ce8853adb2a 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -244,40 +244,36 @@ static const uint8_t thunkX64[] = {
 };
 
 static const uint8_t tailMergeX64[] = {
-    0x51,                               // push    rcx
-    0x52,                               // push    rdx
-    0x41, 0x50,                         // push    r8
-    0x41, 0x51,                         // push    r9
-    0x48, 0x83, 0xEC, 0x48,             // sub     rsp, 48h
-    0x66, 0x0F, 0x7F, 0x04, 0x24,       // movdqa  xmmword ptr [rsp], xmm0
-    0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x10, // movdqa  xmmword ptr [rsp+10h], xmm1
-    0x66, 0x0F, 0x7F, 0x54, 0x24, 0x20, // movdqa  xmmword ptr [rsp+20h], xmm2
-    0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x30, // movdqa  xmmword ptr [rsp+30h], xmm3
-    0x48, 0x8B, 0xD0,                   // mov     rdx, rax
-    0x48, 0x8D, 0x0D, 0, 0, 0, 0,       // lea     rcx, [___DELAY_IMPORT_...]
-    0xE8, 0, 0, 0, 0,                   // call    __delayLoadHelper2
-    0x66, 0x0F, 0x6F, 0x04, 0x24,       // movdqa  xmm0, xmmword ptr [rsp]
-    0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x10, // movdqa  xmm1, xmmword ptr [rsp+10h]
-    0x66, 0x0F, 0x6F, 0x54, 0x24, 0x20, // movdqa  xmm2, xmmword ptr [rsp+20h]
-    0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x30, // movdqa  xmm3, xmmword ptr [rsp+30h]
-    0x48, 0x83, 0xC4, 0x48,             // add     rsp, 48h
-    0x41, 0x59,                         // pop     r9
-    0x41, 0x58,                         // pop     r8
-    0x5A,                               // pop     rdx
-    0x59,                               // pop     rcx
-    0xFF, 0xE0,                         // jmp     rax
+    0x48, 0x89, 0x4C, 0x24, 0x08,          // mov    qword ptr [rsp+8], rcx
+    0x48, 0x89, 0x54, 0x24, 0x10,          // mov    qword ptr [rsp+10h], rdx
+    0x4C, 0x89, 0x44, 0x24, 0x18,          // mov    qword ptr [rsp+18h], r8
+    0x4C, 0x89, 0x4C, 0x24, 0x20,          // mov    qword ptr [rsp+20h], r9
+    0x48, 0x83, 0xEC, 0x68,                // sub    rsp, 68h
+    0x66, 0x0F, 0x7F, 0x44, 0x24, 0x20,    // movdqa xmmword ptr [rsp+20h], xmm0
+    0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x30,    // movdqa xmmword ptr [rsp+30h], xmm1
+    0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40,    // movdqa xmmword ptr [rsp+40h], xmm2
+    0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50,    // movdqa xmmword ptr [rsp+50h], xmm3
+    0x48, 0x8B, 0xD0,                      // mov    rdx, rax
+    0x48, 0x8D, 0x0D, 0, 0, 0, 0,          // lea    rcx, [___DELAY_IMPORT_...]
+    0xE8, 0, 0, 0, 0,                      // call   __delayLoadHelper2
+    0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20,    // movdqa xmm0, xmmword ptr [rsp+20h]
+    0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30,    // movdqa xmm1, xmmword ptr [rsp+30h]
+    0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40,    // movdqa xmm2, xmmword ptr [rsp+40h]
+    0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50,    // movdqa xmm3, xmmword ptr [rsp+50h]
+    0x48, 0x8B, 0x4C, 0x24, 0x70,          // mov    rcx, qword ptr [rsp+70h]
+    0x48, 0x8B, 0x54, 0x24, 0x78,          // mov    rdx, qword ptr [rsp+78h]
+    0x4C, 0x8B, 0x84, 0x24, 0x80, 0, 0, 0, // mov    r8, qword ptr [rsp+80h]
+    0x4C, 0x8B, 0x8C, 0x24, 0x88, 0, 0, 0, // mov    r9, qword ptr [rsp+88h]
+    0x48, 0x83, 0xC4, 0x68,                // add    rsp, 68h
+    0xFF, 0xE0,                            // jmp    rax
 };
 
 static const uint8_t tailMergeUnwindInfoX64[] = {
     0x01,       // Version=1, Flags=UNW_FLAG_NHANDLER
-    0x0a,       // Size of prolog
-    0x05,       // Count of unwind codes
+    0x18,       // Size of prolog
+    0x01,       // Count of unwind codes
     0x00,       // No frame register
-    0x0a, 0x82, // Offset 0xa: UWOP_ALLOC_SMALL(0x48)
-    0x06, 0x02, // Offset 6: UWOP_ALLOC_SMALL(8)
-    0x04, 0x02, // Offset 4: UWOP_ALLOC_SMALL(8)
-    0x02, 0x02, // Offset 2: UWOP_ALLOC_SMALL(8)
-    0x01, 0x02, // Offset 1: UWOP_ALLOC_SMALL(8)
+    0x18, 0xC2, // Offset 0x18: UWOP_ALLOC_SMALL(0x68)
     0x00, 0x00  // Padding to align on 32-bits
 };
 
@@ -378,8 +374,8 @@ class TailMergeChunkX64 : public NonSectionCodeChunk {
 
   void writeTo(uint8_t *buf) const override {
     memcpy(buf, tailMergeX64, sizeof(tailMergeX64));
-    write32le(buf + 39, desc->getRVA() - rva - 43);
-    write32le(buf + 44, helper->getRVA() - rva - 48);
+    write32le(buf + 54, desc->getRVA() - rva - 58);
+    write32le(buf + 59, helper->getRVA() - rva - 63);
   }
 
   Chunk *desc = nullptr;
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 283aeed1a19c..570b8f9d0590 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -274,8 +274,13 @@ void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
       make<std::unique_ptr<Archive>>(std::move(file)); // take ownership
 
       int memberIndex = 0;
-      for (MemoryBufferRef m : getArchiveMembers(ctx, archive))
-        addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
+      for (MemoryBufferRef m : getArchiveMembers(ctx, archive)) {
+        if (!archive->isThin())
+          addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
+        else
+          addThinArchiveBuffer(m, "<whole-archive>");
+      }
+
       return;
     }
     addFile(make<ArchiveFile>(ctx, mbref));
@@ -386,6 +391,14 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   Log(ctx) << "Loaded " << obj << " for " << symName;
 }
 
+void LinkerDriver::addThinArchiveBuffer(MemoryBufferRef mb, StringRef symName) {
+  // Pass an empty string as the archive name and an offset of 0 so that
+  // the original filename is used as the buffer identifier. This is
+  // useful for DTLTO, where having the member identifier be the actual
+  // path on disk enables distribution of bitcode files during ThinLTO.
+  addArchiveBuffer(mb, symName, /*parentName=*/"", /*offsetInArchive=*/0);
+}
+
 void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
                                         const Archive::Symbol &sym,
                                         StringRef parentName) {
@@ -422,11 +435,8 @@ void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
       reportBufferError(errorCodeToError(mbOrErr.second), childName);
     llvm::TimeTraceScope timeScope("Archive: ",
                                    mbOrErr.first->getBufferIdentifier());
-    // Pass empty string as archive name so that the original filename is
-    // used as the buffer identifier.
-    ctx.driver.addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
-                                toCOFFString(ctx, sym), "",
-                                /*OffsetInArchive=*/0);
+    ctx.driver.addThinArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
+                                    toCOFFString(ctx, sym));
   });
 }
 
@@ -1643,8 +1653,8 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
         config->warnLocallyDefinedImported = false;
       else if (s == "longsections")
         config->warnLongSectionNames = false;
-      else if (s == "exporteddllmain")
-        config->warnExportedDllMain = false;
+      else if (s == "importeddllmain")
+        config->warnImportedDllMain = false;
       // Other warning numbers are ignored.
     }
   }
@@ -2088,6 +2098,23 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     Fatal(ctx) << "/manifestinput: requires /manifest:embed";
   }
 
+  // Handle /thinlto-distributor:<path>
+  config->dtltoDistributor = args.getLastArgValue(OPT_thinlto_distributor);
+
+  // Handle /thinlto-distributor-arg:<arg>
+  for (auto *arg : args.filtered(OPT_thinlto_distributor_arg))
+    config->dtltoDistributorArgs.push_back(arg->getValue());
+
+  // Handle /thinlto-remote-compiler:<path>
+  config->dtltoCompiler = args.getLastArgValue(OPT_thinlto_compiler);
+  if (!config->dtltoDistributor.empty() && config->dtltoCompiler.empty())
+    Err(ctx) << "A value must be specified for /thinlto-remote-compiler if "
+                "/thinlto-distributor is specified.";
+
+  // Handle /thinlto-remote-compiler-arg:<arg>
+  for (auto *arg : args.filtered(OPT_thinlto_compiler_arg))
+    config->dtltoCompilerArgs.push_back(arg->getValue());
+
   // Handle /dwodir
   config->dwoDir = args.getLastArgValue(OPT_dwodir);
 
@@ -2527,28 +2554,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
             e.symbolName = symtab.mangleMaybe(e.sym);
         }
 
-        // Add weak aliases. Weak aliases is a mechanism to give remaining
-        // undefined symbols final chance to be resolved successfully.
-        for (auto pair : symtab.alternateNames) {
-          StringRef from = pair.first;
-          StringRef to = pair.second;
-          Symbol *sym = symtab.find(from);
-          if (!sym)
-            continue;
-          if (auto *u = dyn_cast<Undefined>(sym)) {
-            if (u->weakAlias) {
-              // On ARM64EC, anti-dependency aliases are treated as undefined
-              // symbols unless a demangled symbol aliases a defined one, which
-              // is part of the implementation.
-              if (!symtab.isEC() || !u->isAntiDep)
-                continue;
-              if (!isa<Undefined>(u->weakAlias) &&
-                  !isArm64ECMangledFunctionName(u->getName()))
-                continue;
-            }
-            u->setWeakAlias(symtab.addUndefined(to));
-          }
-        }
+        symtab.resolveAlternateNames();
       });
 
       ctx.forEachActiveSymtab([&](SymbolTable &symtab) {
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index 14c97a98875b..5a9bd5c6d968 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -173,6 +173,7 @@ class LinkerDriver {
                  bool lazy);
   void addArchiveBuffer(MemoryBufferRef mbref, StringRef symName,
                         StringRef parentName, uint64_t offsetInArchive);
+  void addThinArchiveBuffer(MemoryBufferRef mbref, StringRef symName);
 
   void enqueueTask(std::function<void()> task);
   bool run();
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 0b7dbea8cdd9..2a6b63cbacca 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -117,8 +117,6 @@ static coff_symbol_generic *cloneSymbol(COFFSymbolRef sym) {
 // Skip importing DllMain thunks from import libraries.
 static bool fixupDllMain(COFFLinkerContext &ctx, llvm::object::Archive *file,
                          const Archive::Symbol &sym, bool &skipDllMain) {
-  if (skipDllMain)
-    return true;
   const Archive::Child &c =
       CHECK(sym.getMember(), file->getFileName() +
                                  ": could not get the member for symbol " +
@@ -128,13 +126,13 @@ static bool fixupDllMain(COFFLinkerContext &ctx, llvm::object::Archive *file,
             file->getFileName() +
                 ": could not get the buffer for a child buffer of the archive");
   if (identify_magic(mb.getBuffer()) == file_magic::coff_import_library) {
-    if (ctx.config.warnExportedDllMain) {
+    if (ctx.config.warnImportedDllMain) {
       // We won't place DllMain symbols in the symbol table if they are
       // coming from a import library. This message can be ignored with the flag
-      // '/ignore:exporteddllmain'
+      // '/ignore:importeddllmain'
       Warn(ctx)
           << file->getFileName()
-          << ": skipping exported DllMain symbol [exporteddllmain]\nNOTE: this "
+          << ": skipping imported DllMain symbol [importeddllmain]\nNOTE: this "
              "might be a mistake when the DLL/library was produced.";
     }
     skipDllMain = true;
@@ -204,14 +202,24 @@ void ArchiveFile::parse() {
     }
   }
 
-  // Read the symbol table to construct Lazy objects.
   bool skipDllMain = false;
+  StringRef mangledDllMain, impMangledDllMain;
+
+  // The calls below will fail if we haven't set the machine type yet. Instead
+  // of failing, it is preferable to skip this "imported DllMain" check if we
+  // don't know the machine type at this point.
+  if (!file->isEmpty() && ctx.config.machine != IMAGE_FILE_MACHINE_UNKNOWN) {
+    mangledDllMain = archiveSymtab->mangle("DllMain");
+    impMangledDllMain = uniqueSaver().save("__imp_" + mangledDllMain);
+  }
+
+  // Read the symbol table to construct Lazy objects.
   for (const Archive::Symbol &sym : file->symbols()) {
-    // If the DllMain symbol was exported by mistake, skip importing it
-    // otherwise we might end up with a import thunk in the final binary which
-    // is wrong.
-    if (sym.getName() == "__imp_DllMain" || sym.getName() == "DllMain") {
-      if (fixupDllMain(ctx, file.get(), sym, skipDllMain))
+    // If an import library provides the DllMain symbol, skip importing it, as
+    // we should be using our own DllMain, not another DLL's DllMain.
+    if (!mangledDllMain.empty() && (sym.getName() == mangledDllMain ||
+                                    sym.getName() == impMangledDllMain)) {
+      if (skipDllMain || fixupDllMain(ctx, file.get(), sym, skipDllMain))
         continue;
     }
     archiveSymtab->addLazyArchive(this, sym);
diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp
index 2a4d07cc2d01..1050874a1b10 100644
--- a/lld/COFF/LTO.cpp
+++ b/lld/COFF/LTO.cpp
@@ -110,7 +110,16 @@ BitcodeCompiler::BitcodeCompiler(COFFLinkerContext &c) : ctx(c) {
 
   // Initialize ltoObj.
   lto::ThinBackend backend;
-  if (ctx.config.thinLTOIndexOnly) {
+  if (!ctx.config.dtltoDistributor.empty()) {
+    backend = lto::createOutOfProcessThinBackend(
+        llvm::hardware_concurrency(ctx.config.thinLTOJobs),
+        /*OnWrite=*/nullptr,
+        /*ShouldEmitIndexFiles=*/false,
+        /*ShouldEmitImportFiles=*/false, ctx.config.outputFile,
+        ctx.config.dtltoDistributor, ctx.config.dtltoDistributorArgs,
+        ctx.config.dtltoCompiler, ctx.config.dtltoCompilerArgs,
+        !ctx.config.saveTempsArgs.empty());
+  } else if (ctx.config.thinLTOIndexOnly) {
     auto OnIndexWrite = [&](StringRef S) { thinIndices.erase(S); };
     backend = lto::createWriteIndexesThinBackend(
         llvm::hardware_concurrency(ctx.config.thinLTOJobs),
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index a887d7d351e1..2a82fb5cd884 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -270,6 +270,17 @@ def thinlto_object_suffix_replace : P<
 def thinlto_prefix_replace: P<
     "thinlto-prefix-replace",
     "'old;new' replace old prefix with new prefix in ThinLTO outputs">;
+def thinlto_distributor : P<"thinlto-distributor",
+  "Distributor to use for ThinLTO backend compilations. If specified, ThinLTO "
+  "backend compilations will be distributed">;
+def thinlto_distributor_arg : P<"thinlto-distributor-arg",
+  "Arguments to pass to the ThinLTO distributor">;
+def thinlto_compiler : P<"thinlto-remote-compiler",
+  "Compiler for the ThinLTO distributor to invoke for ThinLTO backend "
+  "compilations">;
+def thinlto_compiler_arg : P<"thinlto-remote-compiler-arg",
+  "Compiler arguments for the ThinLTO distributor to pass for ThinLTO backend "
+  "compilations">;
 def lto_obj_path : P<
     "lto-obj-path",
     "output native object for merged LTO unit to this path">;
diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index a54ea403ba2e..94eeae279797 100644
--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -1135,9 +1135,12 @@ static pdb::BulkPublic createPublic(COFFLinkerContext &ctx, Defined *def) {
   pub.setFlags(flags);
 
   OutputSection *os = ctx.getOutputSection(def->getChunk());
-  assert(os && "all publics should be in final image");
-  pub.Offset = def->getRVA() - os->getRVA();
-  pub.Segment = os->sectionIndex;
+  assert((os || !def->getChunk()->getSize()) &&
+         "all publics should be in final image");
+  if (os) {
+    pub.Offset = def->getRVA() - os->getRVA();
+    pub.Segment = os->sectionIndex;
+  }
   return pub;
 }
 
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index 0062df5820e6..d15e0c24410c 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -1344,6 +1344,44 @@ void SymbolTable::parseAlternateName(StringRef s) {
   alternateNames.insert(it, std::make_pair(from, to));
 }
 
+void SymbolTable::resolveAlternateNames() {
+  // Add weak aliases. Weak aliases are a mechanism to give remaining
+  // undefined symbols a final chance to be resolved successfully.
+  for (auto pair : alternateNames) {
+    StringRef from = pair.first;
+    StringRef to = pair.second;
+    // Only a symbol that exists and is still undefined can get an alias.
+    Symbol *sym = find(from);
+    auto *undef = sym ? dyn_cast<Undefined>(sym) : nullptr;
+    if (!undef)
+      continue;
+    if (undef->weakAlias) {
+      // On ARM64EC, anti-dependency aliases are treated as undefined
+      // symbols unless a demangled symbol aliases a defined one, which
+      // is part of the implementation.
+      if (!isEC() || !undef->isAntiDep)
+        continue;
+      if (!isa<Undefined>(undef->weakAlias) &&
+          !isArm64ECMangledFunctionName(undef->getName()))
+        continue;
+    }
+
+    // Check if the destination symbol is defined. If not, skip it.
+    // It may still be resolved later if more input files are added.
+    // Also skip anti-dependency targets, as they can't be chained anyway.
+    Symbol *target = find(to);
+    if (!target)
+      continue;
+    auto *targetUndef = dyn_cast<Undefined>(target);
+    if (targetUndef && (!targetUndef->weakAlias || targetUndef->isAntiDep))
+      continue;
+    target->isUsedInRegularObj = true;
+    if (target->isLazy())
+      forceLazy(target);
+    undef->setWeakAlias(target);
+  }
+}
+
 // Parses /aligncomm option argument.
 void SymbolTable::parseAligncomm(StringRef s) {
   auto [name, align] = s.split(',');
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 15e2644a6f51..7eb067640dc8 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -69,6 +69,9 @@ class SymbolTable {
   // symbols and warn about imported local symbols.
   void resolveRemainingUndefines();
 
+  // Try to resolve undefined symbols with alternate names.
+  void resolveAlternateNames();
+
   // Load lazy objects that are needed for MinGW automatic import and for
   // doing stdcall fixups.
   void loadMinGWSymbols();
diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp
index 479131a24dcf..9b33e78731c9 100644
--- a/lld/ELF/Arch/Hexagon.cpp
+++ b/lld/ELF/Arch/Hexagon.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "Thunks.h"
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -36,6 +37,10 @@ class Hexagon final : public TargetInfo {
                      const uint8_t *loc) const override;
   RelType getDynRel(RelType type) const override;
   int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
+  bool needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                  uint64_t branchAddr, const Symbol &s,
+                  int64_t a) const override;
+  bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
   void relocate(uint8_t *loc, const Relocation &rel,
                 uint64_t val) const override;
   void writePltHeader(uint8_t *buf) const override;
@@ -63,6 +68,8 @@ Hexagon::Hexagon(Ctx &ctx) : TargetInfo(ctx) {
   tlsGotRel = R_HEX_TPREL_32;
   tlsModuleIndexRel = R_HEX_DTPMOD_32;
   tlsOffsetRel = R_HEX_DTPREL_32;
+
+  needsThunks = true;
 }
 
 uint32_t Hexagon::calcEFlags() const {
@@ -258,6 +265,46 @@ static uint32_t findMaskR16(Ctx &ctx, uint32_t insn) {
 
 static void or32le(uint8_t *p, int32_t v) { write32le(p, read32le(p) | v); }
 
+// Return true if `dst` is reachable from `src` by a branch with relocation
+// `type`: the byte distance shifted right by 2 must fit the signed width for
+// that type. Relocation types not listed here are always treated as in range.
+bool Hexagon::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
+  int64_t offset = dst - src;
+  switch (type) {
+  case llvm::ELF::R_HEX_B22_PCREL:
+  case llvm::ELF::R_HEX_PLT_B22_PCREL:
+  case llvm::ELF::R_HEX_GD_PLT_B22_PCREL:
+  case llvm::ELF::R_HEX_LD_PLT_B22_PCREL:
+    return llvm::isInt<22>(offset >> 2);
+  case llvm::ELF::R_HEX_B15_PCREL:
+    return llvm::isInt<15>(offset >> 2);
+  case llvm::ELF::R_HEX_B13_PCREL:
+    return llvm::isInt<13>(offset >> 2);
+  case llvm::ELF::R_HEX_B9_PCREL:
+    return llvm::isInt<9>(offset >> 2);
+  default:
+    return true;
+  }
+}
+
+bool Hexagon::needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                         uint64_t branchAddr, const Symbol &s,
+                         int64_t a) const {
+  // A thunk is needed only for branch relocations whose target is out of range.
+  switch (type) {
+  case R_HEX_B22_PCREL:
+  case R_HEX_PLT_B22_PCREL:
+  case R_HEX_GD_PLT_B22_PCREL:
+  case R_HEX_LD_PLT_B22_PCREL:
+  case R_HEX_B15_PCREL:
+  case R_HEX_B13_PCREL:
+  case R_HEX_B9_PCREL:
+    return !ctx.target->inBranchRange(type, branchAddr, s.getVA(ctx, a));
+  default:
+    return false;
+  }
+}
+
 void Hexagon::relocate(uint8_t *loc, const Relocation &rel,
                        uint64_t val) const {
   switch (rel.type) {
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index fe804cbb0e69..8802c8c2e7f0 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+                     const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -809,10 +811,13 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i,
   // address.
   // Meanwhile skip undefined, preemptible and STT_GNU_IFUNC symbols, because
   // these symbols may be resolve in runtime.
+  // Moreover, relaxation can only occur if the addends of both relocations are
+  // zero for GOT references.
   if (rHi20.type == R_LARCH_GOT_PC_HI20 &&
-      (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
-       rHi20.sym->isGnuIFunc() ||
-       (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section)))
+      (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+       rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc() ||
+       (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section) ||
+       rHi20.addend != 0 || rLo12.addend != 0))
     return;
 
   uint64_t dest = 0;
@@ -966,10 +971,16 @@ static bool relax(Ctx &ctx, InputSection &sec) {
     case R_LARCH_GOT_PC_HI20:
     case R_LARCH_TLS_GD_PC_HI20:
     case R_LARCH_TLS_LD_PC_HI20:
-    case R_LARCH_TLS_DESC_PC_HI20:
       // The overflow check for i+2 will be carried out in isPairRelaxable.
-      if (r.expr != RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC &&
-          r.expr != R_RELAX_TLS_GD_TO_LE && isPairRelaxable(relocs, i))
+      if (isPairRelaxable(relocs, i))
+        relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
+      break;
+    case R_LARCH_TLS_DESC_PC_HI20:
+      if (r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
+          r.expr == R_RELAX_TLS_GD_TO_LE) {
+        if (relaxable(relocs, i))
+          remove = 4;
+      } else if (isPairRelaxable(relocs, i))
         relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
       break;
     case R_LARCH_CALL36:
@@ -987,6 +998,17 @@ static bool relax(Ctx &ctx, InputSection &sec) {
           isUInt<12>(r.sym->getVA(ctx, r.addend)))
         remove = 4;
       break;
+    case R_LARCH_TLS_DESC_PC_LO12:
+      if (relaxable(relocs, i) &&
+          (r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
+           r.expr == R_RELAX_TLS_GD_TO_LE))
+        remove = 4;
+      break;
+    case R_LARCH_TLS_DESC_LD:
+      if (relaxable(relocs, i) && r.expr == R_RELAX_TLS_GD_TO_LE &&
+          isUInt<12>(r.sym->getVA(ctx, r.addend)))
+        remove = 4;
+      break;
     }
 
     // For all anchors whose offsets are <= r.offset, they are preceded by
@@ -1135,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d    $a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Although the optimization has been performed, the GOT entries still
+// exist, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+                              const Relocation &rLo12, uint64_t secAddr) const {
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+    return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+      rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+    return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
+    return false;
+
+  // Check if the addends of both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+    return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe000000) != PCALAU12I ||
+      (nextInsn & 0xffc00000) != ldOpcode)
+    return false;
+
+  // Check if both instructions use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+    return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x80000000LL - 0x800;
+  const int64_t overflow = 0x80000000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+    return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
+                         rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+                         &sym};
+  uint64_t pageDelta =
+      getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
+                          getJ5(nextInsn), 0));
+  relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
+  return true;
+}
+
 // During TLSDESC GD_TO_IE, the converted code sequence always includes an
 // instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
 // in `getRelocTargetVA`, expr of this instruction should be adjusted to
@@ -1152,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
   return expr;
 }
 
+// Return true if every R_LARCH_GOT_PC_HI20 in `relocs` is followed by its
+// R_LARCH_GOT_PC_LO12, either directly or (when the HI20 is relaxable) after
+// one intervening relocation, and no R_LARCH_GOT_PC_LO12 appears unpaired.
+static bool pairForGotRels(ArrayRef<Relocation> relocs) {
+  const size_t size = relocs.size();
+  for (size_t i = 0; i != size; ++i) {
+    const RelType type = relocs[i].type;
+    // A LO12 reached here was not consumed by a preceding HI20.
+    if (type == R_LARCH_GOT_PC_LO12)
+      return false;
+    if (type != R_LARCH_GOT_PC_HI20)
+      continue;
+    if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) {
+      ++i; // Skip the paired LO12.
+    } else if (relaxable(relocs, i) && i + 2 < size &&
+               relocs[i + 2].type == R_LARCH_GOT_PC_LO12) {
+      i += 2; // Skip the intervening relocation and the paired LO12.
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
 void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   const unsigned bits = ctx.arg.is64 ? 64 : 32;
   uint64_t secAddr = sec.getOutputSection()->addr;
@@ -1161,6 +1279,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
     secAddr += ehIn->getParent()->outSecOff;
   bool isExtreme = false, isRelax = false;
   const MutableArrayRef<Relocation> relocs = sec.relocs();
+  const bool isPairForGotRels = pairForGotRels(relocs);
   for (size_t i = 0, size = relocs.size(); i != size; ++i) {
     Relocation &rel = relocs[i];
     uint8_t *loc = buf + rel.offset;
@@ -1216,6 +1335,10 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
                            bits);
         relocateNoSym(loc, rel.type, val);
       } else {
+        isRelax = relaxable(relocs, i);
+        if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
+                        rel.type == R_LARCH_TLS_DESC_PC_LO12))
+          continue;
         tlsdescToIe(loc, rel, val);
       }
       continue;
@@ -1232,9 +1355,32 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
                            bits);
         relocateNoSym(loc, rel.type, val);
       } else {
+        isRelax = relaxable(relocs, i);
+        if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
+                        rel.type == R_LARCH_TLS_DESC_PC_LO12 ||
+                        (rel.type == R_LARCH_TLS_DESC_LD && isUInt<12>(val))))
+          continue;
         tlsdescToLe(loc, rel, val);
       }
       continue;
+    case RE_LOONGARCH_GOT_PAGE_PC:
+      // In LoongArch, we try GOT indirection to PC relative optimization in
+      // normal or medium code model, whether or not with R_LARCH_RELAX
+      // relocation. Moreover, if the original code sequence can be relaxed to a
+      // single instruction `pcaddi`, the first instruction will be removed and
+      // it will not reach here.
+      if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) {
+        bool isRelax = relaxable(relocs, i);
+        const Relocation lo12Rel = isRelax ? relocs[i + 2] : relocs[i + 1];
+        if (lo12Rel.type == R_LARCH_GOT_PC_LO12 &&
+            tryGotToPCRel(loc, rel, lo12Rel, secAddr)) {
+          // isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12
+          // !isRelax: skip relocation R_LARCH_GOT_PC_LO12
+          i += isRelax ? 2 : 1;
+          continue;
+        }
+      }
+      break;
     default:
       break;
     }
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 71e72e7184b9..62b54f8217bc 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/LTO/LTO.h"
+#include "llvm/Object/Archive.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Support/ARMAttributeParser.h"
 #include "llvm/Support/ARMBuildAttributes.h"
@@ -1753,6 +1754,39 @@ static uint8_t getOsAbi(const Triple &t) {
   }
 }
 
+// For DTLTO, bitcode member names must be valid paths to files on disk, so
+// for thin archives resolve `memberPath` against the archive's directory.
+// Returns true if adjusted; false if DTLTO is off, on error, or if non-thin.
+static bool dtltoAdjustMemberPathIfThinArchive(Ctx &ctx, StringRef archivePath,
+                                               std::string &memberPath) {
+  assert(!archivePath.empty());
+
+  if (ctx.arg.dtltoDistributor.empty())
+    return false;
+
+  // Read only the leading magic bytes to determine if it's a thin archive.
+  auto bufferOrErr =
+      MemoryBuffer::getFileSlice(archivePath, sizeof(ThinArchiveMagic) - 1, 0);
+  if (std::error_code ec = bufferOrErr.getError()) {
+    ErrAlways(ctx) << "cannot open " << archivePath << ": " << ec.message();
+    return false;
+  }
+
+  if (!bufferOrErr->get()->getBuffer().starts_with(ThinArchiveMagic))
+    return false;
+
+  SmallString<128> resolvedPath;
+  if (path::is_relative(memberPath)) {
+    resolvedPath = path::parent_path(archivePath);
+    path::append(resolvedPath, memberPath);
+  } else
+    resolvedPath = memberPath;
+
+  path::remove_dots(resolvedPath, /*remove_dot_dot=*/true);
+  memberPath = resolvedPath.str();
+  return true;
+}
+
 BitcodeFile::BitcodeFile(Ctx &ctx, MemoryBufferRef mb, StringRef archiveName,
                          uint64_t offsetInArchive, bool lazy)
     : InputFile(ctx, BitcodeKind, mb) {
@@ -1763,17 +1797,22 @@ BitcodeFile::BitcodeFile(Ctx &ctx, MemoryBufferRef mb, StringRef archiveName,
   if (ctx.arg.thinLTOIndexOnly)
     path = replaceThinLTOSuffix(ctx, mb.getBufferIdentifier());
 
-  // ThinLTO assumes that all MemoryBufferRefs given to it have a unique
-  // name. If two archives define two members with the same name, this
-  // causes a collision which result in only one of the objects being taken
-  // into consideration at LTO time (which very likely causes undefined
-  // symbols later in the link stage). So we append file offset to make
-  // filename unique.
   StringSaver &ss = ctx.saver;
-  StringRef name = archiveName.empty()
-                       ? ss.save(path)
-                       : ss.save(archiveName + "(" + path::filename(path) +
-                                 " at " + utostr(offsetInArchive) + ")");
+  StringRef name;
+  if (archiveName.empty() ||
+      dtltoAdjustMemberPathIfThinArchive(ctx, archiveName, path)) {
+    name = ss.save(path);
+  } else {
+    // ThinLTO assumes that all MemoryBufferRefs given to it have a unique
+    // name. If two archives define two members with the same name, this
+    // causes a collision which result in only one of the objects being taken
+    // into consideration at LTO time (which very likely causes undefined
+    // symbols later in the link stage). So we append file offset to make
+    // filename unique.
+    name = ss.save(archiveName + "(" + path::filename(path) + " at " +
+                   utostr(offsetInArchive) + ")");
+  }
+
   MemoryBufferRef mbref(mb.getBuffer(), name);
 
   obj = CHECK2(lto::InputFile::create(mbref), this);
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index cebd564036b2..608cdd0d2666 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -2139,19 +2139,45 @@ void ThunkCreator::mergeThunks(ArrayRef<OutputSection *> outputSections) {
       });
 }
 
-static int64_t getPCBias(Ctx &ctx, RelType type) {
-  if (ctx.arg.emachine != EM_ARM)
-    return 0;
-  switch (type) {
-  case R_ARM_THM_JUMP19:
-  case R_ARM_THM_JUMP24:
-  case R_ARM_THM_CALL:
-    return 4;
-  default:
-    return 8;
+constexpr uint32_t HEXAGON_MASK_END_PACKET = 3 << 14;
+constexpr uint32_t HEXAGON_END_OF_PACKET = 3 << 14;
+constexpr uint32_t HEXAGON_END_OF_DUPLEX = 0 << 14;
+
+// Return the distance in bytes between the start of the packet and the
+// instruction that the relocation applies to.
+static int getHexagonPacketOffset(const InputSection &isec,
+                                  const Relocation &rel) {
+  const ArrayRef<uint8_t> data = isec.content();
+
+  // Look back at most 3 words; stop at the section start or a packet end.
+  for (unsigned i = 0;; i++) {
+    if (i == 3 || rel.offset < (i + 1) * 4)
+      return i * 4;
+    uint32_t instWord =
+        read32(isec.getCtx(), data.data() + (rel.offset - (i + 1) * 4));
+    if (((instWord & HEXAGON_MASK_END_PACKET) == HEXAGON_END_OF_PACKET) ||
+        ((instWord & HEXAGON_MASK_END_PACKET) == HEXAGON_END_OF_DUPLEX))
+      return i * 4;
   }
 }
 
+static int64_t getPCBias(Ctx &ctx, const InputSection &isec,
+                         const Relocation &rel) {
+  switch (ctx.arg.emachine) {
+  case EM_ARM:
+    // The listed R_ARM_THM_* branches use a bias of 4; all other types use 8.
+    if (rel.type == R_ARM_THM_JUMP19 || rel.type == R_ARM_THM_JUMP24 ||
+        rel.type == R_ARM_THM_CALL)
+      return 4;
+    return 8;
+  case EM_HEXAGON:
+    // The negated byte offset of the instruction within its packet.
+    return -getHexagonPacketOffset(isec, rel);
+  default:
+    return 0;
+  }
+}
+
 // Find or create a ThunkSection within the InputSectionDescription (ISD) that
 // is in range of Src. An ISD maps to a range of InputSections described by a
 // linker script section pattern such as { .text .text.* }.
@@ -2161,7 +2187,7 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os,
                                            const Relocation &rel,
                                            uint64_t src) {
   // See the comment in getThunk for -pcBias below.
-  const int64_t pcBias = getPCBias(ctx, rel.type);
+  const int64_t pcBias = getPCBias(ctx, *isec, rel);
   for (std::pair<ThunkSection *, uint32_t> tp : isd->thunkSections) {
     ThunkSection *ts = tp.first;
     uint64_t tsBase = os->addr + ts->outSecOff - pcBias;
@@ -2322,7 +2348,7 @@ std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec,
   // out in the relocation addend. We compensate for the PC bias so that
   // an Arm and Thumb relocation to the same destination get the same keyAddend,
   // which is usually 0.
-  const int64_t pcBias = getPCBias(ctx, rel.type);
+  const int64_t pcBias = getPCBias(ctx, *isec, rel);
   const int64_t keyAddend = rel.addend + pcBias;
 
   // We use a ((section, offset), addend) pair to find the thunk position if
@@ -2481,7 +2507,7 @@ bool ThunkCreator::createThunks(uint32_t pass,
             // STT_SECTION + non-zero addend, clear the addend after
             // redirection.
             if (ctx.arg.emachine != EM_MIPS)
-              rel.addend = -getPCBias(ctx, rel.type);
+              rel.addend = -getPCBias(ctx, *isec, rel);
           }
 
         for (auto &p : isd->thunkSections)
@@ -2525,7 +2551,8 @@ void elf::hexagonTLSSymbolUpdate(Ctx &ctx) {
           for (Relocation &rel : isec->relocs())
             if (rel.sym->type == llvm::ELF::STT_TLS && rel.expr == R_PLT_PC) {
               if (needEntry) {
-                sym->allocateAux(ctx);
+                if (sym->auxIdx == 0)
+                  sym->allocateAux(ctx);
                 addPltEntry(ctx, *ctx.in.plt, *ctx.in.gotPlt, *ctx.in.relaPlt,
                             ctx.target->pltRel, *sym);
                 needEntry = false;
diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp
index c26ba76bccb7..65d0f094c43c 100644
--- a/lld/ELF/Thunks.cpp
+++ b/lld/ELF/Thunks.cpp
@@ -415,6 +415,22 @@ class AVRThunk : public Thunk {
   void addSymbols(ThunkSection &isec) override;
 };
 
+// Hexagon CPUs need thunks for R_HEX_B{9,1{3,5},22}_PCREL and
+// R_HEX_{,GD_}PLT_B22_PCREL relocations when their destination is out of
+// range.
+class HexagonThunk : public Thunk {
+public:
+  HexagonThunk(Ctx &ctx, const InputSection &isec, Relocation &rel,
+               Symbol &dest)
+      : Thunk(ctx, dest, 0), relOffset(rel.offset) {
+    alignment = 4;
+  }
+  uint32_t relOffset;
+  uint32_t size() override { return ctx.arg.isPic ? 12 : 8; }
+  void writeTo(uint8_t *buf) override;
+  void addSymbols(ThunkSection &isec) override;
+};
+
 // MIPS LA25 thunk
 class MipsThunk final : public Thunk {
 public:
@@ -1519,6 +1535,39 @@ bool PPC64LongBranchThunk::isCompatibleWith(const InputSection &isec,
   return rel.type == R_PPC64_REL24 || rel.type == R_PPC64_REL14;
 }
 
+// Hexagon Target Thunks
+static uint64_t getHexagonThunkDestVA(Ctx &ctx, const Symbol &s, int64_t a) {
+  uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(ctx, a);
+  return SignExtend64<32>(v);
+}
+
+void HexagonThunk::writeTo(uint8_t *buf) {
+  uint64_t s = getHexagonThunkDestVA(ctx, destination, addend);
+  uint64_t p = getThunkTargetSym()->getVA(ctx);
+
+  if (ctx.arg.isPic) {
+    write32(ctx, buf + 0, 0x00004000); // {  immext(#0)
+    ctx.target->relocateNoSym(buf, R_HEX_B32_PCREL_X, s - p);
+    write32(ctx, buf + 4, 0x6a49c00e); //    r14 = add(pc,##0) }
+    ctx.target->relocateNoSym(buf + 4, R_HEX_6_PCREL_X, s - p);
+
+    write32(ctx, buf + 8, 0x528ec000); // {  jumpr r14 }
+  } else {
+    write32(ctx, buf + 0, 0x00004000); //  { immext
+    ctx.target->relocateNoSym(buf, R_HEX_B32_PCREL_X, s - p);
+    write32(ctx, buf + 4, 0x5800c000); //    jump <> }
+    ctx.target->relocateNoSym(buf + 4, R_HEX_B22_PCREL_X, s - p);
+  }
+}
+void HexagonThunk::addSymbols(ThunkSection &isec) {
+  Symbol *enclosing = isec.getEnclosingSymbol(relOffset);
+  StringRef src = enclosing ? enclosing->getName() : isec.name;
+
+  addSymbol(
+      saver().save("__hexagon_thunk_" + destination.getName() + "_from_" + src),
+      STT_FUNC, 0, isec);
+}
+
 Thunk::Thunk(Ctx &ctx, Symbol &d, int64_t a)
     : ctx(ctx), destination(d), addend(a), offset(0) {
   destination.thunkAccessed = true;
@@ -1692,6 +1741,24 @@ static std::unique_ptr<Thunk> addThunkAVR(Ctx &ctx, RelType type, Symbol &s,
   }
 }
 
+static std::unique_ptr<Thunk> addThunkHexagon(Ctx &ctx,
+                                              const InputSection &isec,
+                                              Relocation &rel, Symbol &s) {
+  switch (rel.type) {
+  case R_HEX_B9_PCREL:
+  case R_HEX_B13_PCREL:
+  case R_HEX_B15_PCREL:
+  case R_HEX_B22_PCREL:
+  case R_HEX_PLT_B22_PCREL:
+  case R_HEX_GD_PLT_B22_PCREL:
+    return std::make_unique<HexagonThunk>(ctx, isec, rel, s);
+  default:
+    Fatal(ctx) << "unrecognized relocation " << rel.type << " to " << &s
+               << " for hexagon target";
+    llvm_unreachable("");
+  }
+}
+
 static std::unique_ptr<Thunk> addThunkMips(Ctx &ctx, RelType type, Symbol &s) {
   if ((s.stOther & STO_MIPS_MICROMIPS) && isMipsR6(ctx))
     return std::make_unique<MicroMipsR6Thunk>(ctx, s);
@@ -1761,8 +1828,11 @@ std::unique_ptr<Thunk> elf::addThunk(Ctx &ctx, const InputSection &isec,
     return addThunkPPC32(ctx, isec, rel, s);
   case EM_PPC64:
     return addThunkPPC64(ctx, rel.type, s, a);
+  case EM_HEXAGON:
+    return addThunkHexagon(ctx, isec, rel, s);
   default:
-    llvm_unreachable("add Thunk only supported for ARM, AVR, Mips and PowerPC");
+    llvm_unreachable(
+        "add Thunk only supported for ARM, AVR, Hexagon, Mips and PowerPC");
   }
 }
 
diff --git a/lld/docs/DTLTO.rst b/lld/docs/DTLTO.rst
index 985decf6c7db..54fcc034d137 100644
--- a/lld/docs/DTLTO.rst
+++ b/lld/docs/DTLTO.rst
@@ -7,8 +7,7 @@ during the traditional link step.
 
 The implementation is documented here: https://llvm.org/docs/DTLTO.html.
 
-Currently, DTLTO is only supported in ELF LLD. Support will be added to other
-LLD flavours in the future.
+Currently, DTLTO is only supported in ELF and COFF LLD.
 
 ELF LLD
 -------
@@ -40,3 +39,37 @@ The command-line interface is as follows:
 Some LLD LTO options (e.g., ``--lto-sample-profile=<file>``) are supported.
 Currently, other options are silently accepted but do not have the intended
 effect. Support for such options will be expanded in the future.
+
+COFF LLD
+--------
+
+The command-line interface is as follows:
+
+- ``/thinlto-distributor:<path>``
+  Specifies the file to execute as the distributor process. If specified,
+  ThinLTO backend compilations will be distributed.
+
+- ``/thinlto-remote-compiler:<path>``
+  Specifies the path to the compiler that the distributor process will use for
+  backend compilations. The compiler invoked must match the version of LLD.
+
+- ``/thinlto-distributor-arg:<arg>``
+  Specifies ``<arg>`` on the command line when invoking the distributor.
+  Can be specified multiple times.
+
+- ``/thinlto-remote-compiler-arg:<arg>``
+  Appends ``<arg>`` to the remote compiler's command line.
+  Can be specified multiple times.
+
+  Options that introduce extra input/output files may cause miscompilation if
+  the distribution system does not automatically handle pushing/fetching them to
+  remote nodes. In such cases, configure the distributor - possibly using
+  ``/thinlto-distributor-arg:`` - to manage these dependencies. See the
+  distributor documentation for details.
+
+Some LLD LTO options (e.g., ``/lto-sample-profile:<file>``) are supported.
+Currently, other options are silently accepted but do not have the intended
+effect. Support for such options could be expanded in the future.
+
+Currently, there is no DTLTO command line interface supplied for ``clang-cl``,
+as users are expected to invoke LLD directly.
\ No newline at end of file
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 863b20189100..03671c6f3b6f 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -92,6 +92,9 @@ Breaking changes
 
 COFF Improvements
 -----------------
+* ``/thinlto-distributor`` and ``/thinlto-remote-compiler`` options
+  added to support Integrated Distributed ThinLTO.
+  (`#147265 <https://github.com/llvm/llvm-project/pull/147265>`_)
 
 MinGW Improvements
 ------------------
diff --git a/lld/test/COFF/alternatename-alias.s b/lld/test/COFF/alternatename-alias.s
new file mode 100644
index 000000000000..bd0a861380e9
--- /dev/null
+++ b/lld/test/COFF/alternatename-alias.s
@@ -0,0 +1,15 @@
+// REQUIRES: x86
+
+// Check that a weak alias can be used as an alternate name target.
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: lld-link -dll -noentry %t.obj -alternatename:sym=altsym
+
+        .data
+        .rva sym
+
+        .weak altsym
+        .set altsym,a
+
+        .globl a
+a:
+        .word 1
diff --git a/lld/test/COFF/alternatename-antidep.s b/lld/test/COFF/alternatename-antidep.s
new file mode 100644
index 000000000000..1188a9b75d48
--- /dev/null
+++ b/lld/test/COFF/alternatename-antidep.s
@@ -0,0 +1,16 @@
+// REQUIRES: x86
+
+// Check that an anti-dependency alias can't be used as an alternate name target.
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: not lld-link -dll -noentry %t.obj -alternatename:sym=altsym 2>&1 | FileCheck %s
+// CHECK: error: undefined symbol: sym
+
+        .data
+        .rva sym
+
+        .weak_anti_dep altsym
+        .set altsym,a
+
+        .globl a
+a:
+        .word 1
diff --git a/lld/test/COFF/alternatename-lib.s b/lld/test/COFF/alternatename-lib.s
new file mode 100644
index 000000000000..206fe6bc2397
--- /dev/null
+++ b/lld/test/COFF/alternatename-lib.s
@@ -0,0 +1,43 @@
+// REQUIRES: x86
+// RUN: split-file %s %t.dir && cd %t.dir
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows refab.s -o refab.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows aa.s -o aa.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows b.s -o b.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows antidep.s -o antidep.obj
+// RUN: llvm-lib -out:aa.lib aa.obj
+// RUN: llvm-lib -out:b.lib b.obj
+
+// Check that -alternatename with an undefined target does not prevent the symbol from being resolved to a library,
+// once another alternate name is resolved and pulls in the source symbol.
+// RUN: lld-link -out:out.dll -dll -noentry -machine:amd64 refab.obj aa.lib -alternatename:a=aa -alternatename:b=undef
+
+// Check that -alternatename with an anti-dependency target does not prevent the symbol from being resolved to a library,
+// after another alternate name is resolved and pulls in the source symbol.
+// RUN: lld-link -out:out2.dll -dll -noentry -machine:amd64 antidep.obj refab.obj aa.lib -alternatename:a=aa -alternatename:b=u
+
+#--- refab.s
+        .data
+        .rva a
+        .rva b
+
+#--- aa.s
+        .globl aa
+aa:
+        .word 1
+
+        .section .drectve, "yn"
+        .ascii "/defaultlib:b.lib"
+
+#--- b.s
+        .globl b
+b:
+        .word 2
+
+#--- antidep.s
+        .weak_anti_dep u
+        .set u,d
+
+        .globl d
+d:
+        .word 3
diff --git a/lld/test/COFF/alternatename-lto.ll b/lld/test/COFF/alternatename-lto.ll
new file mode 100644
index 000000000000..c3666cd3501d
--- /dev/null
+++ b/lld/test/COFF/alternatename-lto.ll
@@ -0,0 +1,25 @@
+; REQUIRES: x86
+; RUN: mkdir -p %t.dir
+; RUN: llvm-as -o %t.obj %s
+; RUN: lld-link -out:%t.dll -dll -noentry %t.obj -export:test
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc19.33.0"
+
+$alt = comdat any
+
+@alt = weak_odr dso_local global i32 0, comdat, align 4
+@ext = external dso_local global i32, align 4
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @test() #0 {
+entry:
+  %0 = load i32, ptr @ext, align 4
+  ret i32 %0
+}
+
+attributes #0 = { noinline nounwind optnone uwtable }
+
+!llvm.linker.options = !{!0}
+
+!0 = !{!"/alternatename:ext=alt"}
diff --git a/lld/test/COFF/arm64ec-altnames.s b/lld/test/COFF/arm64ec-altnames.s
index b2abb24efe4c..cca778ab8dc6 100644
--- a/lld/test/COFF/arm64ec-altnames.s
+++ b/lld/test/COFF/arm64ec-altnames.s
@@ -2,6 +2,7 @@ REQUIRES: aarch64
 RUN: split-file %s %t.dir && cd %t.dir
 
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ext.s -o ext.obj
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ext-mangled.s -o ext-mangled.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows impl.s -o impl.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows impl-cpp.s -o impl-cpp.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig.obj
@@ -49,6 +50,20 @@ RUN: lld-link -machine:arm64ec -dll -noentry -out:out4.dll impl-cpp.obj loadconf
 RUN: llvm-objdump -d out4.dll | FileCheck --check-prefix=DISASM %s
 RUN: llvm-readobj --hex-dump=.test out4.dll | FileCheck --check-prefix=TESTSEC %s
 
+# Check that when both mangled and demangled alternate names are used,
+# only the one whose target is defined is used (the mangled one in this case).
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out5.dll ext-mangled.obj loadconfig.obj "-alternatename:#func=#altsym" -alternatename:func=altsym
+RUN: llvm-objdump -d out5.dll | FileCheck --check-prefix=DISASM %s
+RUN: llvm-readobj --hex-dump=.test out5.dll | FileCheck --check-prefix=TESTSEC %s
+
+# Check that when both mangled and demangled alternate names are used,
+# only the one whose target is defined is used (the demangled one in this case).
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out6.dll ext.obj loadconfig.obj "-alternatename:#func=#altsym" -alternatename:func=altsym
+RUN: llvm-objdump -d out6.dll | FileCheck --check-prefix=DISASM2 %s
+RUN: llvm-readobj --hex-dump=.test out6.dll | FileCheck --check-prefix=TESTSEC2 %s
+
 #--- ext.s
         .weak_anti_dep func
 .set func, "#func"
@@ -70,6 +85,30 @@ altsym:
         mov w0, #1
         ret
 
+#--- ext-mangled.s
+        .weak_anti_dep func
+.set func, "#func"
+        .weak_anti_dep "#func"
+.set "#func", thunksym
+
+        .section .test, "r"
+        .rva func
+        .rva "#func"
+
+        .section .thnk,"xr",discard,thunksym
+thunksym:
+        mov w0, #2
+        ret
+
+        .section .text,"xr",discard,"#altsym"
+        .globl "#altsym"
+"#altsym":
+        mov w0, #1
+        ret
+
+        .weak_anti_dep altsym
+        .set altsym,"#altsym"
+
 #--- impl.s
         .weak_anti_dep func
 .set func, "#func"
diff --git a/lld/test/COFF/arm64ec-delayimport.test b/lld/test/COFF/arm64ec-delayimport.test
index 1e0bd899ba32..01d4ab89982e 100644
--- a/lld/test/COFF/arm64ec-delayimport.test
+++ b/lld/test/COFF/arm64ec-delayimport.test
@@ -51,28 +51,28 @@ DISASM-NEXT: 180002016: 48 8d 05 6b 50 00 00         leaq    0x506b(%rip), %rax
 DISASM-NEXT: 18000201d: e9 0c 00 00 00               jmp     0x18000202e <.text+0x102e>
 DISASM-NEXT: 180002022: 48 8d 05 67 50 00 00         leaq    0x5067(%rip), %rax      # 0x180007090
 DISASM-NEXT: 180002029: e9 00 00 00 00               jmp     0x18000202e <.text+0x102e>
-DISASM-NEXT: 18000202e: 51                           pushq   %rcx
-DISASM-NEXT: 18000202f: 52                           pushq   %rdx
-DISASM-NEXT: 180002030: 41 50                        pushq   %r8
-DISASM-NEXT: 180002032: 41 51                        pushq   %r9
-DISASM-NEXT: 180002034: 48 83 ec 48                  subq    $0x48, %rsp
-DISASM-NEXT: 180002038: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-DISASM-NEXT: 18000203d: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-DISASM-NEXT: 180002043: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-DISASM-NEXT: 180002049: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-DISASM-NEXT: 18000204f: 48 8b d0                     movq    %rax, %rdx
-DISASM-NEXT: 180002052: 48 8d 0d a7 21 00 00         leaq    0x21a7(%rip), %rcx      # 0x180004200
-DISASM-NEXT: 180002059: e8 aa ef ff ff               callq   0x180001008 <.text+0x8>
-DISASM-NEXT: 18000205e: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-DISASM-NEXT: 180002063: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-DISASM-NEXT: 180002069: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-DISASM-NEXT: 18000206f: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-DISASM-NEXT: 180002075: 48 83 c4 48                  addq    $0x48, %rsp
-DISASM-NEXT: 180002079: 41 59                        popq    %r9
-DISASM-NEXT: 18000207b: 41 58                        popq    %r8
-DISASM-NEXT: 18000207d: 5a                           popq    %rdx
-DISASM-NEXT: 18000207e: 59                           popq    %rcx
-DISASM-NEXT: 18000207f: ff e0                        jmpq    *%rax
+DISASM-NEXT: 18000202e: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+DISASM-NEXT: 180002033: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+DISASM-NEXT: 180002038: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+DISASM-NEXT: 18000203d: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+DISASM-NEXT: 180002042: 48 83 ec 68                  subq    $0x68, %rsp
+DISASM-NEXT: 180002046: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+DISASM-NEXT: 18000204c: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+DISASM-NEXT: 180002052: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+DISASM-NEXT: 180002058: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+DISASM-NEXT: 18000205e: 48 8b d0                     movq    %rax, %rdx
+DISASM-NEXT: 180002061: 48 8d 0d 90 21 00 00         leaq    0x2190(%rip), %rcx      # 0x1800041f8
+DISASM-NEXT: 180002068: e8 9b ef ff ff               callq   0x180001008 <.text+0x8>
+DISASM-NEXT: 18000206d: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+DISASM-NEXT: 180002073: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+DISASM-NEXT: 180002079: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+DISASM-NEXT: 18000207f: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+DISASM-NEXT: 180002085: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+DISASM-NEXT: 18000208a: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+DISASM-NEXT: 18000208f: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+DISASM-NEXT: 180002097: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+DISASM-NEXT: 18000209f: 48 83 c4 68                  addq    $0x68, %rsp
+DISASM-NEXT: 1800020a3: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s
 LOADCFG:      CHPEMetadata [
@@ -85,7 +85,7 @@ IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   Attributes: 0x1
 IMPORTS-NEXT:   ModuleHandle: 0x7080
 IMPORTS-NEXT:   ImportAddressTable: 0x7088
-IMPORTS-NEXT:   ImportNameTable: 0x4240
+IMPORTS-NEXT:   ImportNameTable: 0x4238
 IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:   Import {
@@ -141,7 +141,7 @@ RELOC-NEXT:     Address: 0x6008
 RELOC-NEXT:   }
 
 RUN: llvm-readobj --hex-dump=.pdata out.dll | FileCheck --check-prefix=PDATA %s
-PDATA: 0x180008000 2e200000 81200000 18400000
+PDATA: 0x180008000 2e200000 a5200000 18400000
 
 Verify that a demangled version of __delayLoadHelper2 can be used.
 
diff --git a/lld/test/COFF/arm64x-delayimport.test b/lld/test/COFF/arm64x-delayimport.test
index 56923ef748d0..2a68bce79baa 100644
--- a/lld/test/COFF/arm64x-delayimport.test
+++ b/lld/test/COFF/arm64x-delayimport.test
@@ -21,7 +21,7 @@ IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   Attributes: 0x1
 IMPORTS-NEXT:   ModuleHandle: 0x6080
 IMPORTS-NEXT:   ImportAddressTable: 0x6088
-IMPORTS-NEXT:   ImportNameTable: 0x4390
+IMPORTS-NEXT:   ImportNameTable: 0x4388
 IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:   Import {
@@ -35,7 +35,7 @@ IMPORTS-NEXT:     Name: test.dll
 IMPORTS-NEXT:     Attributes: 0x1
 IMPORTS-NEXT:     ModuleHandle: 0x6080
 IMPORTS-NEXT:     ImportAddressTable: 0x6098
-IMPORTS-NEXT:     ImportNameTable: 0x43A0
+IMPORTS-NEXT:     ImportNameTable: 0x4398
 IMPORTS-NEXT:     BoundDelayImportTable: 0x0
 IMPORTS-NEXT:     UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:     Import {
@@ -73,7 +73,7 @@ DISASM-NEXT: 180001040: ad0497e4     stp     q4, q5, [sp, #0x90]
 DISASM-NEXT: 180001044: ad059fe6     stp     q6, q7, [sp, #0xb0]
 DISASM-NEXT: 180001048: aa1103e1     mov     x1, x17
 DISASM-NEXT: 18000104c: f0000000     adrp    x0, 0x180004000
-DISASM-NEXT: 180001050: 910d4000     add     x0, x0, #0x350
+DISASM-NEXT: 180001050: 910d2000     add     x0, x0, #0x348
 DISASM-NEXT: 180001054: 97ffffeb     bl      0x180001000 <.text>
 DISASM-NEXT: 180001058: aa0003f0     mov     x16, x0
 DISASM-NEXT: 18000105c: ad459fe6     ldp     q6, q7, [sp, #0xb0]
@@ -105,28 +105,28 @@ DISASM-NEXT:                 ...
 DISASM-NEXT: 180003000: ff 25 92 30 00 00            jmpq    *0x3092(%rip)           # 0x180006098
 DISASM-NEXT: 180003006: 48 8d 05 8b 30 00 00         leaq    0x308b(%rip), %rax      # 0x180006098
 DISASM-NEXT: 18000300d: e9 00 00 00 00               jmp     0x180003012 <.text+0x2012>
-DISASM-NEXT: 180003012: 51                           pushq   %rcx
-DISASM-NEXT: 180003013: 52                           pushq   %rdx
-DISASM-NEXT: 180003014: 41 50                        pushq   %r8
-DISASM-NEXT: 180003016: 41 51                        pushq   %r9
-DISASM-NEXT: 180003018: 48 83 ec 48                  subq    $0x48, %rsp
-DISASM-NEXT: 18000301c: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-DISASM-NEXT: 180003021: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-DISASM-NEXT: 180003027: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-DISASM-NEXT: 18000302d: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-DISASM-NEXT: 180003033: 48 8b d0                     movq    %rax, %rdx
-DISASM-NEXT: 180003036: 48 8d 0d 13 13 00 00         leaq    0x1313(%rip), %rcx # 0x180004350
-DISASM-NEXT: 18000303d: e8 c6 ef ff ff               callq   0x180002008 <.text+0x1008>
-DISASM-NEXT: 180003042: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-DISASM-NEXT: 180003047: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-DISASM-NEXT: 18000304d: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-DISASM-NEXT: 180003053: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-DISASM-NEXT: 180003059: 48 83 c4 48                  addq    $0x48, %rsp
-DISASM-NEXT: 18000305d: 41 59                        popq    %r9
-DISASM-NEXT: 18000305f: 41 58                        popq    %r8
-DISASM-NEXT: 180003061: 5a                           popq    %rdx
-DISASM-NEXT: 180003062: 59                           popq    %rcx
-DISASM-NEXT: 180003063: ff e0                        jmpq    *%rax
+DISASM-NEXT: 180003012: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+DISASM-NEXT: 180003017: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+DISASM-NEXT: 18000301c: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+DISASM-NEXT: 180003021: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+DISASM-NEXT: 180003026: 48 83 ec 68                  subq    $0x68, %rsp
+DISASM-NEXT: 18000302a: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+DISASM-NEXT: 180003030: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+DISASM-NEXT: 180003036: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+DISASM-NEXT: 18000303c: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+DISASM-NEXT: 180003042: 48 8b d0                     movq    %rax, %rdx
+DISASM-NEXT: 180003045: 48 8d 0d fc 12 00 00         leaq    0x12fc(%rip), %rcx      # 0x180004348
+DISASM-NEXT: 18000304c: e8 b7 ef ff ff               callq   0x180002008 <.text+0x1008>
+DISASM-NEXT: 180003051: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+DISASM-NEXT: 180003057: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+DISASM-NEXT: 18000305d: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+DISASM-NEXT: 180003063: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+DISASM-NEXT: 180003069: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+DISASM-NEXT: 18000306e: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+DISASM-NEXT: 180003073: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+DISASM-NEXT: 18000307b: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+DISASM-NEXT: 180003083: 48 83 c4 68                  addq    $0x68, %rsp
+DISASM-NEXT: 180003087: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s
 LOADCFG:      AuxiliaryDelayloadIAT: 0x5000
@@ -230,7 +230,7 @@ EC-IMPORTS-NEXT:   Name: test.dll
 EC-IMPORTS-NEXT:   Attributes: 0x1
 EC-IMPORTS-NEXT:   ModuleHandle: 0x6080
 EC-IMPORTS-NEXT:   ImportAddressTable: 0x6088
-EC-IMPORTS-NEXT:   ImportNameTable: 0x4388
+EC-IMPORTS-NEXT:   ImportNameTable: 0x4380
 EC-IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 EC-IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 EC-IMPORTS-NEXT: }
@@ -243,7 +243,7 @@ EC-IMPORTS-NEXT:     Name: test.dll
 EC-IMPORTS-NEXT:     Attributes: 0x1
 EC-IMPORTS-NEXT:     ModuleHandle: 0x6080
 EC-IMPORTS-NEXT:     ImportAddressTable: 0x6090
-EC-IMPORTS-NEXT:     ImportNameTable: 0x4390
+EC-IMPORTS-NEXT:     ImportNameTable: 0x4388
 EC-IMPORTS-NEXT:     BoundDelayImportTable: 0x0
 EC-IMPORTS-NEXT:     UnloadDelayImportTable: 0x0
 EC-IMPORTS-NEXT:     Import {
@@ -279,28 +279,28 @@ EC-DISASM-NEXT:                 ...
 EC-DISASM-NEXT: 180003000: ff 25 8a 30 00 00            jmpq    *0x308a(%rip)           # 0x180006090
 EC-DISASM-NEXT: 180003006: 48 8d 05 83 30 00 00         leaq    0x3083(%rip), %rax      # 0x180006090
 EC-DISASM-NEXT: 18000300d: e9 00 00 00 00               jmp     0x180003012 <.text+0x2012>
-EC-DISASM-NEXT: 180003012: 51                           pushq   %rcx
-EC-DISASM-NEXT: 180003013: 52                           pushq   %rdx
-EC-DISASM-NEXT: 180003014: 41 50                        pushq   %r8
-EC-DISASM-NEXT: 180003016: 41 51                        pushq   %r9
-EC-DISASM-NEXT: 180003018: 48 83 ec 48                  subq    $0x48, %rsp
-EC-DISASM-NEXT: 18000301c: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-EC-DISASM-NEXT: 180003021: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-EC-DISASM-NEXT: 180003027: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-EC-DISASM-NEXT: 18000302d: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-EC-DISASM-NEXT: 180003033: 48 8b d0                     movq    %rax, %rdx
-EC-DISASM-NEXT: 180003036: 48 8d 0d 0b 13 00 00         leaq    0x130b(%rip), %rcx      # 0x180004348
-EC-DISASM-NEXT: 18000303d: e8 c6 ef ff ff               callq   0x180002008 <.text+0x1008>
-EC-DISASM-NEXT: 180003042: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-EC-DISASM-NEXT: 180003047: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-EC-DISASM-NEXT: 18000304d: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-EC-DISASM-NEXT: 180003053: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-EC-DISASM-NEXT: 180003059: 48 83 c4 48                  addq    $0x48, %rsp
-EC-DISASM-NEXT: 18000305d: 41 59                        popq    %r9
-EC-DISASM-NEXT: 18000305f: 41 58                        popq    %r8
-EC-DISASM-NEXT: 180003061: 5a                           popq    %rdx
-EC-DISASM-NEXT: 180003062: 59                           popq    %rcx
-EC-DISASM-NEXT: 180003063: ff e0                        jmpq    *%rax
+EC-DISASM-NEXT: 180003012: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+EC-DISASM-NEXT: 180003017: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+EC-DISASM-NEXT: 18000301c: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+EC-DISASM-NEXT: 180003021: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+EC-DISASM-NEXT: 180003026: 48 83 ec 68                  subq    $0x68, %rsp
+EC-DISASM-NEXT: 18000302a: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+EC-DISASM-NEXT: 180003030: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+EC-DISASM-NEXT: 180003036: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+EC-DISASM-NEXT: 18000303c: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+EC-DISASM-NEXT: 180003042: 48 8b d0                     movq    %rax, %rdx
+EC-DISASM-NEXT: 180003045: 48 8d 0d f4 12 00 00         leaq    0x12f4(%rip), %rcx      # 0x180004340
+EC-DISASM-NEXT: 18000304c: e8 b7 ef ff ff               callq   0x180002008 <.text+0x1008>
+EC-DISASM-NEXT: 180003051: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+EC-DISASM-NEXT: 180003057: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+EC-DISASM-NEXT: 18000305d: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+EC-DISASM-NEXT: 180003063: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+EC-DISASM-NEXT: 180003069: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+EC-DISASM-NEXT: 18000306e: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+EC-DISASM-NEXT: 180003073: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+EC-DISASM-NEXT: 18000307b: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+EC-DISASM-NEXT: 180003083: 48 83 c4 68                  addq    $0x68, %rsp
+EC-DISASM-NEXT: 180003087: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out-ec.dll | FileCheck --check-prefix=EC-LOADCFG %s
 EC-LOADCFG:      AuxiliaryDelayloadIAT: 0x5000
diff --git a/lld/test/COFF/delayimports.test b/lld/test/COFF/delayimports.test
index f410eef35fd1..ed074f462c7d 100644
--- a/lld/test/COFF/delayimports.test
+++ b/lld/test/COFF/delayimports.test
@@ -10,7 +10,7 @@ IMPORT-NEXT:   Name: std64.dll
 IMPORT-NEXT:   Attributes: 0x1
 IMPORT-NEXT:   ModuleHandle: 0x3018
 IMPORT-NEXT:   ImportAddressTable: 0x3020
-IMPORT-NEXT:   ImportNameTable: 0x2050
+IMPORT-NEXT:   ImportNameTable: 0x2048
 IMPORT-NEXT:   BoundDelayImportTable: 0x0
 IMPORT-NEXT:   UnloadDelayImportTable: 0x0
 IMPORT-NEXT:   Import {
@@ -44,22 +44,18 @@ BASEREL-NEXT:   }
 UNWIND:      UnwindInformation [
 UNWIND-NEXT:   RuntimeFunction {
 UNWIND-NEXT:     StartAddress: (0x14000108A)
-UNWIND-NEXT:     EndAddress: (0x1400010DD)
+UNWIND-NEXT:     EndAddress: (0x140001101)
 UNWIND-NEXT:     UnwindInfoAddress: (0x140002000)
 UNWIND-NEXT:     UnwindInfo {
 UNWIND-NEXT:       Version: 1
 UNWIND-NEXT:       Flags [ (0x0)
 UNWIND-NEXT:       ]
-UNWIND-NEXT:       PrologSize: 10
+UNWIND-NEXT:       PrologSize: 24
 UNWIND-NEXT:       FrameRegister: -
 UNWIND-NEXT:       FrameOffset: -
-UNWIND-NEXT:       UnwindCodeCount: 5
+UNWIND-NEXT:       UnwindCodeCount: 1
 UNWIND-NEXT:       UnwindCodes [
-UNWIND-NEXT:         0x0A: ALLOC_SMALL size=72
-UNWIND-NEXT:         0x06: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x04: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x02: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x01: ALLOC_SMALL size=8
+UNWIND-NEXT:         0x18: ALLOC_SMALL size=104
 UNWIND-NEXT:       ]
 UNWIND-NEXT:     }
 UNWIND-NEXT:   }
diff --git a/lld/test/COFF/delayimporttables.yaml b/lld/test/COFF/delayimporttables.yaml
index cf54c0a7140a..ff6681257c83 100644
--- a/lld/test/COFF/delayimporttables.yaml
+++ b/lld/test/COFF/delayimporttables.yaml
@@ -15,7 +15,7 @@
 # CHECK-NEXT:   Attributes: 0x1
 # CHECK-NEXT:   ModuleHandle: 0x3000
 # CHECK-NEXT:   ImportAddressTable: 0x3010
-# CHECK-NEXT:   ImportNameTable: 0x2070
+# CHECK-NEXT:   ImportNameTable: 0x2068
 # CHECK-NEXT:   BoundDelayImportTable: 0x0
 # CHECK-NEXT:   UnloadDelayImportTable: 0x0
 # CHECK-NEXT:   Import {
@@ -32,16 +32,16 @@
 # CHECK-NEXT:   Attributes: 0x1
 # CHECK-NEXT:   ModuleHandle: 0x3008
 # CHECK-NEXT:   ImportAddressTable: 0x3028
-# CHECK-NEXT:   ImportNameTable: 0x2088
+# CHECK-NEXT:   ImportNameTable: 0x2080
 # CHECK-NEXT:   BoundDelayImportTable: 0x0
 # CHECK-NEXT:   UnloadDelayImportTable: 0x0
 # CHECK-NEXT:   Import {
 # CHECK-NEXT:     Symbol: left (0)
-# CHECK-NEXT:     Address: 0x1400010B8
+# CHECK-NEXT:     Address: 0x1400010DC
 # CHECK-NEXT:   }
 # CHECK-NEXT:   Import {
 # CHECK-NEXT:     Symbol: right (0)
-# CHECK-NEXT:     Address: 0x1400010C4
+# CHECK-NEXT:     Address: 0x1400010E8
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 
diff --git a/lld/test/COFF/dtlto/files.test b/lld/test/COFF/dtlto/files.test
new file mode 100644
index 000000000000..4297adac9bbf
--- /dev/null
+++ b/lld/test/COFF/dtlto/files.test
@@ -0,0 +1,71 @@
+REQUIRES: x86
+
+## Test that the LLD options /lldsavetemps and -thinlto-emit-imports-files
+## function correctly with DTLTO. We also check that index files
+## (-thinlto-emit-index-files) are not emitted with DTLTO.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: sed 's/@t1/@t2/g' t1.ll > t2.ll
+
+## Generate ThinLTO bitcode files. Note that t3.bc will not be used by the
+## linker.
+RUN: opt -thinlto-bc t1.ll -o t1.bc
+RUN: opt -thinlto-bc t2.ll -o t2.bc
+RUN: cp t1.bc t3.bc
+
+## Generate object files for mock.py to return.
+RUN: llc t1.ll --filetype=obj -o t1.obj
+RUN: llc t2.ll --filetype=obj -o t2.obj
+
+## Create response file containing shared ThinLTO linker arguments.
+## -start-lib/-end-lib is used to test the special case where unused lazy
+## bitcode inputs result in empty index/imports files.
+## Note that mock.py does not do any compilation; instead, it simply writes
+## the contents of the object files supplied on the command line into the
+## output object files in job order.
+RUN: echo "/entry:t1 /subsystem:console \
+RUN:   t1.bc t2.bc -start-lib t3.bc -end-lib /out:my.exe \
+RUN:   -thinlto-distributor:\"%python\" \
+RUN:   -thinlto-distributor-arg:\"%llvm_src_root/utils/dtlto/mock.py\" \
+RUN:   -thinlto-distributor-arg:t1.obj \
+RUN:   -thinlto-distributor-arg:t2.obj \
+RUN:   -thinlto-remote-compiler:fake.exe" > l.rsp
+
+## Check that without extra flags, no index/imports files are produced and
+## backend temp files are removed.
+RUN: lld-link @l.rsp
+RUN: ls | FileCheck %s \
+RUN:   --check-prefixes=NOBACKEND,NOOTHERS
+
+## Check that with /lldsavetemps and -thinlto-emit-imports-files, backend
+## temporary files are retained and no index/imports files are produced.
+RUN: rm -f *.imports *.thinlto.bc
+RUN: lld-link @l.rsp  /lldsavetemps -thinlto-emit-imports-files
+RUN: ls | sort | FileCheck %s \
+RUN:   --check-prefixes=BACKEND,NOOTHERS
+
+## JSON jobs description, retained with /lldsavetemps.
+## Note that DTLTO temporary files include a PID component.
+NOBACKEND-NOT: {{^}}my.[[#]].dist-file.json{{$}}
+BACKEND:       {{^}}my.[[#]].dist-file.json{{$}}
+
+## Index/imports files for t1.bc.
+NOOTHERS-NOT: {{^}}t1.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t1.bc.thinlto.bc{{$}}
+
+## Index/imports files for t2.bc.
+NOOTHERS-NOT: {{^}}t2.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t2.bc.thinlto.bc{{$}}
+
+## Empty index/imports files for unused t3.bc.
+NOOTHERS-NOT: {{^}}t3.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t3.bc.thinlto.bc{{$}}
+
+#--- t1.ll
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @t1() {
+  ret void
+}
diff --git a/lld/test/COFF/dtlto/options.test b/lld/test/COFF/dtlto/options.test
new file mode 100644
index 000000000000..023ecd235910
--- /dev/null
+++ b/lld/test/COFF/dtlto/options.test
@@ -0,0 +1,56 @@
+REQUIRES: x86
+
+## Test that DTLTO-specific options are handled correctly.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: opt -thinlto-bc foo.ll -o foo.obj
+
+## Not specifying a value for -thinlto-remote-compiler should result in an
+## error if -thinlto-distributor is specified.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:fake.exe 2>&1 | FileCheck %s --check-prefix=COMPILER
+RUN: lld-link /entry:foo /subsystem:console foo.obj /out:my.exe
+
+## Specifying an empty value for -thinlto-remote-compiler should result in an
+## error if -thinlto-distributor is specified.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:fake.exe \
+RUN:   -thinlto-remote-compiler:"" 2>&1 | FileCheck %s --check-prefix=COMPILER
+RUN: lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-remote-compiler:""
+
+COMPILER: error: A value must be specified for /thinlto-remote-compiler if /thinlto-distributor is specified.
+
+## Test that DTLTO options are passed correctly to the distributor and
+## remote compiler.
+## Note: validate.py does not perform any compilation. Instead, it validates the
+## received JSON, pretty-prints the JSON and the supplied arguments, and then
+## exits with an error. This allows FileCheck directives to verify the
+## distributor inputs.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:%python \
+RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/validate.py \
+RUN:   -thinlto-distributor-arg:darg1=10 \
+RUN:   -thinlto-distributor-arg:darg2=20 \
+RUN:   -thinlto-remote-compiler:my_clang.exe \
+RUN:   -thinlto-remote-compiler-arg:carg1=20 \
+RUN:   -thinlto-remote-compiler-arg:carg2=30 2>&1 | FileCheck %s
+
+CHECK: distributor_args=['darg1=10', 'darg2=20']
+
+CHECK: "linker_output": "my.exe"
+
+CHECK: "my_clang.exe"
+CHECK: "carg1=20"
+CHECK: "carg2=30"
+
+CHECK: error: DTLTO backend compilation: cannot open native object file:
+
+#--- foo.ll
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @foo() {
+  ret void
+}
diff --git a/lld/test/COFF/giats.s b/lld/test/COFF/giats.s
index f870429f39d8..c0442107d3ae 100644
--- a/lld/test/COFF/giats.s
+++ b/lld/test/COFF/giats.s
@@ -37,14 +37,14 @@
 
 # DELAY-CHECK: ImageBase: 0x140000000
 # DELAY-CHECK: LoadConfig [
-# DELAY-CHECK:   GuardCFFunctionTable: 0x140002124
+# DELAY-CHECK:   GuardCFFunctionTable: 0x14000211C
 # DELAY-CHECK:   GuardCFFunctionCount: 2
 # DELAY-CHECK:   GuardFlags [ (0x10500)
 # DELAY-CHECK:     CF_FUNCTION_TABLE_PRESENT (0x400)
 # DELAY-CHECK:     CF_INSTRUMENTED (0x100)
 # DELAY-CHECK:     CF_LONGJUMP_TABLE_PRESENT (0x10000)
 # DELAY-CHECK:   ]
-# DELAY-CHECK:   GuardAddressTakenIatEntryTable: 0x14000212C
+# DELAY-CHECK:   GuardAddressTakenIatEntryTable: 0x140002124
 # DELAY-CHECK:   GuardAddressTakenIatEntryCount: 1
 # DELAY-CHECK: ]
 # DELAY-CHECK:      GuardFidTable [
diff --git a/lld/test/COFF/imported-dllmain-i386.test b/lld/test/COFF/imported-dllmain-i386.test
new file mode 100644
index 000000000000..f8aa09006999
--- /dev/null
+++ b/lld/test/COFF/imported-dllmain-i386.test
@@ -0,0 +1,58 @@
+REQUIRES: x86
+RUN: split-file %s %t.dir && cd %t.dir
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows a.s -o a.obj
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows b1.s -o b1.obj
+RUN: llvm-mc -filetype=obj -triple=i386-windows b2.s -o b2.obj
+
+### This is the line where our problem occurs. Here, we export the DllMain symbol, which shouldn't happen normally.
+RUN: lld-link b1.obj b2.obj -out:b.dll -dll -implib:b.lib -entry:DllMain -export:bar -export:DllMain -safeseh:no
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows c.s -o c.obj
+RUN: lld-link -lib c.obj -out:c.lib
+
+### Later, if b.lib is provided before other libs/objs that export DllMain statically, we previously were using the dllimported DllMain from b.lib, which is wrong.
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -safeseh:no 2>&1 | FileCheck -check-prefix=WARN %s
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:importeddllmain -safeseh:no 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
+RUN: llvm-objdump --private-headers -d out.dll | FileCheck -check-prefix=DISASM %s
+
+WARN: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+IGNORED-NOT: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+
+DISASM: The Import Tables:
+DISASM: DLL Name: b.dll
+DISASM-NOT: DllMain
+DISASM: bar
+DISASM: Disassembly of section .text:
+DISASM:      b0 01                         movb    $0x1, %al
+DISASM-NEXT: c3                            retl
+
+#--- a.s
+        .text
+        .globl _foo
+_foo:
+        call *__imp__bar
+        ret
+
+#--- b1.s
+        .text
+        .globl _bar
+_bar:
+        ret
+
+#--- b2.s
+        .intel_syntax noprefix
+        .text
+        .globl _DllMain
+_DllMain:
+        xor al, al
+        ret
+
+#--- c.s
+        .intel_syntax noprefix
+        .text
+        .globl _DllMain
+_DllMain:
+        mov al, 1 
+        ret
diff --git a/lld/test/COFF/exported-dllmain.test b/lld/test/COFF/imported-dllmain.test
similarity index 86%
rename from lld/test/COFF/exported-dllmain.test
rename to lld/test/COFF/imported-dllmain.test
index fcf6ed100537..fa8579b1b41c 100644
--- a/lld/test/COFF/exported-dllmain.test
+++ b/lld/test/COFF/imported-dllmain.test
@@ -14,11 +14,11 @@ RUN: lld-link -lib c.obj -out:c.lib
 
 ### Later, if b.lib is provided before other libs/objs that export DllMain statically, we previously were using the dllimported DllMain from b.lib, which is wrong.
 RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain 2>&1 | FileCheck -check-prefix=WARN %s
-RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:exporteddllmain 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:importeddllmain 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
 RUN: llvm-objdump --private-headers -d out.dll | FileCheck -check-prefix=DISASM %s
 
-WARN: lld-link: warning: b.lib: skipping exported DllMain symbol [exporteddllmain]
-IGNORED-NOT: lld-link: warning: b.lib: skipping exported DllMain symbol [exporteddllmain]
+WARN: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+IGNORED-NOT: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
 
 DISASM: The Import Tables:
 DISASM: DLL Name: b.dll
diff --git a/lld/test/COFF/pdb-empty-sec.s b/lld/test/COFF/pdb-empty-sec.s
new file mode 100644
index 000000000000..0d61447b7665
--- /dev/null
+++ b/lld/test/COFF/pdb-empty-sec.s
@@ -0,0 +1,19 @@
+// REQUIRES: x86
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: lld-link -dll -noentry -debug %t.obj -out:%t.dll
+// RUN: llvm-pdbutil dump -publics %t.pdb | FileCheck %s
+
+// CHECK:       Records
+// CHECK-NEXT:       0 | S_PUB32 [size = 20] `func`
+// CHECK-NEXT:           flags = none, addr = 0001:0000
+// CHECK-NEXT:      20 | S_PUB32 [size = 20] `sym`
+// CHECK-NEXT:           flags = none, addr = 0000:0000
+
+        .globl sym
+        .data
+sym:
+        .text
+        .globl func
+func:
+        ret
diff --git a/lld/test/COFF/thin-archive.s b/lld/test/COFF/thin-archive.s
index 55d71ea63567..7fab10c2b57b 100644
--- a/lld/test/COFF/thin-archive.s
+++ b/lld/test/COFF/thin-archive.s
@@ -22,23 +22,34 @@
 # SYMTAB:        ?f@@YAHXZ in
 # NO-SYMTAB-NOT: ?f@@YAHXZ in
 
-# RUN: lld-link /entry:main %t.main.obj %t.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: lld-link /entry:main %t.main.obj %t_thin.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: lld-link /entry:main %t.main.obj /wholearchive:%t_thin.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
+# RUN: echo "/entry:main \"%t.main.obj\" /out:\"%t.exe\"" > %t.rsp
+
+# RUN: lld-link @%t.rsp %t.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_NON_THIN
+# RUN: lld-link @%t.rsp %t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_SYM
+# RUN: lld-link @%t.rsp /wholearchive:%t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_WHOLE
+# RUN: lld-link @%t.rsp /wholearchive %t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_WHOLE
+
+# LOAD_NON_THIN:   Loaded {{.*}}.lib({{.*}}.obj) for int __cdecl f(void)
+# LOAD_THIN_SYM:   Loaded {{.*}}.obj for int __cdecl f(void)
+# LOAD_THIN_WHOLE: Loaded {{.*}}.obj for <whole-archive>
 
 # RUN: rm %t.lib.obj
-# RUN: lld-link /entry:main %t.main.obj %t.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: env LLD_IN_TEST=1 not lld-link /entry:main %t.main.obj %t_thin.lib \
-# RUN:     /out:%t.exe 2>&1 | FileCheck --check-prefix=NOOBJ %s
-# RUN: env LLD_IN_TEST=1 not lld-link /entry:main %t.main.obj %t_thin.lib /out:%t.exe \
-# RUN:     /demangle:no 2>&1 | FileCheck --check-prefix=NOOBJNODEMANGLE %s
-
-# CHECK-NOT: error: could not get the buffer for the member defining
+# RUN: lld-link @%t.rsp %t.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=ERR --allow-empty
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp %t_thin.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJ
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp /wholearchive:%t_thin.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJWHOLE
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp %t_thin.lib /demangle:no 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJNODEMANGLE
+
+# ERR-NOT: error: could not get the buffer for the member defining
 # NOOBJ: error: could not get the buffer for the member defining symbol int __cdecl f(void): {{.*}}.lib({{.*}}.lib.obj):
+# NOOBJWHOLE: error: {{.*}}.lib: could not get the buffer for a child of the archive: '{{.*}}.obj'
 # NOOBJNODEMANGLE: error: could not get the buffer for the member defining symbol ?f@@YAHXZ: {{.*}}.lib({{.*}}.lib.obj):
 
 	.text
diff --git a/lld/test/ELF/dtlto/archive-thin.test b/lld/test/ELF/dtlto/archive-thin.test
new file mode 100644
index 000000000000..df3c2aadb06e
--- /dev/null
+++ b/lld/test/ELF/dtlto/archive-thin.test
@@ -0,0 +1,65 @@
+REQUIRES: x86
+
+## Test that a DTLTO link assigns Module IDs to thin archive members as expected.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: sed 's/@t1/@t2/g' t1.ll > t2.ll
+RUN: sed 's/@t1/@t3/g' t1.ll > t3.ll
+
+RUN: opt -thinlto-bc t1.ll -o t1.bc
+RUN: opt -thinlto-bc t2.ll -o t2.bc
+RUN: opt -thinlto-bc t3.ll -o t3.bc
+
+RUN: llvm-ar rcs t1.a t1.bc --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../t2.bc".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/t2.a t2.bc --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs t3.a %t/lib/../t3.bc --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+## Build a response file to share common linking arguments.
+## Note: validate.py does not perform any compilation. Instead, it validates the
+## received JSON, pretty-prints the JSON and the supplied arguments, and then
+## exits with an error. This allows FileCheck directives to verify the
+## distributor inputs.
+RUN: echo "%t/t1.a %t/lib/t2.a ../t3.a \
+RUN:   --thinlto-distributor=\"%python\" \
+RUN:   --thinlto-distributor-arg=\"%llvm_src_root/utils/dtlto/validate.py\"" > rsp
+
+## Link thin archives using -u/--undefined.
+RUN: not ld.lld @rsp -u t1 -u t2 -u t3 2>&1 | FileCheck %s
+
+## Link thin archives using --whole-archive.
+RUN: not ld.lld --whole-archive @rsp 2>&1 | FileCheck %s
+
+## Check the module IDs in the JSON jobs description.
+CHECK: "jobs": [
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t1.bc"
+
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]\:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t2.bc"
+
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t3.bc"
+
+## Ensure backend compilation fails as expected (due to validate.py dummy behavior).
+CHECK: error: DTLTO backend compilation: cannot open native object file:
+
+#--- t1.ll
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @t1() {
+  ret void
+}
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
deleted file mode 100644
index 53860b5daf2b..000000000000
--- a/lld/test/ELF/hexagon-jump-error.s
+++ /dev/null
@@ -1,32 +0,0 @@
-# REQUIRES: hexagon
-# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-## Use --threads=1 to keep emitted warnings across sections sequential.
-# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
-
-	.globl	_start
-	.type	_start, @function
-_start:
-
-# CHECK: relocation R_HEX_B9_PCREL out of range: 1028 is not in [-1024, 1023]
-{r0 = #0; jump #1f}
-.space (1<<10)
-.section b9, "ax"
-1:
-
-# CHECK: relocation R_HEX_B13_PCREL out of range: 16388 is not in [-16384, 16383]
-if (r0==#0) jump:t #1f
-.space (1<<14)
-.section b13, "ax"
-1:
-
-# CHECK: relocation R_HEX_B15_PCREL out of range: 65540 is not in [-65536, 65535]
-if (p0) jump #1f
-.space (1<<16)
-.section b15, "ax"
-1:
-
-# CHECK: relocation R_HEX_B22_PCREL out of range: 8388612 is not in [-8388608, 8388607]
-jump #1f
-.space (1<<23)
-.section b22, "ax"
-1:
diff --git a/lld/test/ELF/hexagon-plt.s b/lld/test/ELF/hexagon-plt.s
index 679de82923a7..780dc434a669 100644
--- a/lld/test/ELF/hexagon-plt.s
+++ b/lld/test/ELF/hexagon-plt.s
@@ -30,31 +30,31 @@
 # DIS:      <_start>:
 ## Direct call
 ## Call foo directly
-# DIS-NEXT:   { call 0x2003c }
+# DIS-NEXT:   { call 0x2003c <foo> }
 ## Call bar via plt
-# DIS-NEXT:   { call 0x20060 }
+# DIS-NEXT:   { call 0x20060 <bar@plt> }
 ## Call weak via plt
-# DIS-NEXT:   { call 0x20070 }
+# DIS-NEXT:   { call 0x20070 <weak@plt> }
 # DIS-NEXT: { 	immext(#0)
 
 ## Call foo directly
-# DIS-NEXT: if (p0) jump:nt 0x2003c }
+# DIS-NEXT: if (p0) jump:nt 0x2003c <foo> }
 # DIS-NEXT: { 	immext(#64)
 ## Call bar via plt
-# DIS-NEXT: if (p0) jump:nt 0x20060 }
+# DIS-NEXT: if (p0) jump:nt 0x20060 <bar@plt> }
 # DIS-NEXT: { 	immext(#64)
 ## Call weak via plt
-# DIS-NEXT: if (p0) jump:nt 0x20070 }
+# DIS-NEXT: if (p0) jump:nt 0x20070 <weak@plt> }
 # DIS-NEXT: { 	immext(#0)
 
 ## Call foo directly
-# DIS-NEXT: r0 = #0 ; jump 0x2003c }
+# DIS-NEXT: r0 = #0 ; jump 0x2003c <foo> }
 # DIS-NEXT: { 	immext(#0)
 ## Call bar via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20060 }
+# DIS-NEXT: r0 = #0 ; jump 0x20060 <bar@plt> }
 # DIS-NEXT: { 	immext(#0)
 ## Call weak via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20070 }
+# DIS-NEXT: r0 = #0 ; jump 0x20070 <weak@plt> }
 
 # DIS:      <foo>:
 # DIS-NEXT:   2003c:
diff --git a/lld/test/ELF/hexagon-shared.s b/lld/test/ELF/hexagon-shared.s
index cc62662d278e..7f7390f1fa8d 100644
--- a/lld/test/ELF/hexagon-shared.s
+++ b/lld/test/ELF/hexagon-shared.s
@@ -88,7 +88,7 @@ pvar:
 # PLT-NEXT: jumpr r28 }
 
 # TEXT:  bc 00 01 00 000100bc
-# TEXT: { 	call 0x10300 }
+# TEXT: { 	call 0x10300 <bar@plt> }
 # TEXT: if (p0) jump:nt 0x10300
 # TEXT: r0 = #0 ; jump 0x10300
 # TEXT: r0 = add(r1,##-65548)
diff --git a/lld/test/ELF/hexagon-thunk-range-b22rel.s b/lld/test/ELF/hexagon-thunk-range-b22rel.s
new file mode 100644
index 000000000000..08e37bf0a555
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-b22rel.s
@@ -0,0 +1,115 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld main.o -o test
+# RUN: llvm-objdump -d --no-show-raw-insn test | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_B22_PCREL relocations.
+## R_HEX_B22_PCREL has a range of +/- 8MB (0x800000 bytes).
+
+#--- main.s
+.globl _start
+.type _start, %function
+_start:
+  call target_within_range_max
+  call target_beyond_range
+  call target_within_range_min
+  call target_beyond_range_min
+  call target_multiple_calls
+  call target_multiple_calls
+  call target_close
+  jumpr r31
+
+target_close:
+  jumpr r31
+
+## Target at maximum positive range (8MB - 4 bytes from _start)
+## We need to account for the instructions above: 7 calls + 1 jumpr = 8 * 4 = 32 bytes
+.skip 0X7fffbc
+.globl target_within_range_max
+.type target_within_range_max, %function
+target_within_range_max:
+  jumpr r31
+
+## Target just beyond maximum positive range (needs thunk)
+.skip 8
+.globl target_beyond_range
+.type target_beyond_range, %function
+target_beyond_range:
+  call target_within_range_max
+  jumpr r31
+
+## Target for multiple calls test
+.skip 0x100000
+.globl target_multiple_calls
+.type target_multiple_calls, %function
+target_multiple_calls:
+  jumpr r31
+
+## Now place targets at maximum negative range
+## We'll put these before _start in memory layout
+.section .text_negative, "ax", %progbits
+
+## Target at maximum negative range (-8MB + 4 bytes from _start)
+.globl target_within_range_min
+.type target_within_range_min, %function
+target_within_range_min:
+  call target_close
+  jumpr r31
+
+.skip 0X7ffff4
+
+## Target beyond maximum negative range (needs thunk)
+.globl target_beyond_range_min
+.type target_beyond_range_min, %function
+target_beyond_range_min:
+  jumpr r31
+
+## Verify thunk generation for targets beyond B22_PCREL range
+# CHECK:       <__hexagon_thunk_target_within_range_min_from_.text.thunk>:
+# CHECK-NEXT:    200b4: { immext(#0x900000)
+# CHECK-NEXT:             jump 0x9200cc <target_within_range_min> }
+
+# CHECK:       <__hexagon_thunk_target_beyond_range_min_from_.text.thunk>:
+# CHECK-NEXT:    200bc: { immext(#0x1100000)
+# CHECK-NEXT:             jump 0x11200c8 <target_beyond_range_min> }
+
+# CHECK:       <__hexagon_thunk_target_multiple_calls_from_.text.thunk>:
+# CHECK-NEXT:    200c4: { immext(#0x8fffc0)
+# CHECK-NEXT:             jump 0x9200c0 <target_multiple_calls> }
+
+## Verify _start calls - some direct, some via thunks
+# CHECK:       <_start>:
+# CHECK-NEXT:    200cc: { call 0x8200ac <target_within_range_max> }
+# CHECK-NEXT:           { call 0x8200b8 <target_beyond_range> }
+# CHECK-NEXT:           { call 0x200b4 <__hexagon_thunk_target_within_range_min_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200bc <__hexagon_thunk_target_beyond_range_min_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200c4 <__hexagon_thunk_target_multiple_calls_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200c4 <__hexagon_thunk_target_multiple_calls_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200ec <target_close> }
+
+# CHECK:      <target_close>:
+# CHECK-NEXT:    200ec: { jumpr r31 }
+
+## Verify targets at maximum positive range (direct calls, no thunks needed)
+# CHECK:      <target_within_range_max>:
+# CHECK-NEXT:  8200ac: { jumpr r31 }
+
+# CHECK:      <target_beyond_range>:
+# CHECK-NEXT:  8200b8: { call 0x8200ac <target_within_range_max> }
+# CHECK-NEXT:          { jumpr r31 }
+
+# CHECK:      <target_multiple_calls>:
+# CHECK-NEXT:  9200c0: { jumpr r31 }
+
+## Verify targets in negative section and thunk for calling back to main section
+# CHECK:      <__hexagon_thunk__from_.text.thunk>:
+# CHECK-NEXT:  9200c4: { immext(#0xff700000)
+# CHECK-NEXT:            jump 0x200cc <_start> }
+
+# CHECK:      <target_within_range_min>:
+# CHECK-NEXT:  9200cc: { call 0x9200c4 <__hexagon_thunk__from_.text.thunk> }
+# CHECK-NEXT:          { jumpr r31 }
+
+# CHECK:      <target_beyond_range_min>:
+# CHECK-NEXT: 11200c8: { jumpr r31 }
diff --git a/lld/test/ELF/hexagon-thunk-range-gdplt.s b/lld/test/ELF/hexagon-thunk-range-gdplt.s
new file mode 100644
index 000000000000..77fd0e575456
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-gdplt.s
@@ -0,0 +1,95 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld -shared main.o -o test.so
+# RUN: llvm-objdump -d --no-show-raw-insn test.so | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_GD_PLT_B22_PCREL relocations.
+## Same ±8MB range as regular calls.
+
+#--- main.s
+.globl _start
+.type _start, @function
+_start:
+  ## Setup for TLS Global Dynamic calls
+  r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+
+  ## Test TLS GD PLT calls
+  r0 = add(r2,##tls_var_close@GDGOT)
+  call tls_var_close@GDPLT
+
+  r0 = add(r2,##tls_var_far@GDGOT)
+  call tls_var_far@GDPLT
+
+  jumpr r31
+
+.skip 0x400000
+
+more_code:
+  r0 = add(r2,##tls_var_distant@GDGOT)
+  call tls_var_distant@GDPLT
+  jumpr r31
+
+## TLS variables in .tdata section
+.section .tdata,"awT",@progbits
+.globl tls_var_close, tls_var_far, tls_var_distant
+.type tls_var_close, @object
+.type tls_var_far, @object
+.type tls_var_distant, @object
+
+tls_var_close:
+  .word 0x1234
+
+tls_var_far:
+  .word 0x5678
+
+tls_var_distant:
+  .word 0x9abc
+
+# CHECK: Disassembly of section .text:
+# CHECK:     <_start>:
+# CHECK-NEXT:   102d4:  { immext(#0x420100)
+# CHECK-NEXT:      r2 = add(pc,##0x420130) }
+# CHECK-NEXT:    { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10018) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10010) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+# CHECK:     <more_code>:
+# CHECK-NEXT:   4102f8:  { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10008) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+## Verify PLT entries are created for TLS
+# CHECK: Disassembly of section .plt:
+# CHECK:      <.plt>:
+# CHECK-NEXT:   410310:  { immext(#0x200c0)
+# CHECK-NEXT:      r28 = add(pc,##0x200f4) }
+# CHECK-NEXT:    { r14 -= add(r28,#0x10)
+# CHECK-NEXT:      r15 = memw(r28+#0x8)
+# CHECK-NEXT:      r28 = memw(r28+#0x4) }
+# CHECK-NEXT:    { r14 = asr(r14,#0x2)
+# CHECK-NEXT:      jumpr r28 }
+# CHECK-NEXT:    { trap0(#0xdb) }
+
+# CHECK:      <tls_var_far@plt>:
+# CHECK-NEXT:   410340:  { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200d8) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <tls_var_distant@plt>:
+# CHECK-NEXT:   410350:  { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200cc) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <__tls_get_addr@plt>:
+# CHECK-NEXT:   410360: { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200c0) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
diff --git a/lld/test/ELF/hexagon-thunk-range-plt.s b/lld/test/ELF/hexagon-thunk-range-plt.s
new file mode 100644
index 000000000000..3a8f50b681d8
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-plt.s
@@ -0,0 +1,75 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf external.s -o external.o
+# RUN: ld.lld -shared external.o -soname external.so -o external.so
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld main.o external.so -o test
+# RUN: llvm-objdump -d --no-show-raw-insn test | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_PLT_B22_PCREL relocations.
+## PLT calls use the same ±8MB range as regular calls but go through PLT entries.
+## This test verifies thunk generation for PLT calls at range boundaries.
+
+#--- external.s
+.globl extern_within_range, extern_beyond_range, extern_close
+.type extern_within_range, @function
+.type extern_beyond_range, @function
+.type extern_close, @function
+
+extern_within_range:
+  jumpr r31
+
+extern_beyond_range:
+  jumpr r31
+
+extern_close:
+  jumpr r31
+
+#--- main.s
+.globl _start
+.type _start, @function
+_start:
+  ## Test PLT calls to external functions at various ranges
+  call extern_within_range@PLT
+  call extern_beyond_range@PLT
+  call extern_close@PLT
+  jumpr r31
+
+.skip 0x200000
+
+# CHECK: Disassembly of section .text:
+# CHECK:     <_start>:
+# CHECK-NEXT:  2021c:  { call 0x220250 <extern_within_range@plt> }
+# CHECK-NEXT:    { call 0x220260 <extern_beyond_range@plt> }
+# CHECK-NEXT:    { call 0x220270 <extern_close@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+## Verify PLT header and entries are created with exact addresses
+# CHECK: Disassembly of section .plt:
+# CHECK:      <.plt>:
+# CHECK-NEXT:   220230:  { immext(#0x20080)
+# CHECK-NEXT:      r28 = add(pc,##0x200b8) }
+# CHECK-NEXT:    { r14 -= add(r28,#0x10)
+# CHECK-NEXT:      r15 = memw(r28+#0x8)
+# CHECK-NEXT:      r28 = memw(r28+#0x4) }
+# CHECK-NEXT:    { r14 = asr(r14,#0x2)
+# CHECK-NEXT:      jumpr r28 }
+# CHECK-NEXT:    { trap0(#0xdb) }
+
+# CHECK:      <extern_within_range@plt>:
+# CHECK-NEXT:   220250:  { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x200a8) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <extern_beyond_range@plt>:
+# CHECK-NEXT:   220260: { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x2009c) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <extern_close@plt>:
+# CHECK-NEXT:   220270:  { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x20090) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
diff --git a/lld/test/ELF/hexagon-thunks-packets.s b/lld/test/ELF/hexagon-thunks-packets.s
new file mode 100644
index 000000000000..c8aaad4341ff
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunks-packets.s
@@ -0,0 +1,122 @@
+# REQUIRES: hexagon
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-linux-musl %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-objdump -d %t 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-NONPIC,CHECK %s
+# RUN: llvm-mc -filetype=obj \
+# RUN:         -triple=hexagon-unknown-linux-musl %s -o %t.o
+# RUN: ld.lld --pie %t.o -o %t
+# RUN: llvm-objdump -d %t 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-PIC,CHECK %s
+
+## Packets with pc-relative relocations are more interesting because the
+## offset must be relative to the start of the source and destination
+## packets, not necessarily the instruction word containing the jump/call.
+
+# CHECK:  Disassembly of section .text:
+
+# CHECK-NONPIC: 000200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>:
+# CHECK-NONPIC: { immext(#0x1000040)
+# CHECK-NONPIC:   jump 0x1020110 <myfn_a> }
+
+# CHECK-PIC:    00010150 <__hexagon_thunk_myfn_a_from_.text.thunk>:
+# CHECK-PIC-NEXT:    { immext(#0x1000040)
+# CHECK-PIC-NEXT:      r14 = add(pc,##0x1000060) }
+# CHECK-PIC-NEXT:    { jumpr r14 }
+
+# CHECK-NONPIC: 000200bc <myfn_b>:
+# CHECK-NONPIC: { jumpr r31 }
+# CHECK-PIC:    0001015c <myfn_b>:
+# CHECK-PIC:    { jumpr r31 }
+    .globl myfn_b
+    .type  myfn_b, @function
+myfn_b:
+    jumpr r31
+    .size  myfn_b, .-myfn_b
+
+# CHECK-PIC:    00010160 <main>:
+    .globl main
+    .type  main, @function
+main:
+    { r0 = #0
+      call myfn_a }
+# CHECK-PIC:      { call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC:   { call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:       r0 = #0x0 }
+    call myfn_a
+# CHECK-PIC:    call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC: call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+    call myfn_b
+# CHECK-PIC-NEXT:    call 0x1015c <myfn_b>
+# CHECK-NONPIC-NEXT: call 0x200bc <myfn_b>
+
+    { r2 = add(r0, r1)
+      if (p0) call #myfn_b
+      if (!p0) call #myfn_a }
+# CHECK-PIC-NEXT:     { if (p0) call 0x1015c <myfn_b>
+# CHECK-PIC-NEXT:       if (!p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:  { if (p0) call 0x200bc <myfn_b>
+# CHECK-NONPIC-NEXT:    if (!p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+
+# CHECK-NEXT:       r2 = add(r0,r1) }
+
+    { r2 = add(r0, r1)
+      if (p0) call #myfn_a
+      if (!p0) call #myfn_a }
+# CHECK-PIC-NEXT:  { if (p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-PIC-NEXT:    if (!p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:  { if (p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:    if (!p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:           r2 = add(r0,r1) }
+
+    { r2 = add(r0, r1)
+      r1 = r4
+      r4 = r5
+      if (r0 == #0) jump:t #myfn_a }
+# CHECK-PIC-NEXT:     { if (r0==#0) jump:t 0x10150
+# CHECK-NONPIC-NEXT:  { if (r0==#0) jump:t 0x200b4
+# CHECK-NEXT:    r2 = add(r0,r1)
+# CHECK-NEXT:    r1 = r4; r4 = r5 }
+
+    { r2 = add(r0, r1)
+      r4 = r5
+      if (r0 <= #0) jump:t #myfn_a
+      p1 = cmp.eq(r0, #0); if (p1.new) jump:nt #myfn_a }
+# CHECK-NONPIC-NEXT:  { if (r0<=#0) jump:t 0x200b4
+# CHECK-NONPIC-NEXT:    p1 = cmp.eq(r0,#0x0); if (p1.new) jump:nt 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-PIC-NEXT:     { if (r0<=#0) jump:t 0x10150
+# CHECK-PIC-NEXT:       p1 = cmp.eq(r0,#0x0); if (p1.new) jump:nt 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:           r2 = add(r0,r1)
+# CHECK-NEXT:           r4 = r5 }
+
+    {r0 = #0; jump #myfn_a}
+# CHECK-PIC-NEXT:    { r0 = #0x0 ; jump 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk> }
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk> }
+    {r0 = #0; jump #myfn_b}
+# CHECK-PIC-NEXT:    { r0 = #0x0 ; jump 0x1015c <myfn_b> }
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x200bc <myfn_b> }
+    jumpr r31
+    .size   main, .-main
+
+    .section .text.foo
+    .skip 0x1000000
+
+    .globl myfn_a
+    .type  myfn_a, @function
+myfn_a:
+    {r0 = #0; jump #myfn_b}
+    jumpr r31
+    .size  myfn_a, .-myfn_a
+
+# CHECK-NONPIC: 01020110 <myfn_a>:
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x1020118 <__hexagon_thunk_myfn_b_from_.text.thunk> }
+# CHECK-NONPIC-NEXT: { jumpr r31 }
+
+# CHECK-NONPIC: 01020118 <__hexagon_thunk_myfn_b_from_.text.thunk>:
+# CHECK-NONPIC-NEXT: { immext(#0xfeffff80)
+# CHECK-NONPIC-NEXT:   jump 0x200bc <myfn_b> }
+
+# CHECK-PIC:    010101b8 <__hexagon_thunk_myfn_b_from_.text.thunk>:
+# CHECK-PIC-NEXT:    { immext(#0xfeffff80)
+# CHECK-PIC-NEXT:      r14 = add(pc,##0xfeffffa4) }
+# CHECK-PIC-NEXT:    { jumpr r14 }
diff --git a/lld/test/ELF/hexagon-thunks.s b/lld/test/ELF/hexagon-thunks.s
new file mode 100644
index 000000000000..211074e1784b
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunks.s
@@ -0,0 +1,53 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %t/a.s -o %t/a.o
+# RUN: ld.lld -T %t/lds %t/a.o -o %t/a
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-NONPIC,CHECK %s
+# RUN: llvm-mc -filetype=obj \
+# RUN:         -triple=hexagon-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld -T %t/lds --pie %t/a.o -o %t/a
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-PIC,CHECK %s
+
+#--- a.s
+.section .text_low, "ax", %progbits
+    .globl main
+    .type  main, @function
+main:
+    call myfn
+    jumpr r31
+    .size   main, .-main
+
+.section .text_high, "ax", %progbits
+    .globl myfn
+    .type  myfn, @function
+myfn:
+    jumpr r31
+    .size  myfn, .-myfn
+
+# CHECK:  Disassembly of section .text_low:
+
+# CHECK:             <__hexagon_thunk_myfn_from_.text.thunk>:
+# CHECK-NONPIC-NEXT:    200b4: { immext(#0x1000000)
+# CHECK-NONPIC-NEXT:             jump 0x10200bc <myfn> }
+# CHECK-PIC-NEXT:       200b4: { immext(#0x1000000)
+# CHECK-PIC-NEXT:               r14 = add(pc,##0x1000008) }
+# CHECK-PIC-NEXT:       { jumpr r14 }
+
+# CHECK-NONPIC:      <main>:
+# CHECK-NONPIC-NEXT:   200bc: { call 0x200b4 <__hexagon_thunk_myfn_from_.text.thunk> }
+# CHECK-PIC:         <main>:
+# CHECK-PIC-NEXT:      200c0: { call 0x200b4 <__hexagon_thunk_myfn_from_.text.thunk> }
+# CHECK-NEXT:                 { jumpr r31 }
+
+# CHECK:  Disassembly of section .text_high:
+# CHECK:    <myfn>:
+# CHECK-NEXT: 10200bc: { jumpr r31 }
+
+#--- lds
+SECTIONS {
+  .text_low 0x200b4: { *(.text_low) }
+  .text_high 0x10200bc : { *(.text_high) }
+}
diff --git a/lld/test/ELF/hexagon-tls-allocateaux-multiple.s b/lld/test/ELF/hexagon-tls-allocateaux-multiple.s
new file mode 100644
index 000000000000..a77cc822e67d
--- /dev/null
+++ b/lld/test/ELF/hexagon-tls-allocateaux-multiple.s
@@ -0,0 +1,36 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf b.s -o b.o
+# RUN: ld.lld -shared a.o b.o -o out.so
+# RUN: llvm-readobj -r out.so | FileCheck --check-prefix=RELOC %s
+
+#--- a.s
+.globl _start
+.type _start, @function
+
+_start:
+  r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+  r0 = add(r2,##tls_var@GDGOT)
+  call tls_var@GDPLT
+  jumpr r31
+
+.section .tdata,"awT",@progbits
+.globl tls_var
+.type tls_var, @object
+tls_var:
+  .word 0x1234
+
+#--- b.s
+.globl other_func
+.type other_func, @function
+
+other_func:
+  ## Direct call to __tls_get_addr - this creates another path that may
+  ## try to allocate auxiliary data for the same symbol
+  call __tls_get_addr
+  jumpr r31
+
+# RELOC:      Section ({{.*}}) .rela.plt {
+# RELOC:        R_HEX_JMP_SLOT __tls_get_addr 0x0
+# RELOC:      }
diff --git a/lld/test/ELF/hexagon-tls-gd-xform.s b/lld/test/ELF/hexagon-tls-gd-xform.s
index 65aeb118fcb3..ade54e8a16fa 100644
--- a/lld/test/ELF/hexagon-tls-gd-xform.s
+++ b/lld/test/ELF/hexagon-tls-gd-xform.s
@@ -18,10 +18,10 @@
 _start:
 .ifdef GDPLT
                         call x@gdplt
-# CHECK_GDPLT:  101ec: { call 0x10220 }
+# CHECK_GDPLT:  101ec: { call 0x10220 <__tls_get_addr@plt> }
 .else
                   call x
-# CHECK:  101b8: { call 0x101e0 }
+# CHECK:  101b8: { call 0x101e0 <x@plt> }
 .endif
 
 # CHECK_GDPLT:        10220: { immext(#0x20040)
diff --git a/lld/test/ELF/loongarch-pc-hi20-lo12-got.s b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s
new file mode 100644
index 000000000000..acd94007d0ff
--- /dev/null
+++ b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s
@@ -0,0 +1,145 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o
+
+# RUN: ld.lld a.o -T within-range.t -o a
+# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s
+
+## This test verifies the encoding when the register $a0 is used.
+# CHECK:      pcalau12i $a0, 0
+# CHECK-NEXT: addi.d    $a0, $a0, -2048
+
+## PCALAU12I contains a nonzero addend, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a1, 2
+# CHECK-NEXT: ld.d      $a1, $a1, -2048
+
+## LD contains a nonzero addend, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a2, 2
+# CHECK-NEXT: ld.d      $a2, $a2, -2040
+
+## PCALAU12I and LD use different registers, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a3, 2
+# CHECK-NEXT: ld.d      $a4, $a3, -2048
+
+## PCALAU12I and LD use different registers, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a5, 2
+# CHECK-NEXT: ld.d      $a5, $a6, -2048
+
+# RUN: ld.lld a.o -T underflow-range.t -o a-underflow
+# RUN: llvm-objdump -d --no-show-raw-insn a-underflow | FileCheck --check-prefix=OUTRANGE %s
+
+# RUN: ld.lld a.o -T overflow-range.t -o a-overflow
+# RUN: llvm-objdump -d --no-show-raw-insn a-overflow | FileCheck --check-prefix=OUTRANGE %s
+
+# OUTRANGE:      pcalau12i $a0, 1
+# OUTRANGE-NEXT: ld.d      $a0, $a0, 0
+
+## Relocations do not appear in pairs, no relaxations should be applied.
+# RUN: ld.lld unpaired.o -T within-range.t  -o unpaired
+# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s
+
+# UNPAIRED:         pcalau12i $a0, 2
+# UNPAIRED-NEXT:    b         8
+# UNPAIRED-NEXT:    pcalau12i $a0, 2
+# UNPAIRED:         ld.d      $a0, $a0, -2048
+
+## Relocations do not appear in pairs, no relaxations should be applied.
+# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr
+# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s
+
+# LONE-LDR:         ld.d   $a0, $a0, -2048
+
+## 32-bit code is mostly the same. We only test a few variants.
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o
+# RUN: ld.lld a.32.o -T within-range.t -o a32
+# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s
+
+## This test verifies the encoding when the register $a0 is used.
+# CHECK32:      pcalau12i $a0, 0
+# CHECK32-NEXT: addi.w    $a0, $a0, -2048
+
+
+## This linker script ensures that .rodata and .text are sufficiently close to
+## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + add.
+#--- within-range.t
+SECTIONS {
+ .rodata 0x1800: { *(.rodata) }
+ .text   0x2800: { *(.text) }
+ .got    0x3800: { *(.got) }
+}
+
+## This linker script ensures that .rodata and .text are sufficiently far apart
+## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + add.
+#--- underflow-range.t
+SECTIONS {
+ .rodata 0x800-4: { *(.rodata) }
+ .got    0x80002000: { *(.got) }
+ .text   0x80001000: { *(.text) }  /* (0x800-4)+2GB+0x800+4 */
+}
+
+#--- overflow-range.t
+SECTIONS {
+ .text   0x1000: { *(.text) }
+ .got    0x2000: { *(.got) }
+ .rodata 0x80000800 : { *(.rodata) }  /* 0x1000+2GB-0x800 */
+}
+
+#--- a.s
+## Symbol 'x' is nonpreemptible, the optimization should be applied.
+.rodata
+.hidden x
+x:
+.word 10
+
+.text
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  ld.d      $a0, $a0, %got_pc_lo12(x)
+  pcalau12i $a1, %got_pc_hi20(x+1)
+  ld.d      $a1, $a1, %got_pc_lo12(x)
+  pcalau12i $a2, %got_pc_hi20(x)
+  ld.d      $a2, $a2, %got_pc_lo12(x+8)
+  pcalau12i $a3, %got_pc_hi20(x)
+  ld.d      $a4, $a3, %got_pc_lo12(x)
+  pcalau12i $a5, %got_pc_hi20(x)
+  ld.d      $a5, $a6, %got_pc_lo12(x)
+
+#--- unpaired.s
+.text
+.hidden x
+x:
+  nop
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  b L
+  pcalau12i $a0, %got_pc_hi20(x)
+L:
+  ld.d      $a0, $a0, %got_pc_lo12(x)
+
+#--- lone-ldr.s
+.text
+.hidden x
+x:
+  nop
+.global _start
+_start:
+  ld.d     $a0, $a0, %got_pc_lo12(x)
+
+
+#--- a.32.s
+## Symbol 'x' is nonpreemptible, the optimization should be applied.
+.rodata
+.hidden x
+x:
+.word 10
+
+.text
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  ld.w      $a0, $a0, %got_pc_lo12(x)
diff --git a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
index a417d89e9fa2..08d5d3e950d8 100644
--- a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
+++ b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
@@ -1,22 +1,23 @@
 # REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
 
-# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax %s -o %t.32.o
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax a.s -o a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax a.s -o a.64.o
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.32.o -o %t.32
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.64.o -o %t.64
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck --check-prefixes=RELAX %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck --check-prefixes=RELAX %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.32.o -o a.32
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.64.o -o a.64
+# RUN: llvm-objdump -td --no-show-raw-insn a.32 | FileCheck --check-prefixes=RELAX %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64 | FileCheck --check-prefixes=RELAX %s
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.32.o -shared -o %t.32s
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.64.o -shared -o %t.64s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32s | FileCheck --check-prefixes=RELAX %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64s | FileCheck --check-prefixes=RELAX %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.32.o -shared -o a.32s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.64.o -shared -o a.64s
+# RUN: llvm-objdump -td --no-show-raw-insn a.32s | FileCheck --check-prefixes=RELAX %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64s | FileCheck --check-prefixes=RELAX %s
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 %t.32.o -o %t.32o
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 %t.64.o -o %t.64o
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32o | FileCheck --check-prefixes=NORELAX32 %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64o | FileCheck --check-prefixes=NORELAX64 %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 a.32.o -o a.32o
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 a.64.o -o a.64o
+# RUN: llvm-objdump -td --no-show-raw-insn a.32o | FileCheck --check-prefixes=NORELAX32 %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64o | FileCheck --check-prefixes=NORELAX64 %s
 
 # RELAX-LABEL: <_start>:
 ## offset = 0x14000 - 0x10000 = 4096<<2
@@ -30,25 +31,46 @@
 ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
 # NORELAX32-NEXT:  10000:  pcalau12i     $a0, 1024
 # NORELAX32-NEXT:          addi.w        $a0, $a0, 0
+## Not a relaxation; conversion to PCRel.
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
-# NORELAX32-NEXT:          ld.w          $a0, $a0, 4
+# NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
 # NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
-# NORELAX32-NEXT:          ld.w          $a0, $a0, 4
+# NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 
 # NORELAX64-LABEL: <_start>:
 ## offset exceed range of pcaddi
 ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
 # NORELAX64-NEXT:  10000:  pcalau12i     $a0, 1024
 # NORELAX64-NEXT:          addi.d        $a0, $a0, 0
+## Not a relaxation; conversion to PCRel.
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
-# NORELAX64-NEXT:          ld.d          $a0, $a0, 8
+# NORELAX64-NEXT:          addi.d        $a0, $a0, 0
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
 # NORELAX64-NEXT:          addi.d        $a0, $a0, 0
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
-# NORELAX64-NEXT:          ld.d          $a0, $a0, 8
+# NORELAX64-NEXT:          addi.d        $a0, $a0, 0
+
+
+## GOT references with non-zero addends. No relaxation.
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax nonzero.s -o nonzero.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax nonzero.s -o nonzero.64.o
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 nonzero.32.o -o nonzero.32
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 nonzero.64.o -o nonzero.64
+# RUN: llvm-objdump -td --no-show-raw-insn nonzero.32 | FileCheck --check-prefixes=NONZERO32 %s
+# RUN: llvm-objdump -td --no-show-raw-insn nonzero.64 | FileCheck --check-prefixes=NONZERO64 %s
+
+# NONZERO32-LABEL: <_start>:
+# NONZERO32-NEXT:      10000:  pcalau12i $a0, 4
+# NONZERO32-NEXT:              ld.w      $a0, $a0, 8
+
+# NONZERO64-LABEL: <_start>:
+# NONZERO64-NEXT:      10000:  pcalau12i $a0, 4
+# NONZERO64-NEXT:              ld.d      $a0, $a0, 12
+
 
+#--- a.s
 .section .text
 .global _start
 _start:
@@ -60,3 +82,14 @@ _start:
 .section .data
 sym:
   .zero 4
+
+
+#--- nonzero.s
+.section .text
+.global _start
+_start:
+  la.got    $a0, sym+4
+
+.section .data
+sym:
+  .zero 4
diff --git a/lld/test/ELF/loongarch-relax-tlsdesc.s b/lld/test/ELF/loongarch-relax-tlsdesc.s
index 5f4368343471..025cbc09fbdd 100644
--- a/lld/test/ELF/loongarch-relax-tlsdesc.s
+++ b/lld/test/ELF/loongarch-relax-tlsdesc.s
@@ -9,7 +9,6 @@
 # RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
 # RUN: llvm-objdump --no-show-raw-insn -dr -h a.64.so | FileCheck %s --check-prefix=GD64
 
-## FIXME: IE/LE relaxation have not yet been implemented, --relax/--no-relax obtain the same results.
 ## Transition from TLSDESC to IE/LE. Also check --emit-relocs.
 # RUN: ld.lld -e 0 -z now --emit-relocs a.64.o c.64.o -o a.64.le
 # RUN: llvm-readobj -r -x .got a.64.le 2>&1 | FileCheck --check-prefix=LE64-RELA %s
@@ -73,25 +72,21 @@
 # LE64-RELA: could not find section '.got'
 
 ## a@tprel = 0x8
-# LE64:        20158: nop
+# LE64:        20158: ori     $a0, $zero, 8
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          ori     $a0, $zero, 8
 # LE64-NEXT:            R_LARCH_TLS_DESC_CALL a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
 # LE64-NEXT:          add.d   $a1, $a0, $tp
 
 ## b@tprel = 0x7ff
-# LE64:        2016c: nop
+# LE64:        20160: nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 b
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 b
 # LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD b
@@ -101,7 +96,7 @@
 
 ## c@tprel = 0x800
 ## Without R_LARCH_RELAX relocation. No relaxation.
-# LE64:        20180: nop
+# LE64:        20170: nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 c
 # LE64-NEXT:          addi.d  $t0, $zero, 0
 # LE64-NEXT:          nop
@@ -115,13 +110,11 @@
 # LE64-NEXT:          add.d   $a3, $a0, $tp
 
 ## d@tprel = 0x1000
-# LE64:        201a0: nop
+# LE64:        20190: lu12i.w $a0, 1
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 d
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 d
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          lu12i.w $a0, 1
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD d
 # LE64-NEXT:          ori     $a0, $a0, 0
 # LE64-NEXT:            R_LARCH_TLS_DESC_CALL d
@@ -160,35 +153,31 @@
 # LE64-NORELAX-NEXT:          add.d   $a4, $a0, $tp
 
 # IE64-RELA:      .rela.dyn {
-# IE64-RELA-NEXT:   0x30408 R_LARCH_TLS_TPREL64 c 0x0
-# IE64-RELA-NEXT:   0x30410 R_LARCH_TLS_TPREL64 d 0x0
+# IE64-RELA-NEXT:   0x303F0 R_LARCH_TLS_TPREL64 c 0x0
+# IE64-RELA-NEXT:   0x303F8 R_LARCH_TLS_TPREL64 d 0x0
 # IE64-RELA-NEXT: }
 # IE64-RELA:      Hex dump of section '.got':
-# IE64-RELA-NEXT: 0x00030408 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303f0 00000000 00000000 00000000 00000000 .
 
-# IE64:   .got           00000010 0000000000030408
+# IE64:   .got           00000010 00000000000303f0
 
 ## a and b are optimized to use LE. c and d are optimized to IE.
 ## a@tprel = 0x8
-# IE64:        202c8: nop
+# IE64:        202c8: ori     $a0, $zero, 8
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          ori     $a0, $zero, 8
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
 # IE64-NEXT:          add.d   $a1, $a0, $tp
 
 ## b@tprel = 0x7ff
-# IE64:        202dc: nop
+# IE64:        202d0: nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 b
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 b
 # IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD b
@@ -196,9 +185,9 @@
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL b
 # IE64-NEXT:          add.d   $a2, $a0, $tp
 
-## &.got[c]-. = 0x30408 - 0x20300: 0x10 pages, page offset 0x408
+## &.got[c]-. = 0x303f0 - 0x202f0: 0x10 pages, page offset 0x3f0
 ## Without R_LARCH_RELAX relocation. No relaxation.
-# IE64:        202f0: nop
+# IE64:        202e0: nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 c
 # IE64-NEXT:          addi.d  $t0, $zero, 0
 # IE64-NEXT:          nop
@@ -207,20 +196,18 @@
 # IE64-NEXT:          pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD c
 # IE64-NEXT:          addi.d  $t0, $t0, 1
-# IE64-NEXT:          ld.d    $a0, $a0, 1032
+# IE64-NEXT:          ld.d    $a0, $a0, 1008
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL c
 # IE64-NEXT:          add.d   $a3, $a0, $tp
 
-## &.got[d]-. = 0x30408+8 - 0x20318: 0x10 pages, page offset 0x410
-# IE64:        20310: nop
+## &.got[d]-. = 0x303f0+8 - 0x20300: 0x10 pages, page offset 0x3f8
+# IE64:        20300: pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 d
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 d
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD d
-# IE64-NEXT:          ld.d    $a0, $a0, 1040
+# IE64-NEXT:          ld.d    $a0, $a0, 1016
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL d
 # IE64-NEXT:          add.d   $a4, $a0, $tp
 
diff --git a/lldb/docs/use/mcp.md b/lldb/docs/use/mcp.md
index 375c164fe771..b7474246b54f 100644
--- a/lldb/docs/use/mcp.md
+++ b/lldb/docs/use/mcp.md
@@ -75,7 +75,69 @@ Configuration example for [Visual Studio Code](https://code.visualstudio.com/doc
 }
 ```
 
-### Troubleshooting
+## Tools
+
+Tools are a primitive in the Model Context Protocol that enable servers to
+expose functionality to clients.
+
+LLDB's MCP integration exposes one tool, named `lldb_command`, which allows the
+model to run the same commands a user would type in the LLDB command
+interpreter. It takes two arguments:
+
+1. The unique debugger ID as a number.
+2. The command and its arguments as a string.
+
+## Resources
+
+Resources are a primitive in the Model Context Protocol that allow servers to
+expose content that can be read by clients.
+
+LLDB's MCP integration exposes a resource for each debugger and target
+instance. Debugger resources are accessible using the following URI:
+
+```
+lldb://debugger/<debugger id>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1",
+      "mimeType": "application/json",
+      "text": "{\"debugger_id\":1,\"name\":\"debugger_1\",\"num_targets\":1}"
+    }
+  ]
+}
+```
+
+Debuggers can contain one or more targets, which are accessible using the
+following URI:
+
+```
+lldb://debugger/<debugger id>/target/<target idx>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1/target/0",
+      "mimeType": "application/json",
+      "text": "{\"arch\":\"arm64-apple-macosx26.0.0\",\"debugger_id\":1,\"dummy\":false,\"path\":\"/bin/count\",\"platform\":\"host\",\"selected\":true,\"target_idx\":0}"
+    }
+  ]
+}
+```
+
+Note that unlike the debugger id, which is unique, the target index is not
+stable and may be reused when a target is removed and a new target is added.
+
+## Troubleshooting
 
 The MCP server uses the `Host` log channel. You can enable logging with the
 `log enable` command.
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index a4ff96e4158c..a47ffabdecd0 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1380,6 +1380,9 @@ def isAArch64SMEFA64(self):
     def isAArch64MTE(self):
         return self.isAArch64() and "mte" in self.getCPUInfo()
 
+    def isAArch64MTEStoreOnly(self):
+        return self.isAArch64() and "mtestoreonly" in self.getCPUInfo()
+
     def isAArch64GCS(self):
         return self.isAArch64() and "gcs" in self.getCPUInfo()
 
diff --git a/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
index 35649b10ce45..a1aa7e1a69f3 100644
--- a/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
+++ b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
@@ -4,6 +4,14 @@
 #include <type_traits>
 #include <utility> // for std::forward
 
+// COMPRESSED_PAIR_REV versions:
+// 0 -> Post-c88580c layout
+// 1 -> Post-27c83382d83dc layout
+// 2 -> Post-769c42f4a552a layout
+// 3 -> Post-f5e687d7bf49c layout
+// 4 -> padding-less no_unique_address-based layout (introduced in
+// 27c83382d83dc)
+
 namespace std {
 namespace __lldb {
 
@@ -13,7 +21,50 @@ namespace __lldb {
 #define _LLDB_NO_UNIQUE_ADDRESS [[__no_unique_address__]]
 #endif
 
-#if COMPRESSED_PAIR_REV == 0 // Post-c88580c layout
+// From libc++ datasizeof.h
+template <class _Tp> struct _FirstPaddingByte {
+  _LLDB_NO_UNIQUE_ADDRESS _Tp __v_;
+  char __first_padding_byte_;
+};
+
+template <class _Tp>
+inline const size_t __datasizeof_v =
+    __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
+
+template <class _Tp>
+struct __lldb_is_final : public integral_constant<bool, __is_final(_Tp)> {};
+
+// The legacy layout has been patched, see
+// https://github.com/llvm/llvm-project/pull/142516.
+#if COMPRESSED_PAIR_REV == 1
+template <class _ToPad> class __compressed_pair_padding {
+  char __padding_[((is_empty<_ToPad>::value &&
+                    !__lldb_is_final<_ToPad>::value) ||
+                   is_reference<_ToPad>::value)
+                      ? 0
+                      : sizeof(_ToPad) - __datasizeof_v<_ToPad>];
+};
+#elif COMPRESSED_PAIR_REV > 1 && COMPRESSED_PAIR_REV < 4
+template <class _ToPad>
+inline const bool __is_reference_or_unpadded_object =
+    (std::is_empty<_ToPad>::value && !__lldb_is_final<_ToPad>::value) ||
+    sizeof(_ToPad) == __datasizeof_v<_ToPad>;
+
+template <class _Tp>
+inline const bool __is_reference_or_unpadded_object<_Tp &> = true;
+
+template <class _Tp>
+inline const bool __is_reference_or_unpadded_object<_Tp &&> = true;
+
+template <class _ToPad, bool _Empty = __is_reference_or_unpadded_object<_ToPad>>
+class __compressed_pair_padding {
+  char __padding_[sizeof(_ToPad) - __datasizeof_v<_ToPad>] = {};
+};
+
+template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
+#endif // COMPRESSED_PAIR_REV == 1
+
+#if COMPRESSED_PAIR_REV == 0
 struct __value_init_tag {};
 struct __default_init_tag {};
 
@@ -59,49 +110,6 @@ class __compressed_pair : private __compressed_pair_elem<_T1, 0>,
   _T1 &first() { return static_cast<_Base1 &>(*this).__get(); }
 };
 #elif COMPRESSED_PAIR_REV == 1 || COMPRESSED_PAIR_REV == 2
-// From libc++ datasizeof.h
-template <class _Tp> struct _FirstPaddingByte {
-  _LLDB_NO_UNIQUE_ADDRESS _Tp __v_;
-  char __first_padding_byte_;
-};
-
-template <class _Tp>
-inline const size_t __datasizeof_v =
-    __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
-
-template <class _Tp>
-struct __lldb_is_final : public integral_constant<bool, __is_final(_Tp)> {};
-
-// The legacy layout has been patched, see
-// https://github.com/llvm/llvm-project/pull/142516.
-#if COMPRESSED_PAIR_REV == 1
-template <class _ToPad> class __compressed_pair_padding {
-  char __padding_[((is_empty<_ToPad>::value &&
-                    !__lldb_is_final<_ToPad>::value) ||
-                   is_reference<_ToPad>::value)
-                      ? 0
-                      : sizeof(_ToPad) - __datasizeof_v<_ToPad>];
-};
-#else
-template <class _ToPad>
-inline const bool __is_reference_or_unpadded_object =
-    (std::is_empty<_ToPad>::value && !__lldb_is_final<_ToPad>::value) ||
-    sizeof(_ToPad) == __datasizeof_v<_ToPad>;
-
-template <class _Tp>
-inline const bool __is_reference_or_unpadded_object<_Tp &> = true;
-
-template <class _Tp>
-inline const bool __is_reference_or_unpadded_object<_Tp &&> = true;
-
-template <class _ToPad, bool _Empty = __is_reference_or_unpadded_object<_ToPad>>
-class __compressed_pair_padding {
-  char __padding_[sizeof(_ToPad) - __datasizeof_v<_ToPad>] = {};
-};
-
-template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
-#endif
-
 #define _LLDB_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)              \
   [[__gnu__::__aligned__(                                                      \
       alignof(T2))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;                  \
@@ -119,6 +127,27 @@ template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
   _LLDB_NO_UNIQUE_ADDRESS T3 Initializer3;                                     \
   _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T3> __padding3_;
 #elif COMPRESSED_PAIR_REV == 3
+#define _LLDB_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)              \
+  struct {                                                                     \
+    [[__gnu__::__aligned__(                                                    \
+        alignof(T2))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;                \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T1> __padding1_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T2 Initializer2;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T2> __padding2_;         \
+  }
+
+#define _LLDB_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3,        \
+                                Initializer3)                                  \
+  struct {                                                                     \
+    [[using __gnu__: __aligned__(alignof(T2)),                                 \
+      __aligned__(alignof(T3))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;      \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T1> __padding1_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T2 Initializer2;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T2> __padding2_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T3 Initializer3;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T3> __padding3_;         \
+  }
+#elif COMPRESSED_PAIR_REV == 4
 #define _LLDB_COMPRESSED_PAIR(T1, Name1, T2, Name2)                            \
   _LLDB_NO_UNIQUE_ADDRESS T1 Name1;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T2 Name2
@@ -127,7 +156,7 @@ template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
   _LLDB_NO_UNIQUE_ADDRESS T1 Name1;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T2 Name2;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T3 Name3
-#endif
+#endif // COMPRESSED_PAIR_REV
 } // namespace __lldb
 } // namespace std
 
diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp
index a1de895c0ba9..c0b10797e506 100644
--- a/lldb/source/Host/windows/MainLoopWindows.cpp
+++ b/lldb/source/Host/windows/MainLoopWindows.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/WindowsError.h"
 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <ctime>
 #include <io.h>
@@ -222,7 +223,7 @@ MainLoopWindows::RegisterReadObject(const IOObjectSP &object_sp,
 
   if (m_read_fds.find(waitable_handle) != m_read_fds.end()) {
     error = Status::FromErrorStringWithFormat(
-        "File descriptor %d already monitored.", waitable_handle);
+        "File descriptor %p already monitored.", waitable_handle);
     return nullptr;
   }
 
@@ -234,7 +235,7 @@ MainLoopWindows::RegisterReadObject(const IOObjectSP &object_sp,
   } else {
     DWORD file_type = GetFileType(waitable_handle);
     if (file_type != FILE_TYPE_PIPE) {
-      error = Status::FromErrorStringWithFormat("Unsupported file type %d",
+      error = Status::FromErrorStringWithFormat("Unsupported file type %ld",
                                                 file_type);
       return nullptr;
     }
diff --git a/lldb/source/Host/windows/PipeWindows.cpp b/lldb/source/Host/windows/PipeWindows.cpp
index 0b495fff69df..001396fafde0 100644
--- a/lldb/source/Host/windows/PipeWindows.cpp
+++ b/lldb/source/Host/windows/PipeWindows.cpp
@@ -279,7 +279,8 @@ llvm::Expected<size_t> PipeWindows::Read(void *buf, size_t size,
     return Status(failure_error, eErrorTypeWin32).takeError();
 
   DWORD timeout_msec =
-      timeout ? ceil<std::chrono::milliseconds>(*timeout).count() : INFINITE;
+      timeout ? std::chrono::ceil<std::chrono::milliseconds>(*timeout).count()
+              : INFINITE;
   DWORD wait_result =
       ::WaitForSingleObject(m_read_overlapped.hEvent, timeout_msec);
   if (wait_result != WAIT_OBJECT_0) {
@@ -324,7 +325,8 @@ llvm::Expected<size_t> PipeWindows::Write(const void *buf, size_t size,
     return Status(failure_error, eErrorTypeWin32).takeError();
 
   DWORD timeout_msec =
-      timeout ? ceil<std::chrono::milliseconds>(*timeout).count() : INFINITE;
+      timeout ? std::chrono::ceil<std::chrono::milliseconds>(*timeout).count()
+              : INFINITE;
   DWORD wait_result =
       ::WaitForSingleObject(m_write_overlapped.hEvent, timeout_msec);
   if (wait_result != WAIT_OBJECT_0) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index a7874047942c..6053d042b29b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -49,11 +49,6 @@ static void consumeInlineNamespace(llvm::StringRef &name) {
   }
 }
 
-bool lldb_private::formatters::isOldCompressedPairLayout(
-    ValueObject &pair_obj) {
-  return isStdTemplate(pair_obj.GetTypeName(), "__compressed_pair");
-}
-
 bool lldb_private::formatters::isStdTemplate(ConstString type_name,
                                              llvm::StringRef type) {
   llvm::StringRef name = type_name.GetStringRef();
@@ -105,6 +100,44 @@ lldb_private::formatters::GetSecondValueOfLibCXXCompressedPair(
   return value;
 }
 
+std::pair<lldb::ValueObjectSP, bool>
+lldb_private::formatters::GetValueOrOldCompressedPair(
+    ValueObject &obj, size_t anon_struct_idx, llvm::StringRef child_name,
+    llvm::StringRef compressed_pair_name) {
+  auto is_old_compressed_pair = [](ValueObject &pair_obj) -> bool {
+    return isStdTemplate(pair_obj.GetTypeName(), "__compressed_pair");
+  };
+
+  // Try searching the child member in an anonymous structure first.
+  if (auto unwrapped = obj.GetChildAtIndex(anon_struct_idx)) {
+    ValueObjectSP node_sp(obj.GetChildMemberWithName(child_name));
+    if (node_sp)
+      return {node_sp, is_old_compressed_pair(*node_sp)};
+  }
+
+  // Older versions of libc++ don't wrap the children in anonymous structures.
+  // Try that instead.
+  ValueObjectSP node_sp(obj.GetChildMemberWithName(child_name));
+  if (node_sp)
+    return {node_sp, is_old_compressed_pair(*node_sp)};
+
+  // Try the even older __compressed_pair layout.
+
+  assert(!compressed_pair_name.empty());
+
+  node_sp = obj.GetChildMemberWithName(compressed_pair_name);
+
+  // Unrecognized layout (possibly older than LLDB supports).
+  if (!node_sp)
+    return {nullptr, false};
+
+  // Expected old compressed_pair layout, but got something else.
+  if (!is_old_compressed_pair(*node_sp))
+    return {nullptr, false};
+
+  return {node_sp, true};
+}
+
 bool lldb_private::formatters::LibcxxFunctionSummaryProvider(
     ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) {
 
@@ -205,11 +238,12 @@ bool lldb_private::formatters::LibcxxUniquePointerSummaryProvider(
   if (!valobj_sp)
     return false;
 
-  ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_"));
+  auto [ptr_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *valobj_sp, /*anon_struct_idx=*/0, "__ptr_", "__ptr_");
   if (!ptr_sp)
     return false;
 
-  if (isOldCompressedPairLayout(*ptr_sp))
+  if (is_compressed_pair)
     ptr_sp = GetFirstValueOfLibCXXCompressedPair(*ptr_sp);
 
   if (!ptr_sp)
@@ -379,13 +413,14 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() {
   if (!valobj_sp)
     return lldb::ChildCacheState::eRefetch;
 
-  ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_"));
+  auto [ptr_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *valobj_sp, /*anon_struct_idx=*/0, "__ptr_", "__ptr_");
   if (!ptr_sp)
     return lldb::ChildCacheState::eRefetch;
 
   // Retrieve the actual pointer and the deleter, and clone them to give them
   // user-friendly names.
-  if (isOldCompressedPairLayout(*ptr_sp)) {
+  if (is_compressed_pair) {
     if (ValueObjectSP value_pointer_sp =
             GetFirstValueOfLibCXXCompressedPair(*ptr_sp))
       m_value_ptr_sp = value_pointer_sp->Clone(ConstString("pointer"));
@@ -424,17 +459,15 @@ enum class StringLayout { CSD, DSC };
 }
 
 static ValueObjectSP ExtractLibCxxStringData(ValueObject &valobj) {
-  if (auto rep_sp = valobj.GetChildMemberWithName("__rep_"))
-    return rep_sp;
-
-  ValueObjectSP valobj_r_sp = valobj.GetChildMemberWithName("__r_");
-  if (!valobj_r_sp || !valobj_r_sp->GetError().Success())
+  auto [valobj_r_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      valobj, /*anon_struct_idx=*/0, "__rep_", "__r_");
+  if (!valobj_r_sp)
     return nullptr;
 
-  if (!isOldCompressedPairLayout(*valobj_r_sp))
-    return nullptr;
+  if (is_compressed_pair)
+    return GetFirstValueOfLibCXXCompressedPair(*valobj_r_sp);
 
-  return GetFirstValueOfLibCXXCompressedPair(*valobj_r_sp);
+  return valobj_r_sp;
 }
 
 /// Determine the size in bytes of \p valobj (a libc++ std::string object) and
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index d88a6ecb1fa8..819f8a985f9b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -25,7 +25,22 @@ GetChildMemberWithName(ValueObject &obj,
 
 lldb::ValueObjectSP GetFirstValueOfLibCXXCompressedPair(ValueObject &pair);
 lldb::ValueObjectSP GetSecondValueOfLibCXXCompressedPair(ValueObject &pair);
-bool isOldCompressedPairLayout(ValueObject &pair_obj);
+
+/// Returns the child of \c obj named \c child_name. If \c obj has no such
+/// child, returns the __compressed_pair child named
+/// \c compressed_pair_name instead, if one exists.
+///
+/// The latest libc++ wraps the compressed children in an anonymous structure.
+/// The \c anon_struct_idx indicates the location of this struct.
+///
+/// The returned boolean is \c true if the returned child has an old-style
+/// libc++ __compressed_pair layout.
+///
+/// If no child was found returns a nullptr.
+std::pair<lldb::ValueObjectSP, bool>
+GetValueOrOldCompressedPair(ValueObject &obj, size_t anon_struct_idx,
+                            llvm::StringRef child_name,
+                            llvm::StringRef compressed_pair_name);
 bool isStdTemplate(ConstString type_name, llvm::StringRef type);
 
 bool LibcxxStringSummaryProviderASCII(
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
index 826e6ab090e1..dfc23266fc14 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
@@ -297,11 +297,18 @@ lldb::ChildCacheState ForwardListFrontEnd::Update() {
   if (err.Fail() || !backend_addr)
     return lldb::ChildCacheState::eRefetch;
 
-  ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__before_begin_"));
+  auto list_base_sp = m_backend.GetChildAtIndex(0);
+  if (!list_base_sp)
+    return lldb::ChildCacheState::eRefetch;
+
+  // Anonymous structure index is in base class at index 0.
+  auto [impl_sp, is_compressed_pair] =
+      GetValueOrOldCompressedPair(*list_base_sp, /*anon_struct_idx=*/0,
+                                  "__before_begin_", "__before_begin_");
   if (!impl_sp)
     return ChildCacheState::eRefetch;
 
-  if (isOldCompressedPairLayout(*impl_sp))
+  if (is_compressed_pair)
     impl_sp = GetFirstValueOfLibCXXCompressedPair(*impl_sp);
 
   if (!impl_sp)
@@ -324,17 +331,10 @@ llvm::Expected<uint32_t> ListFrontEnd::CalculateNumChildren() {
   if (!m_head || !m_tail || m_node_address == 0)
     return 0;
 
-  ValueObjectSP size_node_sp(m_backend.GetChildMemberWithName("__size_"));
-  if (!size_node_sp) {
-    size_node_sp = m_backend.GetChildMemberWithName(
-        "__size_alloc_"); // pre-compressed_pair rework
-
-    if (!isOldCompressedPairLayout(*size_node_sp))
-      return llvm::createStringError("Unexpected std::list layout: expected "
-                                     "old __compressed_pair layout.");
-
+  auto [size_node_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      m_backend, /*anon_struct_idx=*/1, "__size_", "__size_alloc_");
+  if (is_compressed_pair)
     size_node_sp = GetFirstValueOfLibCXXCompressedPair(*size_node_sp);
-  }
 
   if (size_node_sp)
     m_count = size_node_sp->GetValueAsUnsigned(UINT32_MAX);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index 41441dfbc718..85766966f155 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -200,7 +200,8 @@ class LibcxxStdMapSyntheticFrontEnd : public SyntheticChildrenFrontEnd {
   llvm::Expected<size_t> GetIndexOfChildWithName(ConstString name) override;
 
 private:
-  llvm::Expected<uint32_t> CalculateNumChildrenForOldCompressedPairLayout();
+  llvm::Expected<uint32_t>
+  CalculateNumChildrenForOldCompressedPairLayout(ValueObject &pair);
 
   /// Returns the ValueObject for the __tree_node type that
   /// holds the key/value pair of the node at index \ref idx.
@@ -254,16 +255,8 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::
 
 llvm::Expected<uint32_t>
 lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::
-    CalculateNumChildrenForOldCompressedPairLayout() {
-  ValueObjectSP node_sp(m_tree->GetChildMemberWithName("__pair3_"));
-  if (!node_sp)
-    return 0;
-
-  if (!isOldCompressedPairLayout(*node_sp))
-    return llvm::createStringError("Unexpected std::map layout: expected "
-                                   "old __compressed_pair layout.");
-
-  node_sp = GetFirstValueOfLibCXXCompressedPair(*node_sp);
+    CalculateNumChildrenForOldCompressedPairLayout(ValueObject &pair) {
+  auto node_sp = GetFirstValueOfLibCXXCompressedPair(pair);
 
   if (!node_sp)
     return 0;
@@ -281,12 +274,16 @@ llvm::Expected<uint32_t> lldb_private::formatters::
   if (m_tree == nullptr)
     return 0;
 
-  if (auto node_sp = m_tree->GetChildMemberWithName("__size_")) {
-    m_count = node_sp->GetValueAsUnsigned(0);
-    return m_count;
-  }
+  auto [size_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *m_tree, /*anon_struct_idx=*/2, "__size_", "__pair3_");
+  if (!size_sp)
+    return llvm::createStringError("Unexpected std::map layout");
 
-  return CalculateNumChildrenForOldCompressedPairLayout();
+  if (is_compressed_pair)
+    return CalculateNumChildrenForOldCompressedPairLayout(*size_sp);
+
+  m_count = size_sp->GetValueAsUnsigned(0);
+  return m_count;
 }
 
 ValueObjectSP
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
index 501fd0945b82..f88a5319068a 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
@@ -130,22 +130,17 @@ CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
 
 CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     GetNodeType() {
-  auto node_sp = m_backend.GetChildAtNamePath({"__table_", "__first_node_"});
-
-  if (!node_sp) {
-    auto p1_sp = m_backend.GetChildAtNamePath({"__table_", "__p1_"});
-    if (!p1_sp)
-      return {};
+  auto table_sp = m_backend.GetChildMemberWithName("__table_");
+  if (!table_sp)
+    return {};
 
-    if (!isOldCompressedPairLayout(*p1_sp))
-      return {};
+  auto [node_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *table_sp, /*anon_struct_idx=*/1, "__first_node_", "__p1_");
+  if (is_compressed_pair)
+    node_sp = GetFirstValueOfLibCXXCompressedPair(*node_sp);
 
-    node_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp);
-    if (!node_sp)
-      return {};
-  }
-
-  assert(node_sp);
+  if (!node_sp)
+    return {};
 
   return node_sp->GetCompilerType().GetTypeTemplateArgument(0).GetPointeeType();
 }
@@ -223,19 +218,15 @@ lldb::ValueObjectSP lldb_private::formatters::
 llvm::Expected<size_t>
 lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     CalculateNumChildrenImpl(ValueObject &table) {
-  if (auto size_sp = table.GetChildMemberWithName("__size_"))
+  auto [size_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      table, /*anon_struct_idx=*/2, "__size_", "__p2_");
+  if (!is_compressed_pair && size_sp)
     return size_sp->GetValueAsUnsigned(0);
 
-  ValueObjectSP p2_sp = table.GetChildMemberWithName("__p2_");
-  if (!p2_sp)
-    return llvm::createStringError(
-        "Unexpected std::unordered_map layout: __p2_ member not found.");
+  if (!is_compressed_pair)
+    return llvm::createStringError("Unsupported std::unordered_map layout.");
 
-  if (!isOldCompressedPairLayout(*p2_sp))
-    return llvm::createStringError("Unexpected std::unordered_map layout: old "
-                                   "__compressed_pair layout not found.");
-
-  ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*p2_sp);
+  ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*size_sp);
 
   if (!num_elements_sp)
     return llvm::createStringError(
@@ -246,19 +237,13 @@ lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
 }
 
 static ValueObjectSP GetTreePointer(ValueObject &table) {
-  ValueObjectSP tree_sp = table.GetChildMemberWithName("__first_node_");
-  if (!tree_sp) {
-    ValueObjectSP p1_sp = table.GetChildMemberWithName("__p1_");
-    if (!p1_sp)
-      return nullptr;
-
-    if (!isOldCompressedPairLayout(*p1_sp))
-      return nullptr;
-
-    tree_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp);
-    if (!tree_sp)
-      return nullptr;
-  }
+  auto [tree_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      table, /*anon_struct_idx=*/1, "__first_node_", "__p1_");
+  if (is_compressed_pair)
+    tree_sp = GetFirstValueOfLibCXXCompressedPair(*tree_sp);
+
+  if (!tree_sp)
+    return nullptr;
 
   return tree_sp->GetChildMemberWithName("__next_");
 }
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
index 4bcdf01c221a..60913e5c1ac5 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
@@ -126,17 +126,15 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::GetChildAtIndex(
 }
 
 static ValueObjectSP GetDataPointer(ValueObject &root) {
-  if (auto cap_sp = root.GetChildMemberWithName("__cap_"))
-    return cap_sp;
-
-  ValueObjectSP cap_sp = root.GetChildMemberWithName("__end_cap_");
+  auto [cap_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      root, /*anon_struct_idx=*/2, "__cap_", "__end_cap_");
   if (!cap_sp)
     return nullptr;
 
-  if (!isOldCompressedPairLayout(*cap_sp))
-    return nullptr;
+  if (is_compressed_pair)
+    return GetFirstValueOfLibCXXCompressedPair(*cap_sp);
 
-  return GetFirstValueOfLibCXXCompressedPair(*cap_sp);
+  return cap_sp;
 }
 
 lldb::ChildCacheState
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
index 7adc00622ec2..d21dac221aa2 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
@@ -44,7 +44,8 @@ NativeRegisterContextFreeBSD::CreateHostNativeRegisterContextFreeBSD(
     NativeProcessFreeBSD &process = native_thread.GetProcess();
     g_register_flags_detector.DetectFields(
         process.GetAuxValue(AuxVector::AUXV_FREEBSD_AT_HWCAP).value_or(0),
-        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP2).value_or(0));
+        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP2).value_or(0),
+        /*hwcap3=*/0);
   }
 
   return new NativeRegisterContextFreeBSD_arm64(target_arch, native_thread);
diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
index 884c7d4b9e35..b1c7421bef8d 100644
--- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
@@ -162,10 +162,13 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux(
 
     opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskTLS);
 
+    std::optional<uint64_t> auxv_at_hwcap3 =
+        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP3);
     std::lock_guard<std::mutex> lock(g_register_flags_detector_mutex);
     if (!g_register_flags_detector.HasDetected())
       g_register_flags_detector.DetectFields(auxv_at_hwcap.value_or(0),
-                                             auxv_at_hwcap2.value_or(0));
+                                             auxv_at_hwcap2.value_or(0),
+                                             auxv_at_hwcap3.value_or(0));
 
     auto register_info_up =
         std::make_unique<RegisterInfoPOSIX_arm64>(target_arch, opt_regsets);
diff --git a/lldb/source/Plugins/Process/Utility/AuxVector.cpp b/lldb/source/Plugins/Process/Utility/AuxVector.cpp
index f495ffb1924e..50500a8593e1 100644
--- a/lldb/source/Plugins/Process/Utility/AuxVector.cpp
+++ b/lldb/source/Plugins/Process/Utility/AuxVector.cpp
@@ -84,6 +84,7 @@ const char *AuxVector::GetEntryName(EntryType type) const {
     case ENTRY_NAME(AUXV_AT_BASE_PLATFORM);  break;
     case ENTRY_NAME(AUXV_AT_RANDOM);         break;
     case ENTRY_NAME(AUXV_AT_HWCAP2);         break;
+    case ENTRY_NAME(AUXV_AT_HWCAP3);         break;
     case ENTRY_NAME(AUXV_AT_EXECFN);         break;
     case ENTRY_NAME(AUXV_AT_SYSINFO);        break;
     case ENTRY_NAME(AUXV_AT_SYSINFO_EHDR);   break;
diff --git a/lldb/source/Plugins/Process/Utility/AuxVector.h b/lldb/source/Plugins/Process/Utility/AuxVector.h
index 2670b34f6b0a..7733e0ffc683 100644
--- a/lldb/source/Plugins/Process/Utility/AuxVector.h
+++ b/lldb/source/Plugins/Process/Utility/AuxVector.h
@@ -57,6 +57,7 @@ class AuxVector {
     AUXV_AT_BASE_PLATFORM = 24, ///< String identifying real platforms.
     AUXV_AT_RANDOM = 25,        ///< Address of 16 random bytes.
     AUXV_AT_HWCAP2 = 26,        ///< Extension of AT_HWCAP.
+    AUXV_AT_HWCAP3 = 29,        ///< Extension of AT_HWCAP.
     AUXV_AT_EXECFN = 31,        ///< Filename of executable.
     AUXV_AT_SYSINFO = 32, ///< Pointer to the global system page used for system
                           /// calls and other nice things.
diff --git a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
index 042940b7dff6..330a24af67c4 100644
--- a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
+++ b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
@@ -26,11 +26,15 @@
 #define HWCAP2_EBF16 (1ULL << 32)
 #define HWCAP2_FPMR (1ULL << 48)
 
+#define HWCAP3_MTE_STORE_ONLY (1ULL << 1)
+
 using namespace lldb_private;
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   (void)hwcap;
+  (void)hwcap3;
 
   if (!(hwcap2 & HWCAP2_FPMR))
     return {};
@@ -53,8 +57,10 @@ Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2) {
 
 Arm64RegisterFlagsDetector::Fields
 Arm64RegisterFlagsDetector::DetectGCSFeatureFields(uint64_t hwcap,
-                                                   uint64_t hwcap2) {
+                                                   uint64_t hwcap2,
+                                                   uint64_t hwcap3) {
   (void)hwcap2;
+  (void)hwcap3;
 
   if (!(hwcap & HWCAP_GCS))
     return {};
@@ -67,8 +73,10 @@ Arm64RegisterFlagsDetector::DetectGCSFeatureFields(uint64_t hwcap,
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   (void)hwcap;
+  (void)hwcap3;
 
   if (!(hwcap2 & HWCAP2_SME))
     return {};
@@ -83,8 +91,8 @@ Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap,
-                                                uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2,
+                                                uint64_t hwcap3) {
   (void)hwcap;
 
   if (!(hwcap2 & HWCAP2_MTE))
@@ -94,16 +102,29 @@ Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap,
   // to prctl(PR_TAGGED_ADDR_CTRL...). Fields are derived from the defines
   // used to build the value.
 
+  std::vector<RegisterFlags::Field> fields;
+  fields.reserve(4);
+  if (hwcap3 & HWCAP3_MTE_STORE_ONLY)
+    fields.push_back({"STORE_ONLY", 19});
+
   static const FieldEnum tcf_enum(
       "tcf_enum",
       {{0, "TCF_NONE"}, {1, "TCF_SYNC"}, {2, "TCF_ASYNC"}, {3, "TCF_ASYMM"}});
-  return {{"TAGS", 3, 18}, // 16 bit bitfield shifted up by PR_MTE_TAG_SHIFT.
-          {"TCF", 1, 2, &tcf_enum},
-          {"TAGGED_ADDR_ENABLE", 0}};
+
+  fields.insert(
+      std::end(fields),
+      {{"TAGS", 3, 18}, // 16 bit bitfield shifted up by PR_MTE_TAG_SHIFT.
+       {"TCF", 1, 2, &tcf_enum},
+       {"TAGGED_ADDR_ENABLE", 0}});
+
+  return fields;
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
+  (void)hwcap3;
+
   static const FieldEnum rmode_enum(
       "rmode_enum", {{0, "RN"}, {1, "RP"}, {2, "RM"}, {3, "RZ"}});
 
@@ -142,10 +163,12 @@ Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   // fpsr's contents are constant.
   (void)hwcap;
   (void)hwcap2;
+  (void)hwcap3;
 
   return {
       // Bits 31-28 are N/Z/C/V, only used by AArch32.
@@ -162,7 +185,10 @@ Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
+  (void)hwcap3;
+
   // The fields here are a combination of the Arm manual's SPSR_EL1,
   // plus a few changes where Linux has decided not to make use of them at all,
   // or at least not from userspace.
@@ -207,9 +233,10 @@ Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2) {
   return cpsr_fields;
 }
 
-void Arm64RegisterFlagsDetector::DetectFields(uint64_t hwcap, uint64_t hwcap2) {
+void Arm64RegisterFlagsDetector::DetectFields(uint64_t hwcap, uint64_t hwcap2,
+                                              uint64_t hwcap3) {
   for (auto &reg : m_registers)
-    reg.m_flags.SetFields(reg.m_detector(hwcap, hwcap2));
+    reg.m_flags.SetFields(reg.m_detector(hwcap, hwcap2, hwcap3));
   m_has_detected = true;
 }
 
diff --git a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
index 7daebcc71db0..aec2bf9f4886 100644
--- a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
+++ b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
@@ -40,7 +40,7 @@ class Arm64RegisterFlagsDetector {
   /// If called more than once, fields will be redetected each time from
   /// scratch. If the target would not have this register at all, the list of
   /// fields will be left empty.
-  void DetectFields(uint64_t hwcap, uint64_t hwcap2);
+  void DetectFields(uint64_t hwcap, uint64_t hwcap2, uint64_t hwcap3);
 
   /// Add the field information of any registers named in this class,
   /// to the relevant RegisterInfo instances. Note that this will be done
@@ -53,15 +53,22 @@ class Arm64RegisterFlagsDetector {
 
 private:
   using Fields = std::vector<RegisterFlags::Field>;
-  using DetectorFn = std::function<Fields(uint64_t, uint64_t)>;
+  using DetectorFn = std::function<Fields(uint64_t, uint64_t, uint64_t)>;
 
-  static Fields DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectGCSFeatureFields(uint64_t hwcap, uint64_t hwcap2);
+  static Fields DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2,
+                                    uint64_t hwcap3);
+  static Fields DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectGCSFeatureFields(uint64_t hwcap, uint64_t hwcap2,
+                                       uint64_t hwcap3);
 
   struct RegisterEntry {
     RegisterEntry(llvm::StringRef name, unsigned size, DetectorFn detector)
diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
index bd02bb0e69a4..d5046d369ab2 100644
--- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
+++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
@@ -96,14 +96,19 @@ RegisterContextCorePOSIX_arm64::RegisterContextCorePOSIX_arm64(
   llvm::Triple::OSType os = process->GetArchitecture().GetTriple().getOS();
   if ((os == llvm::Triple::Linux) || (os == llvm::Triple::FreeBSD)) {
     AuxVector aux_vec(process->GetAuxvData());
-    std::optional<uint64_t> auxv_at_hwcap = aux_vec.GetAuxValue(
-        os == llvm::Triple::FreeBSD ? AuxVector::AUXV_FREEBSD_AT_HWCAP
-                                    : AuxVector::AUXV_AT_HWCAP);
+    bool is_freebsd = os == llvm::Triple::FreeBSD;
+    std::optional<uint64_t> auxv_at_hwcap =
+        aux_vec.GetAuxValue(is_freebsd ? AuxVector::AUXV_FREEBSD_AT_HWCAP
+                                       : AuxVector::AUXV_AT_HWCAP);
     std::optional<uint64_t> auxv_at_hwcap2 =
         aux_vec.GetAuxValue(AuxVector::AUXV_AT_HWCAP2);
+    std::optional<uint64_t> auxv_at_hwcap3 =
+        is_freebsd ? std::nullopt
+                   : aux_vec.GetAuxValue(AuxVector::AUXV_AT_HWCAP3);
 
     m_register_flags_detector.DetectFields(auxv_at_hwcap.value_or(0),
-                                           auxv_at_hwcap2.value_or(0));
+                                           auxv_at_hwcap2.value_or(0),
+                                           auxv_at_hwcap3.value_or(0));
     m_register_flags_detector.UpdateRegisterInfo(GetRegisterInfo(),
                                                  GetRegisterCount());
   }
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index 4aa9e046d607..656503bb8d22 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -5,8 +5,8 @@ let Definition = "target_experimental" in {
     Global, DefaultTrue,
     Desc<"If true, inject local variables explicitly into the expression text. This will fix symbol resolution when there are name collisions between ivars and local variables. But it can make expressions run much more slowly.">;
   def UseDIL : Property<"use-DIL", "Boolean">,
-    Global, DefaultFalse,
-    Desc<"If true, use the alternative DIL implementation for frame variable evaluation.">;
+    Global, DefaultTrue,
+    Desc<"If true, use the DIL implementation for frame variable evaluation.">;
 }
 
 let Definition = "target" in {
diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp
index fd3f9f872460..6f28434c646c 100644
--- a/lldb/source/ValueObject/DILEval.cpp
+++ b/lldb/source/ValueObject/DILEval.cpp
@@ -303,7 +303,7 @@ Interpreter::Visit(const MemberOfNode *node) {
     }
   }
 
-  if (field_obj && field_obj->GetName() == node->GetFieldName()) {
+  if (field_obj) {
     if (m_use_dynamic != lldb::eNoDynamicValues) {
       lldb::ValueObjectSP dynamic_val_sp =
           field_obj->GetDynamicValue(m_use_dynamic);
diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
index b2ce9602e6a5..8c009aa182d0 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
+++ b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
@@ -29,3 +29,17 @@ def test_frame_var(self):
         self.expect_var_path("ns::i", value="1")
         self.expect_var_path("::ns::ns::i", value="2")
         self.expect_var_path("ns::ns::i", value="2")
+
+        self.expect_var_path("foo", value="1")
+        self.expect_var_path("::(anonymous namespace)::foo", value="13")
+        self.expect_var_path("(anonymous namespace)::foo", value="13")
+        self.expect_var_path("ns1::(anonymous namespace)::foo", value="5")
+        self.expect_var_path(
+            "(anonymous namespace)::ns2::(anonymous namespace)::foo",
+            value="7",
+        )
+        self.expect_var_path("::ns1::(anonymous namespace)::foo", value="5")
+        self.expect_var_path(
+            "::(anonymous namespace)::ns2::(anonymous namespace)::foo",
+            value="7",
+        )
diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
index 8a5c47a6f364..10ffa1e54a99 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
+++ b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
@@ -10,7 +10,26 @@ int i = 2;
 
 } // namespace ns
 
+namespace {
+int foo = 13;
+}
+
+namespace ns1 {
+namespace {
+int foo = 5;
+}
+} // namespace ns1
+
+namespace {
+namespace ns2 {
+namespace {
+int foo = 7;
+}
+} // namespace ns2
+} // namespace
+
 int main(int argc, char **argv) {
+  int foo = 1;
 
-  return 0; // Set a breakpoint here
+  return foo + ::foo + ns1::foo + ns2::foo; // Set a breakpoint here
 }
diff --git a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
index 2570f267bf46..c003d87f8ca3 100644
--- a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
+++ b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
@@ -34,29 +34,41 @@ def test_mte_ctrl_register(self):
             substrs=["stop reason = breakpoint 1."],
         )
 
-        def check_mte_ctrl(async_err, sync_err):
+        has_store_only = self.isAArch64MTEStoreOnly()
+
+        def check_mte_ctrl(async_err, sync_err, store_only):
             # Bit 0 = tagged addressing enabled
             # Bit 1 = synchronous faults
             # Bit 2 = asynchronous faults
-            value = "0x{:016x}".format((async_err << 2) | (sync_err << 1) | 1)
+            # Bit 19 = store only checking mode
+            value = "0x{:016x}".format(
+                (store_only << 19) | (async_err << 2) | (sync_err << 1) | 1
+            )
             expected = [value]
 
             if self.hasXMLSupport():
+                fields = "("
+                if has_store_only:
+                    fields += f"STORE_ONLY = {store_only}, "
+
                 tfc_modes = ["NONE", "SYNC", "ASYNC", "ASYMM"]
-                expected.append(
-                    f"(TAGS = 0, TCF = TCF_{tfc_modes[async_err << 1 | sync_err]}, TAGGED_ADDR_ENABLE = 1)".format(
-                        async_err, sync_err
-                    )
-                )
+                fields += f"TAGS = 0, TCF = TCF_{tfc_modes[async_err << 1 | sync_err]}, TAGGED_ADDR_ENABLE = 1)"
+
+                expected.append(fields)
 
             self.expect("register read mte_ctrl", substrs=expected)
 
         # We start enabled with synchronous faults.
-        check_mte_ctrl(0, 1)
+        check_mte_ctrl(0, 1, 0)
         # Change to asynchronous faults.
         self.runCmd("register write mte_ctrl 5")
-        check_mte_ctrl(1, 0)
+        check_mte_ctrl(1, 0, 0)
         # This would return to synchronous faults if we did not restore the
         # previous value.
         self.expect("expression setup_mte()", substrs=["= 0"])
-        check_mte_ctrl(1, 0)
+        check_mte_ctrl(1, 0, 0)
+
+        # Store only checking requires FEAT_MTE_STORE_ONLY.
+        if has_store_only:
+            self.runCmd(f"register write mte_ctrl {1 | (1 << 19)}")
+            check_mte_ctrl(0, 0, 1)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
index f10811817c0d..5943b35deab8 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
@@ -1,4 +1,4 @@
-#define COMPRESSED_PAIR_REV 3
+#define COMPRESSED_PAIR_REV 4
 #include <libcxx-simulators-common/compressed_pair.h>
 
 namespace std {
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
index c8d9c2e389a0..f27fc2e3c456 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
@@ -28,7 +28,7 @@ def _run_test(self, defines):
 
 for v in [None, "ALTERNATE_LAYOUT"]:
     for r in range(6):
-        for c in range(4):
+        for c in range(5):
             name = "test_r%d_c%d" % (r, c)
             defines = ["REVISION=%d" % r, "COMPRESSED_PAIR_REV=%d" % c]
             if v:
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
index cf431e524069..b19c05159670 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
@@ -209,7 +209,7 @@ template <class _CharT, class _Traits, class _Allocator> class basic_string {
   __long &getLongRep() {
 #if COMPRESSED_PAIR_REV == 0
     return __r_.first().__l;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
     return __rep_.__l;
 #endif
   }
@@ -217,14 +217,14 @@ template <class _CharT, class _Traits, class _Allocator> class basic_string {
   __short &getShortRep() {
 #if COMPRESSED_PAIR_REV == 0
     return __r_.first().__s;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
     return __rep_.__s;
 #endif
   }
 
 #if COMPRESSED_PAIR_REV == 0
   std::__lldb::__compressed_pair<__rep, allocator_type> __r_;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
   _LLDB_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_);
 #endif
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
index e623c3a1413b..1e25ac947203 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
@@ -26,7 +26,7 @@ def _run_test(self, defines):
         )
 
 
-for r in range(4):
+for r in range(5):
     name = "test_r%d" % r
     defines = ["COMPRESSED_PAIR_REV=%d" % r]
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
index 3d174a91cc26..bd840aaceffa 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
@@ -20,8 +20,7 @@ template <class _Tp, class _Dp = default_delete<_Tp>> class unique_ptr {
   std::__lldb::__compressed_pair<pointer, deleter_type> __ptr_;
   explicit unique_ptr(pointer __p) noexcept
       : __ptr_(__p, std::__lldb::__value_init_tag()) {}
-#elif COMPRESSED_PAIR_REV == 1 || COMPRESSED_PAIR_REV == 2 ||                  \
-    COMPRESSED_PAIR_REV == 3
+#else
   _LLDB_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_);
   explicit unique_ptr(pointer __p) noexcept : __ptr_(__p), __deleter_() {}
 #endif
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
index bfdc8229094f..825e1a4b79fd 100644
--- a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
+++ b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
@@ -10,8 +10,8 @@
 class AArch64LinuxMTEMemoryTagCoreFileTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    MTE_BUF_ADDR = hex(0xFFFF82C74000)
-    BUF_ADDR = hex(0xFFFF82C73000)
+    MTE_BUF_ADDR = hex(0xFFFFA733B000)
+    BUF_ADDR = hex(0xFFFFA733A000)
 
     @skipIfLLVMTargetMissing("AArch64")
     def test_mte_tag_core_file_memory_region(self):
@@ -215,7 +215,7 @@ def test_mte_tag_fault_reason(self):
         self.expect(
             "bt",
             substrs=[
-                "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address=0xffff82c74010)"
+                "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address=0xffffa733b010)"
             ],
         )
 
@@ -231,12 +231,15 @@ def test_mte_ctrl_register(self):
         self.runCmd("target create --core core.mte")
         # The expected value is:
         # * Allowed tags value of 0xFFFF, shifted up by 3 resulting in 0x7fff8.
+        # * Bit 19 set to 0, which means that store only checking is disabled.
         # * Bit 1 set to enable synchronous tag faults.
         # * Bit 0 set to enable the tagged address ABI.
         expected = ["mte_ctrl = 0x000000000007fffb"]
 
         if self.hasXMLSupport():
-            expected.append("(TAGS = 65535, TCF = TCF_SYNC, TAGGED_ADDR_ENABLE = 1)")
+            expected.append(
+                "(STORE_ONLY = 0, TAGS = 65535, TCF = TCF_SYNC, TAGGED_ADDR_ENABLE = 1)"
+            )
 
         self.expect("register read mte_ctrl", substrs=expected)
 
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/core.mte b/lldb/test/API/linux/aarch64/mte_core_file/core.mte
index 84a3266667e7..188d06d11c71 100644
Binary files a/lldb/test/API/linux/aarch64/mte_core_file/core.mte and b/lldb/test/API/linux/aarch64/mte_core_file/core.mte differ
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/core.nomte b/lldb/test/API/linux/aarch64/mte_core_file/core.nomte
index 201f2880e6dc..454ff8361cc3 100644
Binary files a/lldb/test/API/linux/aarch64/mte_core_file/core.nomte and b/lldb/test/API/linux/aarch64/mte_core_file/core.nomte differ
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/main.c b/lldb/test/API/linux/aarch64/mte_core_file/main.c
index 6537edd7bdb9..597459459bb0 100644
--- a/lldb/test/API/linux/aarch64/mte_core_file/main.c
+++ b/lldb/test/API/linux/aarch64/mte_core_file/main.c
@@ -23,7 +23,7 @@
 
 int main(int argc, char const *argv[]) {
 #ifdef NO_MTE
-  *(char *)(0) = 0;
+  __builtin_trap();
 #endif
 
   if (prctl(PR_SET_TAGGED_ADDR_CTRL,
diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
index 0d2774b28171..20a75f4076e4 100644
--- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
+++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
@@ -131,7 +131,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("var2 + struct1.foo")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 2, which is an anonymous block
         self.continue_to_breakpoint(breakpoint_2)
@@ -169,7 +169,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("var2 + struct1.foo")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 3, which is inside a_function
         self.continue_to_breakpoint(breakpoint_3)
@@ -195,7 +195,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("list + 1")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Now we check that values are updated after stepping
         self.continue_to_breakpoint(breakpoint_4)
diff --git a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
index d4fcf78d01b8..c29b51219d19 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
+++ b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
@@ -15,7 +15,7 @@
 # RUN: %clang_host -fmodules -Xclang -fmodules-cache-path=%t/cache -I%t -g -gmodules %t/b.m -o %t/b.o -c
 # RUN: %clang_host %t/a.o %t/b.o -o %t/a.out
 # RUN: rm -rf %t/cache
-# RUN: %lldb %t/a.out -o "b main" -o run -o "p a" -o "p b" -o q 2>&1 | FileCheck %s
+# RUN: %lldb %t/a.out -o "b main" -o run -o "expr a" -o "expr b" -o q 2>&1 | FileCheck %s
 # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist
 # CHECK-NOT: /cache/{{.*}}/C-{.*}.pcm' does not exist
 # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 3f8201fa426f..cb945b578e46 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -180,28 +180,29 @@ if ("flang" IN_LIST LLVM_ENABLE_PROJECTS)
 endif()
 
 if ("libc" IN_LIST LLVM_ENABLE_PROJECTS)
-  message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated.  Please use "
+  message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated now, and will "
+    "become a fatal error in a future release. Please use "
     "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at "
     "https://libc.llvm.org/ for building the runtimes.")
 endif()
 
 if ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=compiler-rt is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=compiler-rt or see the instructions at "
     "https://compiler-rt.llvm.org/ for building the runtimes.")
 endif()
 
 if ("offload" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=offload is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=offload or see the instructions at "
     "https://openmp.llvm.org/ for building the runtimes.")
 endif()
 
 if ("openmp" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=openmp is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=openmp or see the instructions at "
     "https://openmp.llvm.org/ for building the runtimes.")
 endif()
@@ -214,7 +215,7 @@ endif ()
 
 if ("libclc" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=libclc is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=libclc or see the instructions at "
     "https://libclc.llvm.org/ for building the runtimes.")
 endif()
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6191a915051f..8e97f25ba5e4 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -154,10 +154,23 @@ Changes to the MIPS Backend
 ---------------------------
 
 * `-mcpu=i6400` and `-mcpu=i6500` were added.
+* Added support for `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
 
 Changes to the PowerPC Backend
 ------------------------------
 
+* Add spill and restore for DMR and DMRp registers.
+* Prototype various Dense Math Facility instructions, and intrinsics for basic enablement, insert/extract, integer and FP calculations.
+* Add prototype for Dense Math Facility cryptography instructions.
+* Implement prototype load/store support for v1024i1 and v2048i1.
+* Support conversion between f16 and f128.
+* Change default for auto gen stxvp for cpu=future.
+* Set up initial JITLink build support for XCOFF.
+* Add an API to derive the default feature set from a CPU name within the TargetParser
+  (e.g. `pwr10` -> `+vsx`,`+isa3_1`,`+mma`). Clang now uses this to populate the `target-feature`
+  list when `-mcpu` is provided for PowerPC.
+* Various bug fixes and codegen improvements.
+
 Changes to the RISC-V Backend
 -----------------------------
 
@@ -219,6 +232,7 @@ Changes to the RISC-V Backend
 * Removed -mattr=+no-rvc-hints that could be used to disable parsing and generation of RVC hints.
 * Adds assembler support for the Andes `XAndesvsintload` (Andes Vector INT4 Load extension).
 * Adds assembler support for the Andes `XAndesbfhcvt` (Andes Scalar BFLOAT16 Conversion extension).
+* Add combine for shadd family of instructions.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -228,11 +242,14 @@ Changes to the Windows Target
 
 * `fp128` is now passed indirectly, meaning it uses the same calling convention
   as `i128`.
+* Added support for `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
 
 Changes to the X86 Backend
 --------------------------
 
 * `fp128` will now use `*f128` libcalls on 32-bit GNU targets as well.
+* On x86-32, `fp128` and `i128` are now passed with the expected 16-byte stack
+  alignment.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -311,10 +328,21 @@ Changes to LLDB
     stop reason = SIGSEGV: sent by tkill system call (sender pid=649752, uid=2667987)
   ```
 * ELF Cores can now have their siginfo structures inspected using `thread siginfo`.
+* LLDB now uses
+  [DIL](https://discourse.llvm.org/t/rfc-data-inspection-language/69893) as the
+  default implementation for 'frame variable'. This should not change the
+  behavior of 'frame variable' at all, at this time. To revert to using the
+  old implementation use: `settings set target.experimental.use-DIL false`.
 * Disassembly of unknown instructions now produces `<unknown>` instead of
   nothing at all
 * Changed the format of opcode bytes to match llvm-objdump when disassembling
   RISC-V code with `disassemble`'s `--byte` option.
+* LLDB added native support for the Model Context Protocol (MCP). An MCP
+  server can be started with the `protocol-server start MCP` command.
+* On AArch64 Linux, LLDB will now show the state of the `STORE_ONLY` field of
+  `mte_ctrl`. This will only be shown on hardware that has the
+  `FEAT_MTE_STORE_ONLY` architecture feature.
+
 
 ### Changes to lldb-dap
 
@@ -329,6 +357,18 @@ Changes to Sanitizers
 
 Other Changes
 -------------
+* A new ThinLTO backend has been added to implement the
+  [Integrated Distributed ThinLTO](https://llvm.org/docs/DTLTO.html) (DTLTO)
+  feature. This new backend delegates the ThinLTO backend compilation jobs to an
+  external process (the distributor), which in turn coordinates distribution
+  through a system such as Incredibuild. A JSON interface is used for
+  communication with the distributor.
+  ([#47468](https://github.com/llvm/llvm-project/issues/47468)).
+
+Changes to the Profile Runtime
+------------------------------
+
+* On AIX, avoid using mmap when reading profile files from a non-local filesystem.
 
 External Open Source Projects Using LLVM {{env.config.release}}
 ===============================================================
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index b985292ccee4..1dc73205a0eb 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -47,6 +47,8 @@ enum class RecurKind {
   FMul,     ///< Product of floats.
   FMin,     ///< FP min implemented in terms of select(cmp()).
   FMax,     ///< FP max implemented in terms of select(cmp()).
+  FMinNum,  ///< FP min with llvm.minnum semantics including NaNs.
+  FMaxNum,  ///< FP max with llvm.maxnum semantics including NaNs.
   FMinimum, ///< FP min with llvm.minimum semantics
   FMaximum, ///< FP max with llvm.maximum semantics
   FMinimumNum, ///< FP min with llvm.minimumnum semantics
@@ -250,6 +252,7 @@ class RecurrenceDescriptor {
   /// Returns true if the recurrence kind is a floating-point min/max kind.
   static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
     return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
+           Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum ||
            Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
            Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
   }
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e..9e3d9196cc18 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -323,10 +323,11 @@ class MachineBasicBlock
   const MachineFunction *getParent() const { return xParent; }
   MachineFunction *getParent() { return xParent; }
 
-  /// Returns true if the original IR terminator is an `indirectbr`. This
-  /// typically corresponds to a `goto` in C, rather than jump tables.
-  bool terminatorIsComputedGoto() const {
-    return back().isIndirectBranch() &&
+  /// Returns true if the original IR terminator is an `indirectbr` with
+  /// successor blocks. This typically corresponds to a `goto` in C, rather than
+  /// jump tables.
+  bool terminatorIsComputedGotoWithSuccessors() const {
+    return back().isIndirectBranch() && !succ_empty() &&
            llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
              return Succ->isIRBlockAddressTaken();
            });
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index cdc80c88b742..611bfe3f8ace 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -795,25 +795,9 @@ bool ConstructDecompositionT<C, H>::applyClause(
   // assigned to which leaf constructs.
 
   // [5.2:340:33]
-  auto canMakePrivateCopy = [](llvm::omp::Clause id) {
-    switch (id) {
-    // Clauses with "privatization" property:
-    case llvm::omp::Clause::OMPC_firstprivate:
-    case llvm::omp::Clause::OMPC_in_reduction:
-    case llvm::omp::Clause::OMPC_lastprivate:
-    case llvm::omp::Clause::OMPC_linear:
-    case llvm::omp::Clause::OMPC_private:
-    case llvm::omp::Clause::OMPC_reduction:
-    case llvm::omp::Clause::OMPC_task_reduction:
-      return true;
-    default:
-      return false;
-    }
-  };
-
   bool applied = applyIf(node, [&](const auto &leaf) {
     return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) {
-      return canMakePrivateCopy(n->id);
+      return llvm::omp::isPrivatizingClause(n->id);
     });
   });
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h
index 35dafc6d246f..d44c33301bde 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h
@@ -48,6 +48,22 @@ static constexpr inline bool canHaveIterator(Clause C) {
   }
 }
 
+// Returns true if clause C can create a private copy of a variable.
+static constexpr inline bool isPrivatizingClause(Clause C) {
+  switch (C) {
+  case OMPC_firstprivate:
+  case OMPC_in_reduction:
+  case OMPC_lastprivate:
+  case OMPC_linear:
+  case OMPC_private:
+  case OMPC_reduction:
+  case OMPC_task_reduction:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static constexpr unsigned FallbackVersion = 52;
 LLVM_ABI ArrayRef<unsigned> getOpenMPVersions();
 
diff --git a/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h b/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
index 8aac9d5b49db..448a6e913eb8 100644
--- a/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
+++ b/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
@@ -135,6 +135,12 @@ template <typename ContextT> class GenericFloatingPointPredicateUtils {
       if (Mode.Input != DenormalMode::IEEE)
         return {Invalid, fcAllFlags, fcAllFlags};
 
+      auto ExactClass = [IsFabs, Src](FPClassTest Mask) {
+        if (IsFabs)
+          Mask = llvm::inverse_fabs(Mask);
+        return exactClass(Src, Mask);
+      };
+
       switch (Pred) {
       case FCmpInst::FCMP_OEQ: // Match x == 0.0
         return exactClass(Src, fcZero);
@@ -151,26 +157,24 @@ template <typename ContextT> class GenericFloatingPointPredicateUtils {
       case FCmpInst::FCMP_UNO:
         return exactClass(Src, fcNan);
       case FCmpInst::FCMP_OGT: // x > 0
-        return exactClass(Src, fcPosSubnormal | fcPosNormal | fcPosInf);
+        return ExactClass(fcPosSubnormal | fcPosNormal | fcPosInf);
       case FCmpInst::FCMP_UGT: // isnan(x) || x > 0
-        return exactClass(Src, fcPosSubnormal | fcPosNormal | fcPosInf | fcNan);
+        return ExactClass(fcPosSubnormal | fcPosNormal | fcPosInf | fcNan);
       case FCmpInst::FCMP_OGE: // x >= 0
-        return exactClass(Src, fcPositive | fcNegZero);
+        return ExactClass(fcPositive | fcNegZero);
       case FCmpInst::FCMP_UGE: // isnan(x) || x >= 0
-        return exactClass(Src, fcPositive | fcNegZero | fcNan);
+        return ExactClass(fcPositive | fcNegZero | fcNan);
       case FCmpInst::FCMP_OLT: // x < 0
-        return exactClass(Src, fcNegSubnormal | fcNegNormal | fcNegInf);
+        return ExactClass(fcNegSubnormal | fcNegNormal | fcNegInf);
       case FCmpInst::FCMP_ULT: // isnan(x) || x < 0
-        return exactClass(Src, fcNegSubnormal | fcNegNormal | fcNegInf | fcNan);
+        return ExactClass(fcNegSubnormal | fcNegNormal | fcNegInf | fcNan);
       case FCmpInst::FCMP_OLE: // x <= 0
-        return exactClass(Src, fcNegative | fcPosZero);
+        return ExactClass(fcNegative | fcPosZero);
       case FCmpInst::FCMP_ULE: // isnan(x) || x <= 0
-        return exactClass(Src, fcNegative | fcPosZero | fcNan);
+        return ExactClass(fcNegative | fcPosZero | fcNan);
       default:
         llvm_unreachable("all compare types are handled");
       }
-
-      return {Invalid, fcAllFlags, fcAllFlags};
     }
 
     const bool IsDenormalRHS = (OrigClass & fcSubnormal) == OrigClass;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index a6254eafa490..9b081e9d6544 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -2120,7 +2120,7 @@ defvar X86CommonLibcalls =
 );
 
 defvar Windows32DivRemMulCalls =
-  LibcallImpls<(add WindowsDivRemMulLibcalls),
+  LibcallsWithCC<(add WindowsDivRemMulLibcalls), X86_STDCALL,
   RuntimeLibcallPredicate<"TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()">>;
 
 def X86_32SystemLibrary
diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
index 3a7ca1a69ab8..cae2fbcac1fe 100644
--- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -136,6 +136,18 @@ class LLVM_ABI MCDisassembler {
                                       ArrayRef<uint8_t> Bytes, uint64_t Address,
                                       raw_ostream &CStream) const = 0;
 
+  /// Returns the disassembly of an instruction bundle for VLIW architectures
+  /// like Hexagon.
+  ///
+  /// \param Instr    - An MCInst to populate with the contents of
+  /// the Bundle with sub-instructions encoded as Inst operands.
+  virtual DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+                                            ArrayRef<uint8_t> Bytes,
+                                            uint64_t Address,
+                                            raw_ostream &CStream) const {
+    return Fail;
+  }
+
   /// Used to perform separate target specific disassembly for a particular
   /// symbol. May parse any prelude that precedes instructions after the
   /// start of a symbol, or the entire symbol.
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 39f74beca082..8be5de3bf356 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
                   m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
             match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
     };
-    if (isIntMinMaxRecurrenceKind(Kind) ||
-        (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
+    if (isIntMinMaxRecurrenceKind(Kind))
       return isMinMaxPattern(I, Kind, Prev);
-    else if (isFMulAddIntrinsic(I))
+    if (isFPMinMaxRecurrenceKind(Kind)) {
+      InstDesc Res = isMinMaxPattern(I, Kind, Prev);
+      if (!Res.isRecurrence())
+        return InstDesc(false, I);
+      if (HasRequiredFMF())
+        return Res;
+      // We may be able to vectorize FMax/FMin reductions using maxnum/minnum
+      // intrinsics with extra checks ensuring the vector loop handles only
+      // non-NaN inputs.
+      if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
+        assert(Kind == RecurKind::FMax &&
+               "unexpected recurrence kind for maxnum");
+        return InstDesc(I, RecurKind::FMaxNum);
+      }
+      if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
+        assert(Kind == RecurKind::FMin &&
+               "unexpected recurrence kind for minnum");
+        return InstDesc(I, RecurKind::FMinNum);
+      }
+      return InstDesc(false, I);
+    }
+    if (isFMulAddIntrinsic(I))
       return InstDesc(Kind == RecurKind::FMulAdd, I,
                       I->hasAllowReassoc() ? nullptr : I);
     return InstDesc(false, I);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9bbb89e37865..3d1408256df8 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2096,6 +2096,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
   if (!L->isLoopInvariant(RemAmt))
     return false;
 
+  // Only works if the AddOffset is a loop invariant
+  if (AddOffset && !L->isLoopInvariant(AddOffset))
+    return false;
+
   // Is the PHI a loop increment?
   auto LoopIncrInfo = getIVIncrement(PN, LI);
   if (!LoopIncrInfo)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b38a4d1c55af..90005bd181f3 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -4279,8 +4279,8 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
             !TII->isGlobalMemoryObject(FromMI) &&
             !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
           SDep Pred = Dep;
-          Pred.setSUnit(Src);
-          Dst->addPred(Pred);
+          Pred.setSUnit(From);
+          To->addPred(Pred);
         }
       }
     }
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8de2c48581a1..9fa96e737296 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -587,12 +587,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
       break;
     case Intrinsic::exp:
     case Intrinsic::exp2:
+    case Intrinsic::log:
       Changed |= forEachCall(F, [&](CallInst *CI) {
         Type *Ty = CI->getArgOperand(0)->getType();
         if (!isa<ScalableVectorType>(Ty))
           return false;
         const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
         unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID());
+        assert(Op != ISD::DELETED_NODE && "unsupported intrinsic");
         if (!TL->isOperationExpand(Op, EVT::getEVT(Ty)))
           return false;
         return lowerUnaryVectorIntrinsicAsLoop(M, CI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 23812d795f5f..91fd2d843f44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16717,7 +16717,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
-  // conditions 1) one-use, and 2) does not produce poison then push
+  // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
+  // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
   // the freeze through to the operands that are not guaranteed non-poison.
   // NOTE: we will strip poison-generating flags, so ignore them here.
   if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16725,6 +16726,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
+  // TODO: we should always allow multiple operands, however this increases the
+  // likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
+  // below causing later nodes that share frozen operands to fold again and no
+  // longer being able to confirm other operands are not poison due to recursion
+  // depth limits on isGuaranteedNotToBeUndefOrPoison.
+  bool AllowMultipleMaybePoisonOperands =
+      N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
+      N0.getOpcode() == ISD::BUILD_VECTOR ||
+      N0.getOpcode() == ISD::BUILD_PAIR ||
+      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+      N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
+
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
   // instead pick undef values to keep those properties, while at the same time
@@ -16757,6 +16770,10 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       MaybePoisonOperandNumbers.push_back(OpNo);
     if (!HadMaybePoisonOperands)
       continue;
+    if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+      // Multiple maybe-poison ops when not allowed - bail out.
+      return SDValue();
+    }
   }
   // NOTE: the whole op may be not guaranteed to not be undef or poison because
   // it could create undef or poison due to it's poison-generating flags.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2cad36eff9c8..fe357106bda6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
 
   case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+  case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break;
   case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
   case ISD::STRICT_FMINNUM:
   case ISD::FMINNUM:    ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
@@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo,
+                                                      SDValue &Hi) {
+  // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves.
+  SDLoc dl(N);
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9b537248d4ab..4eaa79890e00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -677,6 +677,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
       SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {});
 
   // clang-format off
+  void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FACOS     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FASIN     (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165..8cbdadd97981 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,12 +604,23 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   bool HasComputedGoto = false;
   if (!TailBB.empty()) {
     HasIndirectbr = TailBB.back().isIndirectBranch();
-    HasComputedGoto = TailBB.terminatorIsComputedGoto();
+    HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
   }
 
   if (HasIndirectbr && PreRegAlloc)
     MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits for blocks with computed gotos when running after
+  // register allocation. NB. This basically unfactors computed gotos that were
+  // factored early on in the compilation process to speed up edge based data
+  // flow. If we do not unfactor them again, it can seriously pessimize code
+  // with many computed jumps in the source code, such as interpreters.
+  // Therefore we do not restrict the computed gotos.
+  bool DupComputedGotoLate =
+      HasComputedGoto && MF->getTarget().getTargetTriple().isOSDarwin();
+  if (DupComputedGotoLate && !PreRegAlloc)
+    MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +674,10 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  bool CheckSuccessorAndPredecessorSize =
+      DupComputedGotoLate ? PreRegAlloc : !HasComputedGoto;
+  if (CheckSuccessorAndPredecessorSize &&
+      TailBB.pred_size() > TailDupPredSize &&
       TailBB.succ_size() > TailDupSuccSize) {
     // If TailBB or any of its successors contains a phi, we may have to add a
     // large number of additional phis with additional incoming values.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6feeb19bb858..ca2a57e9b7b2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() {
                         ISD::SDIVFIX,        ISD::SDIVFIXSAT,
                         ISD::UDIVFIX,        ISD::UDIVFIXSAT,
                         ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
-                        ISD::IS_FPCLASS},
+                        ISD::IS_FPCLASS,     ISD::FCBRT,
+                        ISD::FLOG,           ISD::FLOG2,
+                        ISD::FLOG10,         ISD::FEXP,
+                        ISD::FEXP2,          ISD::FEXP10,
+                        ISD::FFLOOR,         ISD::FNEARBYINT,
+                        ISD::FCEIL,          ISD::FRINT,
+                        ISD::FTRUNC,         ISD::FROUNDEVEN,
+                        ISD::FTAN,           ISD::FACOS,
+                        ISD::FASIN,          ISD::FATAN,
+                        ISD::FCOSH,          ISD::FSINH,
+                        ISD::FTANH,          ISD::FATAN2},
                        VT, Expand);
 
     // Overflow operations default to expand
@@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() {
 
     // These operations default to expand for vector types.
     if (VT.isVector())
-      setOperationAction(
-          {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
-           ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
-           ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
-           ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
-           ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
-          VT, Expand);
+      setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+                          ISD::ANY_EXTEND_VECTOR_INREG,
+                          ISD::SIGN_EXTEND_VECTOR_INREG,
+                          ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR,
+                          ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+                         VT, Expand);
 
       // Constrained floating-point operations default to expand.
 #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN)               \
@@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() {
                      {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
                      Expand);
 
-  // These library functions default to expand.
-  setOperationAction({ISD::FCBRT,      ISD::FLOG,  ISD::FLOG2,  ISD::FLOG10,
-                      ISD::FEXP,       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
-                      ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT,  ISD::FTRUNC,
-                      ISD::FROUNDEVEN, ISD::FTAN,  ISD::FACOS,  ISD::FASIN,
-                      ISD::FATAN,      ISD::FCOSH, ISD::FSINH,  ISD::FTANH,
-                      ISD::FATAN2},
-                     {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
   // Insert custom handling default for llvm.canonicalize.*.
   setOperationAction(ISD::FCANONICALIZE,
                      {MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
@@ -1922,6 +1922,8 @@ int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const {
     return ISD::FEXP;
   case Intrinsic::exp2:
     return ISD::FEXP2;
+  case Intrinsic::log:
+    return ISD::FLOG;
   default:
     return ISD::DELETED_NODE;
   }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 8004077b9266..9afbefb8c08e 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -3185,12 +3185,6 @@ void Verifier::visitFunction(const Function &F) {
     CheckDI(SP->describes(&F),
             "!dbg attachment points at wrong subprogram for function", N, &F,
             &I, DL, Scope, SP);
-
-    if (DL->getAtomGroup())
-      CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(),
-              "DbgLoc uses atomGroup but DISubprogram doesn't have Key "
-              "Instructions enabled",
-              DL, DL->getScope()->getSubprogram());
   };
   for (auto &BB : F)
     for (auto &I : BB) {
@@ -5492,6 +5486,15 @@ void Verifier::visitInstruction(Instruction &I) {
   if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
     CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
     visitMDNode(*N, AreDebugLocsAllowed::Yes);
+
+    if (auto *DL = dyn_cast<DILocation>(N)) {
+      if (DL->getAtomGroup()) {
+        CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(),
+                "DbgLoc uses atomGroup but DISubprogram doesn't have Key "
+                "Instructions enabled",
+                DL, DL->getScope()->getSubprogram());
+      }
+    }
   }
 
   if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index a19659174409..a877eb7938eb 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -361,6 +361,9 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
         if (BBeforeRelax && AAfterRelax)
           return;
       }
+      const auto *RF = dyn_cast<MCRelaxableFragment>(F);
+      if (RF && RF->isLinkerRelaxable())
+        return;
       if (&*F == FA) {
         // If FA and FB belong to the same subsection, the loop will find FA and
         // we can resolve the difference.
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index c59fabe5df1b..28cddb939646 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -70,6 +70,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
       OS << "\n  Fixup @" << F.getOffset() << " Value:";
       F.getValue()->print(OS, nullptr);
       OS << " Kind:" << F.getKind();
+      if (F.isLinkerRelaxable())
+        OS << " LinkerRelaxable";
     }
   };
 
@@ -113,6 +115,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   }
   case MCFragment::FT_Relaxable:  {
     const auto *F = cast<MCRelaxableFragment>(this);
+    if (F->isLinkerRelaxable())
+      OS << " LinkerRelaxable";
     OS << " Size:" << F->getContents().size() << ' ';
     F->getInst().dump_pretty(OS);
     printFixups(F->getFixups());
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 44a82f75576b..d9c39bbedf37 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -408,6 +408,13 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
       Inst, IF->getContentsForAppending(), Fixups, STI);
   IF->doneAppending();
   IF->appendFixups(Fixups);
+
+  for (auto &Fixup : Fixups) {
+    if (Fixup.isLinkerRelaxable()) {
+      IF->setLinkerRelaxable();
+      getCurrentSectionOnly()->setLinkerRelaxable();
+    }
+  }
 }
 
 #ifndef NDEBUG
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index beb472b7c7de..a7330692571d 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -63,6 +63,8 @@ LLVM_DUMP_METHOD void MCSection::dump(
   raw_ostream &OS = errs();
 
   OS << "MCSection Name:" << getName();
+  if (isLinkerRelaxable())
+    OS << " LinkerRelaxable";
   for (auto &F : *this) {
     OS << '\n';
     F.dump();
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
index 62a71d41ded5..9b55f76e5840 100644
--- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
     // it is, find the target section unique id.
     const coff_aux_section_definition *SD = SymRef.getSectionDefinition();
     const coff_aux_weak_external *WE = SymRef.getWeakExternal();
-    if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) {
       int32_t Index = SD->getNumber(IsBigObj);
       if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size())
         return createStringError(object_error::parse_failed,
diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
index d00580fe3519..19918aa708b2 100644
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8],
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
   MAYBE_UNUSED(features);
-#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512)
   if (features & AVX512VL) {
     blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
     return;
diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h
index deed079e468a..dd71e729f208 100644
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t *out);
 
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__CYGWIN__)
 LLVM_LIBRARY_VISIBILITY
 void blake3_xof_many_avx512(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
diff --git a/llvm/lib/Support/BLAKE3/blake3_neon.c b/llvm/lib/Support/BLAKE3/blake3_neon.c
index 9629e1083686..ee36721f8757 100644
--- a/llvm/lib/Support/BLAKE3/blake3_neon.c
+++ b/llvm/lib/Support/BLAKE3/blake3_neon.c
@@ -245,10 +245,11 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
       counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
 }
 
-void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
-                       const uint32_t key[8], uint64_t counter,
-                       bool increment_counter, uint8_t flags,
-                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              bool increment_counter, uint8_t flags,
+                              uint8_t flags_start, uint8_t flags_end,
+                              uint8_t *out) {
   uint32x4_t h_vecs[8] = {
       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
diff --git a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
index d5be360815ad..d24657465dd8 100644
--- a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
+++ b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
@@ -10,7 +10,9 @@
 #define blake3_hasher llvm_blake3_hasher
 #define blake3_chunk_state llvm_blake3_chunk_state
 #define blake3_compress_in_place llvm_blake3_compress_in_place
+#define blake3_compress_subtree_wide llvm_blake3_compress_subtree_wide
 #define blake3_compress_xof llvm_blake3_compress_xof
+#define blake3_xof_many llvm_blake3_xof_many
 #define blake3_hash_many llvm_blake3_hash_many
 #define blake3_simd_degree llvm_blake3_simd_degree
 #define blake3_compress_in_place_portable llvm_blake3_compress_in_place_portable
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 601f11f6d23c..1c4645ad8364 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
     std::unique_ptr<MB> Result(
         new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>(
             RequiresNullTerminator, FD, MapSize, Offset, EC));
-    if (!EC)
-      return std::move(Result);
+    if (!EC) {
+      // On at least Linux, and possibly on other systems, mmap may return pages
+      // from the page cache that are not properly filled with trailing zeroes,
+      // if some prior user of the page wrote non-zero bytes. Detect this and
+      // don't use mmap in that case.
+      if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0')
+        return std::move(Result);
+    }
   }
 
 #ifdef __MVS__
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 36f3a670808d..7de66ccbf6f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -671,8 +671,8 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   }
 
   if (PRFX) {
-    finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
     transferImpOps(MI, PRFX, DOP);
+    finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
   } else
     transferImpOps(MI, DOP, DOP);
 
@@ -1591,18 +1591,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
          "Non-writeback variants of STGloop / STZGloop should not "
          "survive past PrologEpilogInserter.");
    case AArch64::STR_ZZZZXI:
+   case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
    case AArch64::STR_ZZZXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
    case AArch64::STR_ZZXI:
+   case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
    case AArch64::STR_PPXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
    case AArch64::LDR_ZZZZXI:
+   case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
    case AArch64::LDR_ZZZXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
    case AArch64::LDR_ZZXI:
+   case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
    case AArch64::LDR_PPXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa7..5420545cc3ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2482,8 +2482,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::LDR_PXI:
   case AArch64::LDR_ZXI:
   case AArch64::LDR_ZZXI:
+  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZZXI:
   case AArch64::LDR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDRBBui:
   case AArch64::LDRBui:
   case AArch64::LDRDui:
@@ -2525,8 +2527,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::STR_PXI:
   case AArch64::STR_ZXI:
   case AArch64::STR_ZZXI:
+  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::STR_ZZZXI:
   case AArch64::STR_ZZZZXI:
+  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::STRBBui:
   case AArch64::STRBui:
   case AArch64::STRDui:
@@ -4318,7 +4322,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     break;
   // SVE
   case AArch64::STR_ZZZZXI:
+  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
     Scale = TypeSize::getScalable(16);
     Width = TypeSize::getScalable(16 * 4);
     MinOffset = -256;
@@ -4332,7 +4338,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     MaxOffset = 253;
     break;
   case AArch64::STR_ZZXI:
+  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZXI:
+  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
     Scale = TypeSize::getScalable(16);
     Width = TypeSize::getScalable(16 * 2);
     MinOffset = -256;
@@ -5559,8 +5567,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Twov2d;
       Offset = false;
-    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register store without SVE store instructions");
+      Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZZXI;
@@ -5584,8 +5596,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Fourv2d;
       Offset = false;
-    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register store without SVE store instructions");
+      Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZZZZXI;
@@ -5736,8 +5752,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Twov2d;
       Offset = false;
-    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register load without SVE load instructions");
+      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZZXI;
@@ -5761,8 +5781,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Fourv2d;
       Offset = false;
-    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register load without SVE load instructions");
+      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZZZZXI;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eddb96979f7b..0c4b4f4c3ed8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in {
   // These get expanded to individual LDR_ZXI/STR_ZXI instructions in
   // AArch64ExpandPseudoInsts.
   let mayLoad = 1, hasSideEffects = 0 in {
-    def LDR_ZZXI   : Pseudo<(outs   ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZXI_STRIDED_CONTIGUOUS   : Pseudo<(outs   ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+    def LDR_ZZXI   : Pseudo<(outs   ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
     def LDR_ZZZXI  : Pseudo<(outs  ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def LDR_PPXI   : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_PPXI   : Pseudo<(outs   PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
   }
   let mayStore = 1, hasSideEffects = 0 in {
-    def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZXI_STRIDED_CONTIGUOUS   : Pseudo<(outs), (ins   ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+    def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
     def STR_ZZZXI  : Pseudo<(outs), (ins  ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def STR_PPXI   : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_PPXI   : Pseudo<(outs), (ins   PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
   }
 
   let AddedComplexity = 1 in {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 095682334679..2409cc862f21 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     break;
   case NeoverseV2:
   case NeoverseV3:
+    CacheLineSize = 64;
     EpilogueVectorizationMinVF = 8;
     MaxInterleaveFactor = 4;
     ScatterOverhead = 13;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 07baf29ce701..193be62b28ad 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3724,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
 
 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-    bool HasRealUse, const Instruction *I, Value *Scalar,
+    const Instruction *I, Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   assert(Val->isVectorTy() && "This must be a vector type");
 
@@ -3744,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     }
 
     // The element at index zero is already inside the vector.
-    // - For a physical (HasRealUse==true) insert-element or extract-element
+    // - For an insert-element or extract-element
     // instruction that extracts integers, an explicit FPR -> GPR move is
     // needed. So it has non-zero cost.
-    // - For the rest of cases (virtual instruction or element type is float),
-    // consider the instruction free.
-    if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
+    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
       return 0;
 
     // This is recognising a LD1 single-element structure to one lane of one
@@ -3899,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    unsigned Index,
                                                    const Value *Op0,
                                                    const Value *Op1) const {
-  bool HasRealUse =
-      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
+  // Treat an insert at lane 0 into a poison vector as having zero cost. This
+  // ensures vector broadcasts via an insert + shuffle (which will be lowered
+  // to a single dup) are treated as cheap.
+  if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
+      isa<PoisonValue>(Op0))
+    return 0;
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
-                                  Scalar, ScalarUserAndIdx);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
+                                  ScalarUserAndIdx);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
                                                    unsigned Index) const {
-  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
-                                  true /* HasRealUse */, &I);
+  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ff0ab68a16a8..b27eb2ef7a39 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
 
   // A helper function called by 'getVectorInstrCost'.
   //
-  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
-  // indicates whether the vector instruction is available in the input IR or
-  // just imaginary in vectorizer passes.
-  /// \param ScalarUserAndIdx encodes the information about extracts from a
+  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'.
+  /// \param ScalarUserAndIdx encodes the information about extracts from a
   /// vector with 'Scalar' being the value being extracted,'User' being the user
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCostHelper(
       unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-      bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
+      const Instruction *I = nullptr, Value *Scalar = nullptr,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
 
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3414fe758eff..7b93382d1281 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -392,8 +392,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // Library functions.  These default to Expand, but we have instructions
   // for them.
   setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
-                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
-                     MVT::f32, Legal);
+                      ISD::FROUNDEVEN, ISD::FTRUNC},
+                     {MVT::f16, MVT::f32}, Legal);
+  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
 
   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -413,9 +414,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
-  if (Subtarget->has16BitInsts())
+  if (Subtarget->has16BitInsts()) {
     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
-  else {
+    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+  } else {
     setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
     setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
   }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fb72bab03e75..9593038ff2c9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
       setOperationAction(ISD::FMINNUM, VT, Legal);
       setOperationAction(ISD::FMAXNUM, VT, Legal);
       setOperationAction(ISD::FROUND, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
@@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FLOG2, MVT::f16, Promote);
 
     setOperationAction(ISD::FROUND, MVT::f16, Legal);
+    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
+    setOperationAction(ISD::FRINT, MVT::f16, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
   }
 
   if (Subtarget->hasNEON()) {
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a252e7..25ad9eccbce5 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       default: {
         // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
         // us to  fold the constant into the cmp instruction.
-        RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+        RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
         CC = ISD::SETGE;
         break;
       }
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
     // fold the constant into the cmp instruction.
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
-      RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+      // Doing a "icmp ugt i16 65535, %0" comparison should have been converted
+      // already to something else. Assert to make sure this assumption holds.
+      assert((!C->isAllOnes()) && "integer overflow in comparison transform");
+      RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT);
       CC = ISD::SETUGE;
       break;
     }
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5bd31707acb6..bcddb540d35d 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -43,12 +43,12 @@ namespace {
 class HexagonDisassembler : public MCDisassembler {
 public:
   std::unique_ptr<MCInstrInfo const> const MCII;
-  std::unique_ptr<MCInst *> CurrentBundle;
+  mutable std::unique_ptr<MCInst> CurrentBundle;
   mutable MCInst const *CurrentExtender;
 
   HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                       MCInstrInfo const *MCII)
-      : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *),
+      : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr),
         CurrentExtender(nullptr) {}
 
   DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
@@ -57,7 +57,23 @@ class HexagonDisassembler : public MCDisassembler {
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &CStream) const override;
+
+  DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+                                    ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                    raw_ostream &CStream) const override;
+
   void remapInstruction(MCInst &Instr) const;
+
+private:
+  bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                  uint64_t &BytesToSkip, raw_ostream &CS) const;
+
+  void resetBundle() const {
+    CurrentBundle.reset();
+    CurrentInstruction = nullptr;
+  }
+
+  mutable MCOperand *CurrentInstruction = nullptr;
 };
 
 static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI,
@@ -171,43 +187,88 @@ LLVMInitializeHexagonDisassembler() {
                                          createHexagonDisassembler);
 }
 
-DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 ArrayRef<uint8_t> Bytes,
-                                                 uint64_t Address,
-                                                 raw_ostream &CS) const {
-  CommentStream = &CS;
-
-  DecodeStatus Result = DecodeStatus::Success;
+bool HexagonDisassembler::makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                     uint64_t &BytesToSkip,
+                                     raw_ostream &CS) const {
   bool Complete = false;
-  Size = 0;
+  DecodeStatus Result = DecodeStatus::Success;
 
-  *CurrentBundle = &MI;
-  MI.setOpcode(Hexagon::BUNDLE);
-  MI.addOperand(MCOperand::createImm(0));
+  CurrentBundle.reset(new MCInst);
+  CurrentBundle->setOpcode(Hexagon::BUNDLE);
+  CurrentBundle->addOperand(MCOperand::createImm(0));
   while (Result == Success && !Complete) {
     if (Bytes.size() < HEXAGON_INSTR_SIZE)
-      return MCDisassembler::Fail;
+      return false;
     MCInst *Inst = getContext().createMCInst();
-    Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete);
-    MI.addOperand(MCOperand::createInst(Inst));
-    Size += HEXAGON_INSTR_SIZE;
+    Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS,
+                                  Complete);
+    CurrentBundle->addOperand(MCOperand::createInst(Inst));
+    BytesToSkip += HEXAGON_INSTR_SIZE;
     Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
   }
   if (Result == MCDisassembler::Fail)
-    return Result;
-  if (Size > HEXAGON_MAX_PACKET_SIZE)
-    return MCDisassembler::Fail;
+    return false;
+  if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE)
+    return false;
 
   const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
   const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI;
-  HexagonMCChecker Checker(getContext(), *MCII, STI_, MI,
+  HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle,
                            *getContext().getRegisterInfo(), false);
   if (!Checker.check())
-    return MCDisassembler::Fail;
-  remapInstruction(MI);
+    return false;
+  remapInstruction(*CurrentBundle);
+  return true;
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  Size = 0;
+  uint64_t BytesToSkip = 0;
+
+  if (!CurrentBundle) {
+    if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+      Size = BytesToSkip;
+      resetBundle();
+      return MCDisassembler::Fail;
+    }
+    CurrentInstruction = (CurrentBundle->begin() + 1);
+  }
+
+  MI = *(CurrentInstruction->getInst());
+  Size = HEXAGON_INSTR_SIZE;
+  if (++CurrentInstruction == CurrentBundle->end())
+    resetBundle();
   return MCDisassembler::Success;
 }
 
+DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI,
+                                                       uint64_t &Size,
+                                                       ArrayRef<uint8_t> Bytes,
+                                                       uint64_t Address,
+                                                       raw_ostream &CS) const {
+  CommentStream = &CS;
+  Size = 0;
+  uint64_t BytesToSkip = 0;
+  assert(!CurrentBundle);
+
+  if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+    Size = BytesToSkip;
+    resetBundle();
+    return MCDisassembler::Fail;
+  }
+
+  MI = *CurrentBundle;
+  Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI);
+  resetBundle();
+
+  return Success;
+}
+
 void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
   for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) {
     auto &MI = const_cast<MCInst &>(*I.getInst());
@@ -465,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
     MI.insert(MI.begin() + 1,
               MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
     break;
+  case Hexagon::Y4_crswap10:
+    MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0));
+    break;
   default:
     break;
   }
@@ -482,7 +546,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
     unsigned Offset = 1;
     bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
     bool PrevVector = false;
-    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle);
     auto i = Instructions.end() - 1;
     for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
       if (i == n)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 53943de3bc59..e285e0454369 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
       R = N;
       break;
     }
+    case ISD::AssertSext: {
+      EVT T = cast<VTSDNode>(N.getOperand(1))->getVT();
+      if (T.getSizeInBits() == 32)
+        R = N.getOperand(0);
+      else
+        return false;
+      break;
+    }
+
     default:
       return false;
   }
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 9030e43b7149..f83e06cd3d93 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) {
 void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address,
                                    StringRef Annot, const MCSubtargetInfo &STI,
                                    raw_ostream &OS) {
-  assert(HexagonMCInstrInfo::isBundle(*MI));
-  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
-  assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
-  HasExtender = false;
-  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
-    MCInst const &MCI = *I.getInst();
-    if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
-      printInstruction(MCI.getOperand(1).getInst(), Address, OS);
-      OS << '\v';
-      HasExtender = false;
-      printInstruction(MCI.getOperand(0).getInst(), Address, OS);
-    } else
-      printInstruction(&MCI, Address, OS);
-    HasExtender = HexagonMCInstrInfo::isImmext(MCI);
-    OS << "\n";
-  }
-
-  bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
-  bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
-  if (IsLoop0) {
-    OS << (IsLoop1 ? " :endloop01" : " :endloop0");
-  } else if (IsLoop1) {
-    OS << " :endloop1";
+  if (HexagonMCInstrInfo::isDuplex(MII, *MI)) {
+    printInstruction(MI->getOperand(1).getInst(), Address, OS);
+    OS << '\v';
+    HasExtender = false;
+    printInstruction(MI->getOperand(0).getInst(), Address, OS);
+  } else {
+    printInstruction(MI, Address, OS);
   }
+  HasExtender = HexagonMCInstrInfo::isImmext(*MI);
+  if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_PACKET_END)
+    HasExtender = false;
 }
 
 void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 980df819b2c2..bfea50e2d6dc 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -252,8 +252,21 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     std::string Buffer;
     {
       raw_string_ostream TempStream(Buffer);
-      InstPrinter.printInst(&Inst, Address, "", STI, TempStream);
+      for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+        InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream);
+        TempStream << "\n";
+      }
+    }
+
+    std::string LoopString = "";
+    bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst);
+    bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst);
+    if (IsLoop0) {
+      LoopString += (IsLoop1 ? " :endloop01" : " :endloop0");
+    } else if (IsLoop1) {
+      LoopString += " :endloop1";
     }
+
     StringRef Contents(Buffer);
     auto PacketBundle = Contents.rsplit('\n');
     auto HeadTail = PacketBundle.first.split('\n');
@@ -275,9 +288,9 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     }
 
     if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
-      OS << "\n\t} :mem_noshuf" << PacketBundle.second;
+      OS << "\n\t} :mem_noshuf" << LoopString;
     else
-      OS << "\t}" << PacketBundle.second;
+      OS << "\t}" << LoopString;
   }
 
   void finish() override { finishAttributeSection(); }
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index ac5e7f3891c7..1493bf4cba69 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
   // estimateStackSize has been observed to under-estimate the final stack
   // size, so give ourselves wiggle-room by checking for stack size
   // representable an 11-bit signed field rather than 12-bits.
-  if (!isInt<11>(MFI.estimateStackSize(MF)))
+  // For [x]vstelm.{b/h/w/d} memory instructions with an 8-bit imm offset, a
+  // 7-bit signed field is checked instead.
+  unsigned EstimateStackSize = MFI.estimateStackSize(MF);
+  if (!isInt<11>(EstimateStackSize) ||
+      (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() &&
+       !isInt<7>(EstimateStackSize)))
     ScavSlotsNum = std::max(ScavSlotsNum, 1u);
 
   // For CFR spill.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index c47987fbf683..12cf04bbbab5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -4563,6 +4563,80 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
   llvm_unreachable("Unexpected node type for vXi1 sign extension");
 }
 
+static SDValue
+performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const LoongArchSubtarget &Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  if (Src.getOpcode() != ISD::SETCC || !Src.hasOneUse())
+    return SDValue();
+
+  bool UseLASX;
+  unsigned Opc = ISD::DELETED_NODE;
+  EVT CmpVT = Src.getOperand(0).getValueType();
+  EVT EltVT = CmpVT.getVectorElementType();
+
+  if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() == 128)
+    UseLASX = false;
+  else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
+           CmpVT.getSizeInBits() == 256)
+    UseLASX = true;
+  else
+    return SDValue();
+
+  SDValue SrcN1 = Src.getOperand(1);
+  switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
+  default:
+    break;
+  case ISD::SETEQ:
+    // x == 0 => not (vmsknez.b x)
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
+    break;
+  case ISD::SETGT:
+    // x > -1 => vmskgez.b x
+    if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+    break;
+  case ISD::SETGE:
+    // x >= 0 => vmskgez.b x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+    break;
+  case ISD::SETLT:
+    // x < 0 => vmskltz.{b,h,w,d} x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+         EltVT == MVT::i64))
+      Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+    break;
+  case ISD::SETLE:
+    // x <= -1 => vmskltz.{b,h,w,d} x
+    if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+         EltVT == MVT::i64))
+      Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+    break;
+  case ISD::SETNE:
+    // x != 0 => vmsknez.b x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
+    break;
+  }
+
+  if (Opc == ISD::DELETED_NODE)
+    return SDValue();
+
+  SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0));
+  EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+  V = DAG.getZExtOrTrunc(V, DL, T);
+  return DAG.getBitcast(VT, V);
+}
+
 static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const LoongArchSubtarget &Subtarget) {
@@ -4577,110 +4651,63 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
     return SDValue();
 
-  unsigned Opc = ISD::DELETED_NODE;
   // Combine SETCC and BITCAST into [X]VMSK{LT,GE,NE} when possible
+  SDValue Res = performSETCC_BITCASTCombine(N, DAG, DCI, Subtarget);
+  if (Res)
+    return Res;
+
+  // Generate vXi1 using [X]VMSKLTZ
+  MVT SExtVT;
+  unsigned Opc;
+  bool UseLASX = false;
+  bool PropagateSExt = false;
+
   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse()) {
-    bool UseLASX;
     EVT CmpVT = Src.getOperand(0).getValueType();
-    EVT EltVT = CmpVT.getVectorElementType();
-
-    if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() <= 128)
-      UseLASX = false;
-    else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
-             CmpVT.getSizeInBits() <= 256)
-      UseLASX = true;
-    else
+    if (CmpVT.getSizeInBits() > 256)
       return SDValue();
-
-    SDValue SrcN1 = Src.getOperand(1);
-    switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
-    default:
-      break;
-    case ISD::SETEQ:
-      // x == 0 => not (vmsknez.b x)
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
-      break;
-    case ISD::SETGT:
-      // x > -1 => vmskgez.b x
-      if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
-      break;
-    case ISD::SETGE:
-      // x >= 0 => vmskgez.b x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
-      break;
-    case ISD::SETLT:
-      // x < 0 => vmskltz.{b,h,w,d} x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
-          (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
-           EltVT == MVT::i64))
-        Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-      break;
-    case ISD::SETLE:
-      // x <= -1 => vmskltz.{b,h,w,d} x
-      if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
-          (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
-           EltVT == MVT::i64))
-        Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-      break;
-    case ISD::SETNE:
-      // x != 0 => vmsknez.b x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
-      break;
-    }
   }
 
-  // Generate vXi1 using [X]VMSKLTZ
-  if (Opc == ISD::DELETED_NODE) {
-    MVT SExtVT;
-    bool UseLASX = false;
-    bool PropagateSExt = false;
-    switch (SrcVT.getSimpleVT().SimpleTy) {
-    default:
-      return SDValue();
-    case MVT::v2i1:
-      SExtVT = MVT::v2i64;
-      break;
-    case MVT::v4i1:
-      SExtVT = MVT::v4i32;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v4i64;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v8i1:
-      SExtVT = MVT::v8i16;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v8i32;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v16i1:
-      SExtVT = MVT::v16i8;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v16i16;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v32i1:
-      SExtVT = MVT::v32i8;
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v2i1:
+    SExtVT = MVT::v2i64;
+    break;
+  case MVT::v4i1:
+    SExtVT = MVT::v4i32;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v4i64;
       UseLASX = true;
-      break;
-    };
-    if (UseLASX && !Subtarget.has32S() && !Subtarget.hasExtLASX())
-      return SDValue();
-    Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
-                        : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
-    Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-  } else {
-    Src = Src.getOperand(0);
-  }
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v8i1:
+    SExtVT = MVT::v8i16;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v8i32;
+      UseLASX = true;
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v16i1:
+    SExtVT = MVT::v16i8;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v16i16;
+      UseLASX = true;
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v32i1:
+    SExtVT = MVT::v32i8;
+    UseLASX = true;
+    break;
+  };
+  if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
+    return SDValue();
+  Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+                      : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+  Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
 
   SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
   EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
index 03ce004ed33a..7cefb3f8119b 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) {
   bool Is64Bit = TT.isArch64Bit();
   ABI TripleABI;
   switch (TT.getEnvironment()) {
+  case llvm::Triple::EnvironmentType::UnknownEnvironment:
+    TripleABI = ABI_Unknown;
+    break;
   case llvm::Triple::EnvironmentType::GNUSF:
   case llvm::Triple::EnvironmentType::MuslSF:
     TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S;
@@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
 
   // 1. If the '-target-abi' is valid, use it.
   if (IsABIValidForFeature(ArgProvidedABI)) {
-    if (TT.hasEnvironment() && ArgProvidedABI != TripleABI)
+    if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI)
       errs()
           << "warning: triple-implied ABI conflicts with provided target-abi '"
           << ABIName << "', using target-abi\n";
@@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
       return Is64Bit ? ABI_LP64F : ABI_ILP32F;
     return Is64Bit ? ABI_LP64S : ABI_ILP32S;
   };
-  if (ABIName.empty())
-    errs() << "warning: the triple-implied ABI is invalid, ignoring and using "
-              "feature-implied ABI\n";
-  else
+  if (!ABIName.empty())
     errs() << "warning: both target-abi and the triple-implied ABI are "
               "invalid, ignoring and using feature-implied ABI\n";
   return checkABIStandardized(GetFeatureABI());
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 87e06a6d3c08..2903ff75475c 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -747,14 +747,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
     if (FS.empty() && M.size() && F->hasFnAttribute("target-features"))
       FS = F->getFnAttribute("target-features").getValueAsString();
 
+    std::string strFS = FS.str();
+    if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool())
+      strFS += strFS.empty() ? "+soft-float" : ",+soft-float";
+
     // Compute MIPS architecture attributes based on the default subtarget
     // that we'd have constructed.
     // FIXME: For ifunc related functions we could iterate over and look
     // for a feature string that doesn't match the default one.
     StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
     const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
-    const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM,
-                            std::nullopt);
+    const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(),
+                            MTM, std::nullopt);
 
     bool IsABICalls = STI.isABICalls();
     const MipsABIInfo &ABI = MTM.getABI();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0e581a7a1650..ec6b38215166 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
-  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
-  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
-
   setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
                        ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
                        ISD::SIGN_EXTEND});
@@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::FP_TO_SINT:         return lowerFP_TO_SINT(Op, DAG);
   case ISD::READCYCLECOUNTER:
     return lowerREADCYCLECOUNTER(Op, DAG);
-  case ISD::ConstantFP:
-    return lowerConstantFP(Op, DAG);
   }
   return SDValue();
 }
@@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
 }
 
-SDValue MipsTargetLowering::lowerConstantFP(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getSimpleValueType();
-  SDNode *N = Op.getNode();
-  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N);
-
-  if (!CFP->isNaN() || Subtarget.isNaN2008()) {
-    return SDValue();
-  }
-
-  APFloat NaNValue = CFP->getValueAPF();
-  auto &Sem = NaNValue.getSemantics();
-
-  // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN
-  // encodings, and one for sNaNs. Check every NaN constants and make sure
-  // they are correctly encoded for legacy encodings.
-  if (!NaNValue.isSignaling()) {
-    APFloat RealQNaN = NaNValue.getSNaN(Sem);
-    return DAG.getConstantFP(RealQNaN, DL, VT);
-  }
-  return SDValue();
-}
-
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 31ac5d4c185b..c65c76ccffc7 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -592,7 +592,6 @@ class TargetRegisterClass;
     SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
 
     /// isEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 459525ed4ee9..67f59ed507f3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7296,9 +7296,17 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
       if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
           ValVT.isInteger() &&
           ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
-        SDValue ArgValueTrunc = DAG.getNode(
-            ISD::TRUNCATE, dl, ArgVT.getSimpleVT() == MVT::i1 ? MVT::i8 : ArgVT,
-            ArgValue);
+        // It is possible to have either real integer values
+        // or integers that were not originally integers.
+        // In the latter case, these could have come from structs,
+        // and these integers would not have an extend on the parameter.
+        // Since these types of integers do not have an extend specified
+        // in the first place, the type of extend that we do should not matter.
+        EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
+                                 ? MVT::i8
+                                 : ArgVT;
+        SDValue ArgValueTrunc =
+            DAG.getNode(ISD::TRUNCATE, dl, TruncatedArgVT, ArgValue);
         SDValue ArgValueExt =
             ArgSignExt ? DAG.getSExtOrTrunc(ArgValueTrunc, dl, ValVT)
                        : DAG.getZExtOrTrunc(ArgValueTrunc, dl, ValVT);
@@ -9586,12 +9594,14 @@ static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
   return false;
 }
 
-bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
+bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
+                     bool IsLittleEndian) {
   assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");
 
   BitMask.clearAllBits();
   EVT VT = BVN.getValueType(0);
-  APInt ConstValue(VT.getSizeInBits(), 0);
+  unsigned VTSize = VT.getSizeInBits();
+  APInt ConstValue(VTSize, 0);
 
   unsigned EltWidth = VT.getScalarSizeInBits();
 
@@ -9601,8 +9611,10 @@ bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
 
     if (!CN)
       return false;
-
-    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+    // The elements in a vector register are ordered in reverse byte order
+    // between little-endian and big-endian modes.
+    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth),
+                          IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
     BitPos += EltWidth;
   }
 
@@ -9633,7 +9645,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     // we do not convert it to MTVSRBMI.
     // The xxleqv instruction sets a vector with all ones.
     // The xxlxor instruction sets a vector with all zeros.
-    if (isValidMtVsrBmi(BitMask, *BVN) && BitMask != 0 && BitMask != 0xffff) {
+    if (isValidMtVsrBmi(BitMask, *BVN, Subtarget.isLittleEndian()) &&
+        BitMask != 0 && BitMask != 0xffff) {
       SDValue SDConstant = DAG.getTargetConstant(BitMask, dl, MVT::i32);
       MachineSDNode *MSDNode =
           DAG.getMachineNode(PPC::MTVSRBMI, dl, MVT::v16i8, SDConstant);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 99ef89a7fdc0..dbbf06f47153 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3252,7 +3252,8 @@ def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
 
 // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
 // This uses two output registers, the first as the real output, the second as a
-// temporary register, used internally in code generation.
+// temporary register, used internally in code generation. A "bl" also clobbers LR.
+let Defs = [LR] in
 def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
                 []>, NoEncode<"$rT">;
 
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 76dca4794e05..f1230407b164 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
     SpillsKnownBit = true;
     break;
   default:
+    // When spilling a CR bit, the super register may not be explicitly defined
+    // (i.e. it can be defined by a CR-logical that only defines the subreg) so
+    // we state that the CR field is undef. Also, in order to preserve the kill
+    // flag on the CR bit, we add it as an implicit use.
+
     // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all
     // bits (specifically, it produces a -1 if the CR bit is set). Ultimately,
     // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit
     // register), and SETNBC will set this.
     if (Subtarget.isISA3_1()) {
       BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg)
-          .addReg(SrcReg, RegState::Undef);
+          .addReg(SrcReg, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit |
+                              getKillRegState(MI.getOperand(0).isKill()));
       break;
     }
 
@@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
           SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT ||
           SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) {
         BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg)
-          .addReg(getCRFromCRBit(SrcReg), RegState::Undef);
+            .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
+            .addReg(SrcReg, RegState::Implicit |
+                                getKillRegState(MI.getOperand(0).isKill()));
         break;
       }
     }
 
     // We need to move the CR field that contains the CR bit we are spilling.
-    // The super register may not be explicitly defined (i.e. it can be defined
-    // by a CR-logical that only defines the subreg) so we state that the CR
-    // field is undef. Also, in order to preserve the kill flag on the CR bit,
-    // we add it as an implicit use.
     BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
       .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
       .addReg(SrcReg,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index e42d6c539a34..df3028e05139 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -792,6 +792,23 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
   Asm->getWriter().recordRelocation(F, VendorFixup, VendorTarget, VendorValue);
 }
 
+static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) {
+  // Some fixups are marked as LinkerRelaxable by
+  // `RISCVMCCodeEmitter::getImmOpValue` only because they may be
+  // (assembly-)relaxed into a linker-relaxable instruction. Return `false`
+  // for those fixups so that `addReloc` neither forces a relocation for them
+  // nor appends an `R_RISCV_RELAX` relocation.
+  switch (Kind) {
+  default:
+    break;
+  case RISCV::fixup_riscv_rvc_jump:
+  case RISCV::fixup_riscv_rvc_branch:
+  case RISCV::fixup_riscv_jal:
+    return false;
+  }
+  return true;
+}
+
 bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
                                const MCValue &Target, uint64_t &FixedValue,
                                bool IsResolved) {
@@ -834,25 +851,32 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
     return false;
   }
 
-  // If linker relaxation is enabled and supported by the current relocation,
-  // generate a relocation and then append a RELAX.
-  if (Fixup.isLinkerRelaxable())
+  // If linker relaxation is enabled and supported by the current fixup, then we
+  // always want to generate a relocation.
+  bool NeedsRelax = Fixup.isLinkerRelaxable() &&
+                    relaxableFixupNeedsRelocation(Fixup.getKind());
+  if (NeedsRelax)
     IsResolved = false;
+
   if (IsResolved && Fixup.isPCRel())
     IsResolved = isPCRelFixupResolved(Target.getAddSym(), F);
 
   if (!IsResolved) {
-    // Some Fixups require a vendor relocation, record it (directly) before we
+    // Some Fixups require a VENDOR relocation, record it (directly) before we
     // add the relocation.
     maybeAddVendorReloc(F, Fixup);
 
     Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue);
-  }
 
-  if (Fixup.isLinkerRelaxable()) {
-    auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
-    Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr),
-                                      FixedValueA);
+    if (NeedsRelax) {
+      // Some Fixups get a RELAX relocation, record it (directly) after we add
+      // the relocation.
+      MCFixup RelaxFixup =
+          MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
+      MCValue RelaxTarget = MCValue::get(nullptr);
+      uint64_t RelaxValue;
+      Asm->getWriter().recordRelocation(F, RelaxFixup, RelaxTarget, RelaxValue);
+    }
   }
 
   return false;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index cbeabdddb937..717fba68b48e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -576,8 +576,21 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
          "getImmOpValue expects only expressions or immediates");
   const MCExpr *Expr = MO.getExpr();
   MCExpr::ExprKind Kind = Expr->getKind();
-  unsigned FixupKind = RISCV::fixup_riscv_invalid;
+
+  // `RelaxCandidate` must be set to `true` in two cases:
+  // - The fixup's relocation gets a R_RISCV_RELAX relocation
+  // - The underlying instruction may be relaxed to an instruction that gets a
+  //   `R_RISCV_RELAX` relocation.
+  //
+  // The actual emission of `R_RISCV_RELAX` will be handled in
+  // `RISCVAsmBackend::applyFixup`.
   bool RelaxCandidate = false;
+  auto AsmRelaxToLinkerRelaxableWithFeature = [&](unsigned Feature) -> void {
+    if (!STI.hasFeature(RISCV::FeatureExactAssembly) && STI.hasFeature(Feature))
+      RelaxCandidate = true;
+  };
+
+  unsigned FixupKind = RISCV::fixup_riscv_invalid;
   if (Kind == MCExpr::Specifier) {
     const auto *RVExpr = cast<MCSpecifierExpr>(Expr);
     FixupKind = RVExpr->getSpecifier();
@@ -644,18 +657,26 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
     // FIXME: Sub kind binary exprs have chance of underflow.
     if (MIFrm == RISCVII::InstFormatJ) {
       FixupKind = RISCV::fixup_riscv_jal;
+      AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
     } else if (MIFrm == RISCVII::InstFormatB) {
       FixupKind = RISCV::fixup_riscv_branch;
+      // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+      // the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatCJ) {
       FixupKind = RISCV::fixup_riscv_rvc_jump;
+      AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
     } else if (MIFrm == RISCVII::InstFormatCB) {
       FixupKind = RISCV::fixup_riscv_rvc_branch;
+      // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+      // the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatCI) {
       FixupKind = RISCV::fixup_riscv_rvc_imm;
     } else if (MIFrm == RISCVII::InstFormatI) {
       FixupKind = RISCV::fixup_riscv_12_i;
     } else if (MIFrm == RISCVII::InstFormatQC_EB) {
       FixupKind = RISCV::fixup_riscv_qc_e_branch;
+      // This might be assembler relaxed to `qc.e.b<cc>; jal` but we cannot
+      // relax the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatQC_EAI) {
       FixupKind = RISCV::fixup_riscv_qc_e_32;
       RelaxCandidate = true;
@@ -670,9 +691,9 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
   assert(FixupKind != RISCV::fixup_riscv_invalid && "Unhandled expression!");
 
   addFixup(Fixups, 0, Expr, FixupKind);
-  // If linker relaxation is enabled and supported by this relocation, set
-  // a bit so that if fixup is unresolved, a R_RISCV_RELAX relocation will be
-  // appended.
+  // If linker relaxation is enabled and supported by this relocation, set a bit
+  // so that the assembler knows the size of the instruction is not fixed/known,
+  // and the relocation will need a R_RISCV_RELAX relocation.
   if (EnableRelax && RelaxCandidate)
     Fixups.back().setLinkerRelaxable();
   ++MCNumFixups;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index a796c910bd44..6c8e3da80b93 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -738,7 +738,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
                                        MachineFunction &MF, uint64_t Offset,
                                        uint64_t RealStackSize, bool EmitCFI,
                                        bool NeedProbe, uint64_t ProbeSize,
-                                       bool DynAllocation) const {
+                                       bool DynAllocation,
+                                       MachineInstr::MIFlag Flag) const {
   DebugLoc DL;
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   const RISCVInstrInfo *TII = STI.getInstrInfo();
@@ -748,7 +749,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   // Simply allocate the stack if it's not big enough to require a probe.
   if (!NeedProbe || Offset <= ProbeSize) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset),
-                  MachineInstr::FrameSetup, getStackAlign());
+                  Flag, getStackAlign());
 
     if (EmitCFI)
       CFIBuilder.buildDefCFAOffset(RealStackSize);
@@ -759,7 +760,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
     }
 
     return;
@@ -770,14 +771,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
     uint64_t CurrentOffset = 0;
     while (CurrentOffset + ProbeSize <= Offset) {
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                    StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
-                    getStackAlign());
+                    StackOffset::getFixed(-ProbeSize), Flag, getStackAlign());
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
 
       CurrentOffset += ProbeSize;
       if (EmitCFI)
@@ -787,8 +787,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
     uint64_t Residual = Offset - CurrentOffset;
     if (Residual) {
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                    StackOffset::getFixed(-Residual), MachineInstr::FrameSetup,
-                    getStackAlign());
+                    StackOffset::getFixed(-Residual), Flag, getStackAlign());
       if (EmitCFI)
         CFIBuilder.buildDefCFAOffset(Offset);
 
@@ -798,7 +797,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
             .addReg(RISCV::X0)
             .addReg(SPReg)
             .addImm(0)
-            .setMIFlags(MachineInstr::FrameSetup);
+            .setMIFlags(Flag);
       }
     }
 
@@ -812,8 +811,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   Register TargetReg = RISCV::X6;
   // SUB TargetReg, SP, RoundedSize
   RI->adjustReg(MBB, MBBI, DL, TargetReg, SPReg,
-                StackOffset::getFixed(-RoundedSize), MachineInstr::FrameSetup,
-                getStackAlign());
+                StackOffset::getFixed(-RoundedSize), Flag, getStackAlign());
 
   if (EmitCFI) {
     // Set the CFA register to TargetReg.
@@ -830,14 +828,14 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
 
   if (Residual) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
-                  MachineInstr::FrameSetup, getStackAlign());
+                  Flag, getStackAlign());
     if (DynAllocation) {
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
     }
   }
 
@@ -1034,7 +1032,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
       MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
   if (StackSize != 0)
     allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
-                  NeedProbe, ProbeSize, DynAllocation);
+                  NeedProbe, ProbeSize, DynAllocation,
+                  MachineInstr::FrameSetup);
 
   // Save SiFive CLIC CSRs into Stack
   emitSiFiveCLICPreemptibleSaves(MF, MBB, MBBI, DL);
@@ -1082,7 +1081,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
 
     allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
                   getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
-                  ProbeSize, DynAllocation);
+                  ProbeSize, DynAllocation, MachineInstr::FrameSetup);
   }
 
   if (RVVStackSize) {
@@ -1814,7 +1813,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
         bool DynAllocation =
             MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
         allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF),
-                      /*NeedProbe=*/true, ProbeSize, DynAllocation);
+                      /*NeedProbe=*/true, ProbeSize, DynAllocation,
+                      MachineInstr::NoFlags);
       } else {
         const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
         RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index d013755ce58a..6af63a4885f3 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -81,7 +81,8 @@ class RISCVFrameLowering : public TargetFrameLowering {
   void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineFunction &MF, uint64_t Offset,
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
-                     uint64_t ProbeSize, bool DynAllocation) const;
+                     uint64_t ProbeSize, bool DynAllocation,
+                     MachineInstr::MIFlag Flag) const;
 
 protected:
   const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 666c76b21e63..186191abe12a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
   if (!isShiftedMask_32(C1) || isInt<12>(C1))
     return false;
 
+  // INSBI will clobber the input register in N0. Bail out if we need a copy to
+  // preserve this value.
+  SDValue N0 = Node->getOperand(0);
+  if (!N0.hasOneUse())
+    return false;
+
   // If C1 is a shifted mask (but can't be formed as an ORI),
   // use a bitfield insert of -1.
   // Transform (or x, C1)
-  //        -> (qc.insbi x, width, shift)
+  //        -> (qc.insbi x, -1, width, shift)
   const unsigned Leading = llvm::countl_zero((uint32_t)C1);
   const unsigned Trailing = llvm::countr_zero((uint32_t)C1);
   const unsigned Width = 32 - Leading - Trailing;
@@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
   SDLoc DL(Node);
   MVT VT = Node->getSimpleValueType(0);
 
-  SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT),
+  SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT),
                    CurDAG->getTargetConstant(Width, DL, VT),
                    CurDAG->getTargetConstant(Trailing, DL, VT)};
   SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops);
@@ -2936,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
 /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9.
 bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
                                           SDValue &Offset) {
-  if (SelectAddrFrameIndex(Addr, Base, Offset))
-    return true;
+  // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+  // a 9-bit immediate can be folded.
 
   SDLoc DL(Addr);
   MVT VT = Addr.getSimpleValueType();
@@ -2947,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
     if (isUInt<9>(CVal)) {
       Base = Addr.getOperand(0);
 
-      if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
-        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
+      // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+      // a 9-bit immediate can be folded.
       Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
       return true;
     }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9cbc364afc21..5fb16f5ac6b9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16013,7 +16013,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   uint64_t MulAmt = CNode->getZExtValue();
 
   // Don't do this if the Xqciac extension is enabled and the MulAmt in simm12.
-  if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt))
+  if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
     return SDValue();
 
   const bool HasShlAdd = Subtarget.hasStdExtZba() ||
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 26bb1e8d1785..f39130090def 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in {
 let Predicates = [HasVendorXqcibm, IsRV32] in {
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
   def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">;
-  def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
-                             (ins simm5:$imm5, uimm5_plus1:$width,
+  def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+                             (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width,
                              uimm5:$shamt), "qc.insbi",
                              "$rd, $imm5, $width, $shamt"> {
+    let Constraints = "$rd = $rd_wb";
     bits<5> imm5;
     bits<5> shamt;
     bits<5> width;
@@ -1376,9 +1377,9 @@ let Predicates = [HasVendorXqciac, IsRV32] in {
 def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))),
           (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>;
 def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
 def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
 } // Predicates = [HasVendorXqciac, IsRV32]
 
 /// Simple arithmetic operations
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 4147c97a7a23..92bc3ee8bdac 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -1130,13 +1130,13 @@ let Predicates = [HasStdExtZvkned] in {
 
 let Predicates = [HasStdExtZvknha] in {
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>;
-  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>;
+  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>;
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>;
 } // Predicates = [HasStdExtZvknha]
 
 let Predicates = [HasStdExtZvknhb] in {
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>;
-  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>;
+  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32I64IntegerVectors>;
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>;
 } // Predicates = [HasStdExtZvknhb]
 
diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
index 7a2541a652b5..0d37db0138e4 100644
--- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
@@ -137,6 +137,11 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
     NextI = next_nodbg(NextI, E);
   DebugLoc DL = I->getDebugLoc();
 
+  // Make a copy so we can update the kill flag in the MoveFromAToS case. The
+  // copied operand needs to be scoped outside the if since we make a pointer
+  // to it.
+  MachineOperand PairedSource = *PairedRegs.Source;
+
   // The order of S-reg depends on which instruction holds A0, instead of
   // the order of register pair.
   // e,g.
@@ -147,8 +152,15 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
   //   mv a1, s1    =>  cm.mva01s s2,s1
   bool StartWithX10 = ARegInFirstPair == RISCV::X10;
   if (isMoveFromAToS(Opcode)) {
-    Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source;
-    Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source;
+    // We are moving one of the copies earlier so its kill flag may become
+    // invalid. Clear the copied kill flag if there are any reads of the
+    // register between the new location and the old location.
+    for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It)
+      if (It->readsRegister(PairedSource.getReg(), TRI))
+        PairedSource.setIsKill(false);
+
+    Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource;
+    Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source;
   } else {
     Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination;
     Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination;
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 84ef53985484..c1cc19b503de 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
   if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
     return false;
 
+  // Masked off lanes past TrueVL will come from False, and converting to vmv
+  // will lose these lanes unless MIVL <= TrueVL.
+  // TODO: We could relax this for False == Passthru and True policy == TU
+  const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  const MachineOperand &TrueVL =
+      True->getOperand(RISCVII::getVLOpNum(True->getDesc()));
+  if (!RISCV::isVLKnownLE(MIVL, TrueVL))
+    return false;
+
   // True's passthru needs to be equivalent to False
   Register TruePassthruReg = True->getOperand(1).getReg();
   Register FalseReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 9b434d87c267..f81fdf0ccc82 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1799,12 +1799,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
   setOperationAction(ISD::FREM , MVT::f64, Expand);
-  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f64,
+                     Subtarget->isUA2007() ? Legal : Expand);
   setOperationAction(ISD::FSIN , MVT::f32, Expand);
   setOperationAction(ISD::FCOS , MVT::f32, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   setOperationAction(ISD::FREM , MVT::f32, Expand);
-  setOperationAction(ISD::FMA, MVT::f32, Expand);
+  setOperationAction(ISD::FMA, MVT::f32,
+                     Subtarget->isUA2007() ? Legal : Expand);
   setOperationAction(ISD::ROTL , MVT::i32, Expand);
   setOperationAction(ISD::ROTR , MVT::i32, Expand);
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -2201,7 +2203,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
     SDValue Chain = DAG.getEntryNode();
     SDValue InGlue;
 
-    Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
+    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
     Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue);
     InGlue = Chain.getValue(1);
     SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2219,7 +2221,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                      InGlue};
     Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
     InGlue = Chain.getValue(1);
-    Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL);
+    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
     InGlue = Chain.getValue(1);
     SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue);
 
@@ -3550,6 +3552,11 @@ bool SparcTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return isCheapToSpeculateCtlz(Ty);
 }
 
+bool SparcTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                                     EVT VT) const {
+  return Subtarget->isUA2007() && !Subtarget->useSoftFloat();
+}
+
 // Override to disable global variable loading on Linux.
 void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
   if (!Subtarget->isTargetLinux())
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index 0d220f8c3d32..4017beb88ff3 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -177,6 +177,11 @@ namespace llvm {
 
     bool isCheapToSpeculateCttz(Type *Ty) const override;
 
+    bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+
+    bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                    EVT VT) const override;
+
     bool shouldInsertFencesForAtomic(const Instruction *I) const override {
       // FIXME: We insert fences for each atomics and generate
       // sub-optimal code for PSO/TSO. (Approximately nobody uses any
diff --git a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
index 3a30e552e6db..ffd4423137e3 100644
--- a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
+++ b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
@@ -66,3 +66,15 @@ defm CXBCOND : F2_56<"cxb", 1>;
 def FPMADDX   : FourOp<"fpmaddx", 0b110111, 0b0000, DFPRegs>;
 def FPMADDXHI : FourOp<"fpmaddxhi", 0b110111, 0b0100, DFPRegs>;
 } // Predicates = [HasOSA2011]
+
+// UA2007 instruction patterns.
+let Predicates = [HasUA2007] in {
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, f32:$add)), (FMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, f64:$add)), (FMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub))), (FMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub))), (FMSUBD $rs1, $rs2, $sub)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, f32:$add))), (FNMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, f64:$add))), (FNMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub)))), (FNMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub)))), (FNMSUBD $rs1, $rs2, $sub)>;
+} // Predicates = [HasUA2007]
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f32c9bd2bdea..2611c291abaa 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess(
              C2.ScaleCost, C2.SetupCost);
 }
 
-bool SystemZTTIImpl::areInlineCompatible(const Function *Caller,
-                                         const Function *Callee) const {
-  const TargetMachine &TM = getTLI()->getTargetMachine();
-
-  const FeatureBitset &CallerBits =
-      TM.getSubtargetImpl(*Caller)->getFeatureBits();
-  const FeatureBitset &CalleeBits =
-      TM.getSubtargetImpl(*Callee)->getFeatureBits();
-
-  // Support only equal feature bitsets. Restriction should be relaxed in the
-  // future to allow inlining when callee's bits are subset of the caller's.
-  return CallerBits == CalleeBits;
-}
-
 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
   bool Vector = (ClassID == 1);
   if (!Vector)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dc5736e8af00..fc681dec1859 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -65,9 +65,6 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                      const TargetTransformInfo::LSRCost &C2) const override;
 
-  bool areInlineCompatible(const Function *Caller,
-                           const Function *Callee) const override;
-
   /// @}
 
   /// \name Vector TTI Implementations
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 2662241ef849..e6486e247209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -256,9 +256,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
 
   // Precompute the set of registers that are unused, so that we can insert
   // drops to their defs.
+  // Also unstackify any stackified registers that don't have any uses, so that
+  // they can be dropped later. This can happen when transformations after
+  // RegStackify remove instructions using stackified registers.
   BitVector UseEmpty(MRI.getNumVirtRegs());
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I)
-    UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I));
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (MRI.use_empty(Reg)) {
+      UseEmpty[I] = true;
+      MFI.unstackifyVReg(Reg);
+    }
+  }
 
   // Visit each instruction in the function.
   for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1cfe383..8213e512f45e 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ class X86AsmParser : public MCTargetAsmParser {
       }
       PrevState = CurrState;
     }
-    void onRParen() {
-      PrevState = State;
+    bool onRParen(StringRef &ErrMsg) {
+      IntelExprState CurrState = State;
       switch (State) {
       default:
         State = IES_ERROR;
@@ -1054,9 +1054,27 @@ class X86AsmParser : public MCTargetAsmParser {
       case IES_RBRAC:
       case IES_RPAREN:
         State = IES_RPAREN;
+        // In the case of a multiply, onRegister has already set IndexReg
+        // directly, with appropriate scale.
+        // Otherwise if we just saw a register it has only been stored in
+        // TmpReg, so we need to store it into the state machine.
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // no explicit scale.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            if (IndexReg)
+              return regsUseUpError(ErrMsg);
+            IndexReg = TmpReg;
+            Scale = 0;
+          }
+        }
         IC.pushOperator(IC_RPAREN);
         break;
       }
+      PrevState = CurrState;
+      return false;
     }
     bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
                   const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       }
       break;
     case AsmToken::LParen:  SM.onLParen(); break;
-    case AsmToken::RParen:  SM.onRParen(); break;
+    case AsmToken::RParen:
+      if (SM.onRParen(ErrMsg)) {
+        return Error(Tok.getLoc(), ErrMsg);
+      }
+      break;
     }
     if (SM.hadError())
       return Error(Tok.getLoc(), "unknown token in expression");
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f7a81f..5d5a70589324 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return true;
 }
 
+/// Special handling for i128 and fp128: on x86-32 both are legalized as four
+/// i32s, but fp128 must be passed on the stack with 16-byte alignment. Parts
+/// are queued as pending until the last one, then all four share one aligned
+/// 16-byte stack slot. Only fp128 has a specified ABI; i128 follows for now.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                 CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i32 && "Should have i32 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+  int64_t Offset = State.AllocateStack(16, Align(16));
+  PendingMembers[0].convertToMem(Offset);
+  PendingMembers[1].convertToMem(Offset + 4);
+  PendingMembers[2].convertToMem(Offset + 8);
+  PendingMembers[3].convertToMem(Offset + 12);
+
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  State.addLoc(PendingMembers[2]);
+  State.addLoc(PendingMembers[3]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
 #include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa0226..f020e0b55141 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
   // The 'nest' parameter, if any, is passed in ECX.
   CCIfNest<CCAssignToReg<[ECX]>>,
 
+  // i128 and fp128 need to be passed on the stack with a higher alignment than
+  // their legal types. Handle this with a custom function.
+  CCIfType<[i32],
+           CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
   // On swifttailcc pass swiftself in ECX.
   CCIfCC<"CallingConv::SwiftTail",
          CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d66..a548170e654a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4997,9 +4997,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
 
   EVT VT = Op.getValueType();
   unsigned SizeInBits = VT.getSizeInBits();
-  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
   unsigned NumElts = SizeInBits / EltSizeInBits;
 
+  // Can't split constant.
+  if ((SizeInBits % EltSizeInBits) != 0)
+    return false;
+
   // Bitcast a source array of element bits to the target size.
   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
@@ -15400,18 +15403,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
         return SDValue();
     }
 
-    // Avoid returning the same shuffle operation. For example,
-    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
-    //                             undef:v16i16
-    if (CrossLaneMask == Mask || InLaneMask == Mask)
-      return SDValue();
-
     // Simplify CrossLaneMask based on the actual demanded elements.
     if (V1.hasOneUse())
       for (int i = 0; i != NumElts; ++i)
         if (!DemandedCrossLane[i])
           CrossLaneMask[i] = SM_SentinelUndef;
 
+    // Avoid returning the same shuffle operation. For example,
+    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+    //                             undef:v16i16
+    if (CrossLaneMask == Mask || InLaneMask == Mask)
+      return SDValue();
+
     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
                                 InLaneMask);
@@ -44175,8 +44178,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
       // Conversions.
       // TODO: Add more CVT opcodes when we have test coverage.
-    case X86ISD::CVTTP2SI:
     case X86ISD::CVTTP2UI: {
+      if (!Subtarget.hasVLX())
+        break;
+      [[fallthrough]];
+    }
+    case X86ISD::CVTTP2SI: {
       if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
           !Subtarget.hasVLX())
         break;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad355311527..b4639ac2577e 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
     const DataLayout &DL) const {
-  // i128 split into i64 needs to be allocated to two consecutive registers,
-  // or spilled to the stack as a whole.
-  return Ty->isIntegerTy(128);
+  // On x86-64 i128 is split into two i64s and needs to be allocated to two
+  // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+  // is split to four i32s and never actually passed in registers, but we use
+  // the consecutive register mark to match it in TableGen.
+  if (Ty->isIntegerTy(128))
+    return true;
+
+  // On x86-32, fp128 acts the same as i128.
+  if (Subtarget.is32Bit() && Ty->isFP128Ty())
+    return true;
+
+  return false;
 }
 
 /// Helper for getByValTypeAlignment to determine
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 85dd9a1bf716..0f63ed0166cf 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2079,6 +2079,7 @@ struct DSEState {
       AllocFnKind AllocKind =
           Attrs.getFnAttr(Attribute::AllocKind).getAllocKind() |
           AllocFnKind::Zeroed;
+      AllocKind &= ~AllocFnKind::Uninitialized;
       Attrs =
           Attrs.addFnAttribute(Ctx, Attribute::getWithAllocKind(Ctx, AllocKind))
               .removeFnAttribute(Ctx, "alloc-variant-zeroed");
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 221094f170ac..b9546c5fa236 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -128,6 +128,8 @@ class ConstantTerminatorFoldingImpl {
   // from any other block. So this variable set to true means that loop's latch
   // has become unreachable from loop header.
   bool DeleteCurrentLoop = false;
+  // Whether or not we enter the loop through an indirectbr.
+  bool HasIndirectEntry = false;
 
   // The blocks of the original loop that will still be reachable from entry
   // after the constant folding.
@@ -216,6 +218,19 @@ class ConstantTerminatorFoldingImpl {
       return;
     }
 
+    // We need a loop preheader to split in handleDeadExits(). If LoopSimplify
+    // wasn't able to form one because the loop can be entered through an
+    // indirectbr, we cannot continue.
+    if (!L.getLoopPreheader()) {
+      assert(any_of(predecessors(L.getHeader()),
+                    [&](BasicBlock *Pred) {
+                      return isa<IndirectBrInst>(Pred->getTerminator());
+                    }) &&
+             "Loop should have preheader if it is not entered indirectly");
+      HasIndirectEntry = true;
+      return;
+    }
+
     // Collect live and dead loop blocks and exits.
     LiveLoopBlocks.insert(L.getHeader());
     for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
@@ -546,6 +561,12 @@ class ConstantTerminatorFoldingImpl {
       return false;
     }
 
+    if (HasIndirectEntry) {
+      LLVM_DEBUG(dbgs() << "Loops which can be entered indirectly are not"
+                           " supported!\n");
+      return false;
+    }
+
     // Nothing to constant-fold.
     if (FoldCandidates.empty()) {
       LLVM_DEBUG(
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 70b4552190a4..f537b0d3fbd7 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1258,8 +1258,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
            "Map index doesn't point back to a slice with this user.");
   }
 
-  // Disable SRoA for any intrinsics except for lifetime invariants and
-  // invariant group.
+  // Disable SRoA for any intrinsics except for lifetime invariants.
   // FIXME: What about debug intrinsics? This matches old behavior, but
   // doesn't make sense.
   void visitIntrinsicInst(IntrinsicInst &II) {
@@ -1279,12 +1278,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return;
     }
 
-    if (II.isLaunderOrStripInvariantGroup()) {
-      insertUse(II, Offset, AllocSize, true);
-      enqueueUsers(II);
-      return;
-    }
-
     Base::visitIntrinsicInst(II);
   }
 
@@ -3618,8 +3611,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
   }
 
   bool visitIntrinsicInst(IntrinsicInst &II) {
-    assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
-            II.isDroppable()) &&
+    assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
            "Unexpected intrinsic!");
     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
 
@@ -3633,9 +3625,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       return true;
     }
 
-    if (II.isLaunderOrStripInvariantGroup())
-      return true;
-
     assert(II.getArgOperand(1) == OldPtr);
     // Lifetime intrinsics are only promotable if they cover the whole alloca.
     // Therefore, we drop lifetime intrinsics which don't cover the whole
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index ddd203f3acf7..42b1fdf17f38 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -111,15 +111,14 @@ BasicBlock *
 llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum,
                              const CriticalEdgeSplittingOptions &Options,
                              const Twine &BBName) {
-  assert(!isa<IndirectBrInst>(TI) &&
-         "Cannot split critical edge from IndirectBrInst");
-
   BasicBlock *TIBB = TI->getParent();
   BasicBlock *DestBB = TI->getSuccessor(SuccNum);
 
-  // Splitting the critical edge to a pad block is non-trivial. Don't do
-  // it in this generic function.
-  if (DestBB->isEHPad()) return nullptr;
+  // Splitting the critical edge to a pad block is non-trivial, and an edge
+  // leaving a block terminated by an IndirectBr cannot be split at all.
+  // Don't do either in this generic function.
+  if (DestBB->isEHPad() || isa<IndirectBrInst>(TI))
+    return nullptr;
 
   if (Options.IgnoreUnreachableDests &&
       isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 200d1fb85415..e7623aaff105 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   case RecurKind::UMin:
     return Intrinsic::vector_reduce_umin;
   case RecurKind::FMax:
+  case RecurKind::FMaxNum:
     return Intrinsic::vector_reduce_fmax;
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
     return Intrinsic::vector_reduce_fmin;
   case RecurKind::FMaximum:
     return Intrinsic::vector_reduce_fmaximum;
@@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
   case RecurKind::SMax:
     return Intrinsic::smax;
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
     return Intrinsic::minnum;
   case RecurKind::FMax:
+  case RecurKind::FMaxNum:
     return Intrinsic::maxnum;
   case RecurKind::FMinimum:
     return Intrinsic::minimum;
@@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
   Type *Ty = Left->getType();
   if (Ty->isIntOrIntVectorTy() ||
-      (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
+      (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum ||
+       RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
        RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
-    // TODO: Add float minnum/maxnum support when FMF nnan is set.
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
     return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
                                    "rdx.minmax");
@@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
   case RecurKind::UMin:
   case RecurKind::FMax:
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
+  case RecurKind::FMaxNum:
   case RecurKind::FMinimum:
   case RecurKind::FMaximum:
   case RecurKind::FMinimumNum:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 11853859484e..f57ce0c3ccb4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -230,7 +230,6 @@ class VPBuilder {
 
   /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
   /// and \p B.
-  /// TODO: add createFCmp when needed.
   VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
                             DebugLoc DL = DebugLoc::getUnknown(),
                             const Twine &Name = "") {
@@ -240,6 +239,17 @@ class VPBuilder {
         new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
   }
 
+  /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
+  /// and \p B. \p Pred must be a floating-point predicate (asserted).
+  VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+                            DebugLoc DL = DebugLoc::getUnknown(),
+                            const Twine &Name = "") {
+    assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
+           Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
+    return tryInsertInstruction(
+        new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
+  }
+
   VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 06db89a89bc3..5cf4b1651538 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4345,10 +4345,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
     ElementCount VF) const {
-  // Cross iteration phis such as reductions need special handling and are
-  // currently unsupported.
-  if (any_of(OrigLoop->getHeader()->phis(),
-             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
+  // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
+  // reductions need special handling and are currently unsupported.
+  if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
+        if (!Legal->isReductionVariable(&Phi))
+          return Legal->isFixedOrderRecurrence(&Phi);
+        RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
+        return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+      }))
     return false;
 
   // Phis with uses outside of the loop require special handling and are
@@ -8817,6 +8821,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
 
+  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
+  // NaNs if possible, bail out otherwise.
+  if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
+                                *Plan))
+    return nullptr;
+
   // Transform recipes to abstract recipes if it is legal and beneficial and
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 31aec77db63c..f6610ea5b333 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23196,6 +23196,8 @@ class HorizontalReduction {
         case RecurKind::FindFirstIVUMin:
         case RecurKind::FindLastIVSMax:
         case RecurKind::FindLastIVUMax:
+        case RecurKind::FMaxNum:
+        case RecurKind::FMinNum:
         case RecurKind::FMaximumNum:
         case RecurKind::FMinimumNum:
         case RecurKind::None:
@@ -23333,6 +23335,8 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::FMaxNum:
+    case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
@@ -23435,6 +23439,8 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::FMaxNum:
+    case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b27a7ffeed20..66657b98b094 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return ResTy;
   }
   case Instruction::ICmp:
+  case Instruction::FCmp:
   case VPInstruction::ActiveLaneMask:
     assert(inferScalarType(R->getOperand(0)) ==
                inferScalarType(R->getOperand(1)) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 52eecb000d0c..a7a22e042aef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -628,3 +628,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
     Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
   }
 }
+
+bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
+  auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
+    auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
+        RedPhiR->getBackedgeValue()->getDefiningRecipe());
+    if (!MinMaxR)
+      return nullptr;
+
+    auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
+    if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+        !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
+      return nullptr;
+
+#ifndef NDEBUG
+    Intrinsic::ID RdxIntrinsicId =
+        RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
+                                                           : Intrinsic::minnum;
+    assert(((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+             cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
+                 RdxIntrinsicId) ||
+            (RepR &&
+             cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() ==
+                 RdxIntrinsicId)) &&
+           "Intrinsic did not match recurrence kind");
+#endif
+
+    if (MinMaxR->getOperand(0) == RedPhiR)
+      return MinMaxR->getOperand(1);
+
+    assert(MinMaxR->getOperand(1) == RedPhiR &&
+           "Reduction phi operand expected");
+    return MinMaxR->getOperand(0);
+  };
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPReductionPHIRecipe *RedPhiR = nullptr;
+  bool HasUnsupportedPhi = false;
+  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
+    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
+      continue;
+    auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
+    if (!Cur) {
+      // TODO: Also support fixed-order recurrence phis.
+      HasUnsupportedPhi = true;
+      continue;
+    }
+    // For now, only a single reduction is supported.
+    // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
+    if (RedPhiR)
+      return false;
+    if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
+        Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+      HasUnsupportedPhi = true;
+      continue;
+    }
+    RedPhiR = Cur;
+  }
+
+  if (!RedPhiR)
+    return true;
+
+  // We won't be able to resume execution in the scalar tail, if there are
+  // unsupported header phis or there is no scalar tail at all, due to
+  // tail-folding.
+  if (HasUnsupportedPhi || !Plan.hasScalarTail())
+    return false;
+
+  VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
+  if (!MinMaxOp)
+    return false;
+
+  RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
+  assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
+         "unsupported reduction");
+
+  /// Check if the vector loop of \p Plan can exit early and restart
+  /// execution of the last vector iteration in the scalar loop. This requires
+  /// all recipes up to the early exit point to be side-effect free, as they are
+  /// re-executed. Currently we check that the loop is free of any recipe that
+  /// may write to memory. Expected to operate on an early VPlan w/o nested
+  /// regions.
+  for (VPBlockBase *VPB : vp_depth_first_shallow(
+           Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+    auto *VPBB = cast<VPBasicBlock>(VPB);
+    for (auto &R : *VPBB) {
+      if (R.mayWriteToMemory() &&
+          !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+        return false;
+    }
+  }
+
+  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+  VPBuilder Builder(LatchVPBB->getTerminator());
+  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
+  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
+         "Unexpected terminator");
+  auto *IsLatchExitTaken =
+      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+                         LatchExitingBranch->getOperand(1));
+
+  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
+  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
+  auto *AnyExitTaken =
+      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
+  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+  LatchExitingBranch->eraseFromParent();
+
+  // If we exit early due to NaNs, compute the final reduction result based on
+  // the reduction phi at the beginning of the last vector iteration.
+  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
+      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+          return VPI;
+        return nullptr;
+      });
+
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
+  auto *NewSel =
+      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
+  RdxResult->setOperand(1, NewSel);
+
+  auto *ScalarPH = Plan.getScalarPreheader();
+  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
+  // true, they resume from the start of the last vector iteration via the
+  // canonical IV, otherwise from the original value.
+  for (auto &R : ScalarPH->phis()) {
+    auto *ResumeR = cast<VPPhi>(&R);
+    VPValue *VecV = ResumeR->getOperand(0);
+    if (VecV == RdxResult)
+      continue;
+    if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
+      if (DerivedIV->getNumUsers() == 1 &&
+          DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
+        auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
+                                            &Plan.getVectorTripCount());
+        DerivedIV->moveAfter(&*Builder.getInsertPoint());
+        DerivedIV->setOperand(1, NewSel);
+        continue;
+      }
+    }
+    // Bail out and abandon the current, partially modified, VPlan if we
+    // encounter a resume phi that cannot be updated yet.
+    if (VecV != &Plan.getVectorTripCount()) {
+      LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
+                           "FMaxNum/FMinNum reduction.\n");
+      return false;
+    }
+    auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
+    ResumeR->setOperand(0, NewSel);
+  }
+
+  auto *MiddleTerm = MiddleVPBB->getTerminator();
+  Builder.setInsertPoint(MiddleTerm);
+  VPValue *MiddleCond = MiddleTerm->getOperand(0);
+  VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
+  MiddleTerm->setOperand(0, NewCond);
+  return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1664bcc3881a..57b713d3dfcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
     return Builder.CreateFreeze(Op, Name);
   }
+  case Instruction::FCmp:
   case Instruction::ICmp: {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Res = State.get(getOperand(0));
     for (VPValue *Op : drop_begin(operands()))
       Res = Builder.CreateOr(Res, State.get(Op));
-    return Builder.CreateOrReduce(Res);
+    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
   }
   case VPInstruction::FirstActiveLane: {
     if (getNumOperands() == 1) {
@@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   switch (getOpcode()) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
@@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return Op == getOperand(1);
   case Instruction::PHI:
     return true;
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
   case Instruction::Or:
@@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
   switch (getOpcode()) {
   default:
     return false;
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
     return vputils::onlyFirstPartUsed(this);
@@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
     return Opcode == Instruction::ZExt;
     break;
   case OperationType::Cmp:
-    return Opcode == Instruction::ICmp;
+    return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
   case OperationType::Other:
     return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 870b1bb68b79..c3fb359d1429 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,6 +99,12 @@ struct VPlanTransforms {
   /// not valid.
   static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
 
+  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If it does,
+  /// try to update the vector loop to exit early if any input is NaN and resume
+  /// executing in the scalar loop to handle the NaNs there. Return false if
+  /// this attempt was unsuccessful.
+  static bool handleMaxMinNumReductions(VPlan &Plan);
+
   /// Clear NSW/NUW flags from reduction instructions if necessary.
   static void clearReductionWrapFlags(VPlan &Plan);
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fe8d74c43dfd..639f8686a271 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -74,7 +74,7 @@ class VectorCombine {
                 const DataLayout *DL, TTI::TargetCostKind CostKind,
                 bool TryEarlyFoldsOnly)
       : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
-        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind),
+        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
         TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
 
   bool run();
@@ -88,6 +88,7 @@ class VectorCombine {
   AssumptionCache &AC;
   const DataLayout *DL;
   TTI::TargetCostKind CostKind;
+  const SimplifyQuery SQ;
 
   /// If true, only perform beneficial early IR transforms. Do not introduce new
   /// vector operations.
@@ -1185,17 +1186,18 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   // Fold the vector constants in the original vectors into a new base vector to
   // get more accurate cost modelling.
   Value *NewVecC = nullptr;
-  TargetFolder Folder(*DL);
   if (CI)
-    NewVecC = Folder.FoldCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
+    NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
   else if (UO)
     NewVecC =
-        Folder.FoldUnOpFMF(UO->getOpcode(), VecCs[0], UO->getFastMathFlags());
+        simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
   else if (BO)
-    NewVecC = Folder.FoldBinOp(BO->getOpcode(), VecCs[0], VecCs[1]);
-  else if (II->arg_size() == 2)
-    NewVecC = Folder.FoldBinaryIntrinsic(II->getIntrinsicID(), VecCs[0],
-                                         VecCs[1], II->getType(), &I);
+    NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
+  else if (II)
+    NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
+
+  if (!NewVecC)
+    return false;
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
@@ -1203,6 +1205,7 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   InstructionCost NewCost =
       ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                             CostKind, *Index, NewVecC);
+
   for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
     if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
                               II->getIntrinsicID(), Idx, &TTI)))
@@ -1247,15 +1250,6 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
     ScalarInst->copyIRFlags(&I);
 
-  // Create a new base vector if the constant folding failed.
-  if (!NewVecC) {
-    if (CI)
-      NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
-    else if (UO || BO)
-      NewVecC = Builder.CreateNAryOp(Opcode, VecCs);
-    else
-      NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), VecCs);
-  }
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
   replaceValue(I, *Insert);
   return true;
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index b484f8f6c60b..21e0356fd732 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index 519b8ecf6dc7..27dd42297bfa 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
index 2a8609d2f418..826605450a2d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 41c272291d7c..4579acb9b355 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -93,36 +93,36 @@ define void @insert_subvec() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -369,7 +369,7 @@ define void @multipart() {
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
@@ -408,10 +408,10 @@ define void @vst3(ptr %p) {
 ; CHECK-LABEL: 'vst3'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found costs of 24 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
@@ -452,10 +452,10 @@ define void @vst4(ptr %p) {
 ; CHECK-LABEL: 'vst4'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 32 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 09f116f01ec7..4a003a0085c2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v8i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0
 ;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -14,7 +14,7 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 
 define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v16i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:30 Lat:60 SizeLat:60 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0
 ;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -32,7 +32,7 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 
 define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: 'sel_v8i16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0
 ;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ec84c58bf968..fa889cc12dc4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -44,7 +44,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -56,7 +56,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -101,7 +101,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -113,7 +113,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -1364,34 +1364,34 @@ define void @match() #3 {
 ; CHECK-VSCALE-1-LABEL: 'match'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'match'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'match'
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 
diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
new file mode 100644
index 000000000000..381904f77660
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s
+
+target triple = "arm64-apple-macosx13.0.0"
+
+@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)]
+
+define void @test_interp(ptr %frame, ptr %dst) {
+; CHECK-LABEL: test_interp:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    .cfi_offset w23, -56
+; CHECK-NEXT:    .cfi_offset w24, -64
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x21, _opcode.targets@PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    add x21, x21, _opcode.targets@PAGEOFF
+; CHECK-NEXT:    mov x24, xzr
+; CHECK-NEXT:    add x8, x21, xzr, lsl #3
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    mov w22, #1 ; =0x1
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp0: ; Block address taken
+; CHECK-NEXT:  LBB0_1: ; %loop.header
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    mov x20, xzr
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp1: ; Block address taken
+; CHECK-NEXT:  LBB0_2: ; %op1.bb
+; CHECK-NEXT:    str xzr, [x19]
+; CHECK-NEXT:  Ltmp2: ; Block address taken
+; CHECK-NEXT:  LBB0_3: ; %op6.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr x0, [x20, #-8]!
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    str x22, [x0]
+; CHECK-NEXT:    ldr x8, [x8, #48]
+; CHECK-NEXT:    blr x8
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp3: ; Block address taken
+; CHECK-NEXT:  LBB0_4: ; %op2.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    mov x20, xzr
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp4: ; Block address taken
+; CHECK-NEXT:  LBB0_5: ; %op4.bb
+; CHECK-NEXT:  Ltmp5: ; Block address taken
+; CHECK-NEXT:  LBB0_6: ; %op5.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    ldur x8, [x23, #12]
+; CHECK-NEXT:    ldur x9, [x20, #-8]
+; CHECK-NEXT:    add x23, x23, #20
+; CHECK-NEXT:    stp x8, x9, [x20, #-8]
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x20, x20, #8
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ]
+  %stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ]
+  %next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ]
+  %iv.next = add i64 %iv, 1
+  %next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv
+  indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb]
+
+op1.bb:
+  store ptr null, ptr %dst, align 8
+  %stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.0 = load ptr, ptr %stack.8, align 8
+  store i64 1, ptr %l.0, align 8
+  %gep.0 = getelementptr i8, ptr %l.0, i64 8
+  %l.1 = load ptr, ptr %gep.0, align 8
+  %gep.1 = getelementptr i8, ptr %l.1, i64 48
+  %l.2 = load ptr, ptr %gep.1, align 8
+  tail call void %l.2(ptr nonnull %l.0)
+  br label %loop.header
+
+op2.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  br label %loop.header
+
+op4.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  %next.instr.20 = getelementptr i8, ptr %next.instr, i64 20
+  %stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.3 = load ptr, ptr %stack.2, align 8
+  %next.instr.12 = getelementptr i8, ptr %next.instr, i64 12
+  %next.instr.12.val = load ptr, ptr %next.instr.12, align 2
+  store ptr %next.instr.12.val, ptr %stack.2, align 8
+  store ptr %l.3, ptr %stack.pointer, align 8
+  %stack.next = getelementptr i8, ptr %stack.pointer, i64 8
+  br label %loop.header
+
+op5.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  %next.instr.21 = getelementptr i8, ptr %next.instr, i64 20
+  %stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.4 = load ptr, ptr %stack.3, align 8
+  %next.instr.2 = getelementptr i8, ptr %next.instr, i64 12
+  %next.instr.2.val = load ptr, ptr %next.instr.2, align 2
+  store ptr %next.instr.2.val, ptr %stack.3, align 8
+  store ptr %l.4, ptr %stack.pointer, align 8
+  %stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8
+  br label %loop.header
+
+op6.bb:
+  %stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.5 = load ptr, ptr %stack.4, align 8
+  store i64 1, ptr %l.5, align 8
+  %gep.5 = getelementptr i8, ptr %l.5, i64 8
+  %l.6 = load ptr, ptr %gep.5, align 8
+  %gep.6 = getelementptr i8, ptr %l.6, i64 48
+  %l.7 = load ptr, ptr %gep.6, align 8
+  tail call void %l.7(ptr nonnull %l.5)
+  br label %loop.header
+}
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 83c9b73c5757..2b16dd0f29ec 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s
-# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND
 --- |
   ; ModuleID = '<stdin>'
   source_filename = "<stdin>"
@@ -14,13 +14,14 @@
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
-  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
-  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable }
 
   attributes #0 = { nounwind "target-features"="+sve" }
   attributes #1 = { nounwind "target-features"="+sve2p1" }
+  attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" }
 
 ...
 ---
@@ -318,10 +319,10 @@ registers:
   - { id: 0, class: zpr2 }
 stack:
 liveins:
-  - { reg: '$z0_z1', virtual-reg: '%0' }
+  - { reg: '$z1_z2', virtual-reg: '%0' }
 body:             |
   bb.0.entry:
-    liveins: $z0_z1
+    liveins: $z1_z2
 
     ; CHECK-LABEL: name: spills_fills_stack_id_zpr2
     ; CHECK: stack:
@@ -329,12 +330,12 @@ body:             |
     ; CHECK-NEXT:     stack-id: scalable-vector
 
     ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
+    ; EXPAND: STR_ZXI $z1, $sp, 0
+    ; EXPAND: STR_ZXI $z2, $sp, 1
+    ; EXPAND: $z1 = LDR_ZXI $sp, 0
+    ; EXPAND: $z2 = LDR_ZXI $sp, 1
 
-    %0:zpr2 = COPY $z0_z1
+    %0:zpr2 = COPY $z1_z2
 
     $z0_z1_z2_z3     = IMPLICIT_DEF
     $z4_z5_z6_z7     = IMPLICIT_DEF
@@ -345,7 +346,7 @@ body:             |
     $z24_z25_z26_z27 = IMPLICIT_DEF
     $z28_z29_z30_z31 = IMPLICIT_DEF
 
-    $z0_z1 = COPY %0
+    $z1_z2 = COPY %0
     RET_ReallyLR
 ...
 ---
@@ -439,10 +440,10 @@ registers:
   - { id: 0, class: zpr4 }
 stack:
 liveins:
-  - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' }
+  - { reg: '$z1_z2_z3_z4', virtual-reg: '%0' }
 body:             |
   bb.0.entry:
-    liveins: $z0_z1_z2_z3
+    liveins: $z1_z2_z3_z4
 
     ; CHECK-LABEL: name: spills_fills_stack_id_zpr4
     ; CHECK: stack:
@@ -450,16 +451,16 @@ body:             |
     ; CHECK-NEXT:     stack-id: scalable-vector
 
     ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: STR_ZXI $z2, $sp, 2
-    ; EXPAND: STR_ZXI $z3, $sp, 3
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
-    ; EXPAND: $z2 = LDR_ZXI $sp, 2
-    ; EXPAND: $z3 = LDR_ZXI $sp, 3
+    ; EXPAND: STR_ZXI $z1, $sp, 0
+    ; EXPAND: STR_ZXI $z2, $sp, 1
+    ; EXPAND: STR_ZXI $z3, $sp, 2
+    ; EXPAND: STR_ZXI $z4, $sp, 3
+    ; EXPAND: $z1 = LDR_ZXI $sp, 0
+    ; EXPAND: $z2 = LDR_ZXI $sp, 1
+    ; EXPAND: $z3 = LDR_ZXI $sp, 2
+    ; EXPAND: $z4 = LDR_ZXI $sp, 3
 
-    %0:zpr4 = COPY $z0_z1_z2_z3
+    %0:zpr4 = COPY $z1_z2_z3_z4
 
     $z0_z1_z2_z3     = IMPLICIT_DEF
     $z4_z5_z6_z7     = IMPLICIT_DEF
@@ -470,7 +471,7 @@ body:             |
     $z24_z25_z26_z27 = IMPLICIT_DEF
     $z28_z29_z30_z31 = IMPLICIT_DEF
 
-    $z0_z1_z2_z3 = COPY %0
+    $z1_z2_z3_z4 = COPY %0
     RET_ReallyLR
 ...
 ---
diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
index ae70f91a4ec6..a1d615c91079 100644
--- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
+++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
@@ -12,7 +12,7 @@ body:             |
   bb.0:
     liveins: $p0, $z0
 
-    ; CHECK: add_x
+    ; CHECK: name: add_x
     ; CHECK-NOT: MOVPRFX
     ; CHECK: $z0 = FADD_ZPmZ_S renamable $p0, killed $z0, renamable $z0
     ; CHECK-NEXT: RET
@@ -21,22 +21,36 @@ body:             |
 
 ...
 
-# CHECK: {{.*}} MSB_ZPmZZ_B {{.*}}
 ---
 name: expand_mls_to_msb
 body:             |
   bb.0:
+    ; CHECK: name: expand_mls_to_msb
+    ; CHECK: {{.*}} MSB_ZPmZZ_B {{.*}}
     renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
 
-# CHECK: {{.*}} MAD_ZPmZZ_B {{.*}}
 ---
 name: expand_mla_to_mad
 body:             |
   bb.0:
+    ; CHECK: name: expand_mla_to_mad
+    ; CHECK: {{.*}} MAD_ZPmZZ_B {{.*}}
     renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
+
+---
+name: expand_transfer_implicit_defs
+body:             |
+  bb.0:
+    ; CHECK: name: expand_transfer_implicit_defs
+    ; CHECK:      BUNDLE
+    ; CHECK-SAME: implicit-def $z0_z1_z2_z3
+    liveins: $z1, $z2, $p0
+    renamable $z0 = FADD_ZPZZ_D_UNDEF killed $p0, killed $z1, killed $z2, implicit-def $z0_z1_z2_z3
+    RET_ReallyLR implicit $z0_z1_z2_z3
+...
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index f8e13fcdd227..51398a45055e 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1043,10 +1035,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2664,28 +2656,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -2696,7 +2681,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3232,10 +3216,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index ba9dd8f7c246..6512bee36e88 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1081,10 +1073,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -1897,28 +1889,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -1929,7 +1914,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2465,10 +2449,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AVR/cmp.ll b/llvm/test/CodeGen/AVR/cmp.ll
index efc9b8da45ba..c932bda1807f 100644
--- a/llvm/test/CodeGen/AVR/cmp.ll
+++ b/llvm/test/CodeGen/AVR/cmp.ll
@@ -298,3 +298,18 @@ define i16 @cmp_i16_gt_1023(i16 %0) {
   %3 = zext i1 %2 to i16
   ret i16 %3
 }
+
+define void @cmp_issue152097(i16 %a) addrspace(1) {
+; See: https://github.com/llvm/llvm-project/issues/152097
+; CHECK-LABEL: cmp_issue152097
+; CHECK:      ldi r18, -1
+; CHECK-NEXT: cpi r24, -2
+; CHECK-NEXT: cpc r25, r18
+; CHECK-NEXT: ret
+  %cmp = icmp ugt i16 -2, %a
+  br i1 %cmp, label %if.then, label %if.else
+if.then:
+  ret void
+if.else:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll b/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
index b23366bc11ac..f5430dfea586 100644
--- a/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
+++ b/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
@@ -1,20 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -march=hexagon -verify-machineinstrs  < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -verify-machineinstrs  < %s | FileCheck %s
 
 @.str = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, 3'RD STRING\00", align 1
 @.str1 = private unnamed_addr constant [3 x i8] c"%s\00", align 1
 
-; Function Attrs: nounwind
 declare i32 @printf(i8* nocapture readonly, ...)
 
 ; Function Attrs: nounwind
-define i32 @main() {
+define i32 @main() nounwind {
 ; CHECK-LABEL: main:
-; CHECK:         .cfi_startproc
-; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    .cfi_def_cfa r30, 8
-; CHECK-NEXT:    .cfi_offset r31, -4
-; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r0 = ##.L.str1
 ; CHECK-NEXT:     r3:2 = CONST64(#2325073635944967245)
@@ -53,5 +48,4 @@ entry:
   ret i32 0
 }
 
-; Function Attrs: nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
diff --git a/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll
new file mode 100644
index 000000000000..ff50f1abe589
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-NOT: r{{[0-9]+}} = asr(r{{[0-9]+}},#{{[0-9]+}})
+; CHECK-NOT: r{{[0-9]+}}:{{[0-9]+}} = mpyu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NOT: r{{[0-9]+}} += mpyi(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = mpy(r{{[0-9]+}},r{{[0-9]+}})
+
+; ModuleID = '39544.c'
+source_filename = "39544.c"
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @mul_n(i64* nocapture %p, i32* nocapture readonly %a, i32 %k, i32 %n) local_unnamed_addr {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv1 = sext i32 %k to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %arrayidx.phi = phi i32* [ %a, %for.body.lr.ph ], [ %arrayidx.inc, %for.body ]
+  %arrayidx2.phi = phi i64* [ %p, %for.body.lr.ph ], [ %arrayidx2.inc, %for.body ]
+  %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %conv = sext i32 %0 to i64
+  %mul = mul nsw i64 %conv, %conv1
+  store i64 %mul, i64* %arrayidx2.phi, align 8
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  %arrayidx2.inc = getelementptr i64, i64* %arrayidx2.phi, i32 1
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
new file mode 100644
index 000000000000..2960343564fc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
@@ -0,0 +1,50 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner %s -o /dev/null
+
+# Check that edges that violate topological order are not added to the
+# SwingSchedulerDAG. This is a case where the crash was caused by PR 145878.
+
+--- |
+  target triple = "hexagon"
+  
+  define void @crash_145878() {
+  entry:
+    br label %loop
+  
+  loop:                                             ; preds = %loop, %entry
+    %lsr.iv2 = phi i32 [ %lsr.iv.next, %loop ], [ 1, %entry ]
+    %lsr.iv = phi ptr [ %cgep3, %loop ], [ inttoptr (i32 -8 to ptr), %entry ]
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 12
+    %load = load i32, ptr %cgep, align 4
+    store i32 %load, ptr %lsr.iv, align 4
+    %lsr.iv.next = add nsw i32 %lsr.iv2, -1
+    %iv.cmp.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep3 = getelementptr i8, ptr %lsr.iv, i32 -8
+    br i1 %iv.cmp.not, label %exit, label %loop
+  
+  exit:                                             ; preds = %loop
+    ret void
+  }
+...
+---
+name:            crash_145878
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+  
+    %5:intregs = A2_tfrsi -8
+    J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.1.loop (machine-block-address-taken):
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  
+    %1:intregs = PHI %5, %bb.0, %3, %bb.1
+    %6:intregs = L2_loadri_io %1, 12 :: (load (s32) from %ir.cgep)
+    S2_storeri_io %1, 0, killed %6 :: (store (s32) into %ir.lsr.iv)
+    %3:intregs = A2_addi %1, -8
+    ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+  
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
index d07e2914c753..f7653af1fa9b 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
@@ -122,23 +122,23 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 define i64 @caller_large_scalars() nounwind {
 ; CHECK-LABEL: caller_large_scalars:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $zero, $sp, 24
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $zero, $sp, 40
 ; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 8
+; CHECK-NEXT:    vst $vr0, $sp, 24
 ; CHECK-NEXT:    ori $a0, $zero, 2
-; CHECK-NEXT:    st.d $a0, $sp, 0
-; CHECK-NEXT:    st.d $zero, $sp, 56
-; CHECK-NEXT:    vst $vr0, $sp, 40
+; CHECK-NEXT:    st.d $a0, $sp, 16
+; CHECK-NEXT:    st.d $zero, $sp, 72
+; CHECK-NEXT:    vst $vr0, $sp, 56
 ; CHECK-NEXT:    ori $a2, $zero, 1
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    addi.d $a1, $sp, 0
-; CHECK-NEXT:    st.d $a2, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 48
+; CHECK-NEXT:    addi.d $a1, $sp, 16
+; CHECK-NEXT:    st.d $a2, $sp, 48
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_large_scalars)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %1 = call i64 @callee_large_scalars(i256 1, i256 2)
   ret i64 %1
@@ -177,20 +177,20 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; CHECK-LABEL: caller_large_scalars_exhausted_regs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $a0, $sp, 16
+; CHECK-NEXT:    addi.d $sp, $sp, -112
+; CHECK-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    st.d $a0, $sp, 8
 ; CHECK-NEXT:    ori $a0, $zero, 9
 ; CHECK-NEXT:    st.d $a0, $sp, 0
-; CHECK-NEXT:    st.d $zero, $sp, 40
+; CHECK-NEXT:    st.d $zero, $sp, 56
 ; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 24
+; CHECK-NEXT:    vst $vr0, $sp, 40
 ; CHECK-NEXT:    ori $a0, $zero, 10
-; CHECK-NEXT:    st.d $a0, $sp, 16
-; CHECK-NEXT:    st.d $zero, $sp, 72
+; CHECK-NEXT:    st.d $a0, $sp, 32
+; CHECK-NEXT:    st.d $zero, $sp, 88
 ; CHECK-NEXT:    ori $a0, $zero, 8
-; CHECK-NEXT:    st.d $a0, $sp, 48
+; CHECK-NEXT:    st.d $a0, $sp, 64
 ; CHECK-NEXT:    ori $a0, $zero, 1
 ; CHECK-NEXT:    ori $a1, $zero, 2
 ; CHECK-NEXT:    ori $a2, $zero, 3
@@ -198,12 +198,12 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; CHECK-NEXT:    ori $a4, $zero, 5
 ; CHECK-NEXT:    ori $a5, $zero, 6
 ; CHECK-NEXT:    ori $a6, $zero, 7
-; CHECK-NEXT:    addi.d $a7, $sp, 48
-; CHECK-NEXT:    vst $vr0, $sp, 56
+; CHECK-NEXT:    addi.d $a7, $sp, 64
+; CHECK-NEXT:    vst $vr0, $sp, 72
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_large_scalars_exhausted_regs)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 112
 ; CHECK-NEXT:    ret
   %1 = call i64 @callee_large_scalars_exhausted_regs(
       i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i256 8, i64 9,
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
index c88b67f13d1e..da8c3e93f684 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
@@ -1252,8 +1252,8 @@ define i32 @caller_half_on_stack() nounwind {
 ;
 ; LA64F-LP64S-LABEL: caller_half_on_stack:
 ; LA64F-LP64S:       # %bb.0:
-; LA64F-LP64S-NEXT:    addi.d $sp, $sp, -80
-; LA64F-LP64S-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT:    addi.d $sp, $sp, -96
+; LA64F-LP64S-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
 ; LA64F-LP64S-NEXT:    lu12i.w $a0, -12
 ; LA64F-LP64S-NEXT:    ori $a1, $a0, 3200
 ; LA64F-LP64S-NEXT:    lu32i.d $a1, 0
@@ -1292,8 +1292,8 @@ define i32 @caller_half_on_stack() nounwind {
 ; LA64F-LP64S-NEXT:    st.w $t0, $sp, 0
 ; LA64F-LP64S-NEXT:    pcaddu18i $ra, %call36(callee_half_on_stack)
 ; LA64F-LP64S-NEXT:    jirl $ra, $ra, 0
-; LA64F-LP64S-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64F-LP64S-NEXT:    addi.d $sp, $sp, 80
+; LA64F-LP64S-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT:    addi.d $sp, $sp, 96
 ; LA64F-LP64S-NEXT:    ret
 ;
 ; LA64F-LP64D-LABEL: caller_half_on_stack:
@@ -1336,8 +1336,8 @@ define i32 @caller_half_on_stack() nounwind {
 ;
 ; LA64D-LP64S-LABEL: caller_half_on_stack:
 ; LA64D-LP64S:       # %bb.0:
-; LA64D-LP64S-NEXT:    addi.d $sp, $sp, -80
-; LA64D-LP64S-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT:    addi.d $sp, $sp, -96
+; LA64D-LP64S-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
 ; LA64D-LP64S-NEXT:    lu12i.w $a0, -12
 ; LA64D-LP64S-NEXT:    ori $a1, $a0, 3200
 ; LA64D-LP64S-NEXT:    lu32i.d $a1, 0
@@ -1376,8 +1376,8 @@ define i32 @caller_half_on_stack() nounwind {
 ; LA64D-LP64S-NEXT:    st.w $t0, $sp, 0
 ; LA64D-LP64S-NEXT:    pcaddu18i $ra, %call36(callee_half_on_stack)
 ; LA64D-LP64S-NEXT:    jirl $ra, $ra, 0
-; LA64D-LP64S-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64D-LP64S-NEXT:    addi.d $sp, $sp, 80
+; LA64D-LP64S-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT:    addi.d $sp, $sp, 96
 ; LA64D-LP64S-NEXT:    ret
 ;
 ; LA64D-LP64D-LABEL: caller_half_on_stack:
diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
index 52d8dd05aaa4..1a9de3b0ef3d 100644
--- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
+++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
@@ -14,41 +14,41 @@
 define dso_local noundef signext i32 @main() nounwind {
 ; CHECK-LABEL: main:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -272
-; CHECK-NEXT:    st.d $ra, $sp, 264 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -288
+; CHECK-NEXT:    st.d $ra, $sp, 280 # 8-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
 ; CHECK-NEXT:    xvld $xr0, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    xvst $xr0, $sp, 96 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 112 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_1)
 ; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
-; CHECK-NEXT:    xvst $xr1, $sp, 64 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr1, $sp, 80 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_2)
 ; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI0_2)
-; CHECK-NEXT:    xvst $xr2, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr2, $sp, 48 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_3)
 ; CHECK-NEXT:    xvld $xr3, $a0, %pc_lo12(.LCPI0_3)
-; CHECK-NEXT:    xvst $xr3, $sp, 0 # 32-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 136
-; CHECK-NEXT:    xvst $xr1, $sp, 168
-; CHECK-NEXT:    xvst $xr2, $sp, 200
-; CHECK-NEXT:    xvst $xr3, $sp, 232
-; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    xvst $xr3, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 152
+; CHECK-NEXT:    xvst $xr1, $sp, 184
+; CHECK-NEXT:    xvst $xr2, $sp, 216
+; CHECK-NEXT:    xvst $xr3, $sp, 248
+; CHECK-NEXT:    addi.d $a0, $sp, 152
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(foo)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 96 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 136
-; CHECK-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 168
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 200
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 232
-; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    xvld $xr0, $sp, 112 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 152
+; CHECK-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 184
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 216
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 248
+; CHECK-NEXT:    addi.d $a0, $sp, 152
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(bar)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    move $a0, $zero
-; CHECK-NEXT:    ld.d $ra, $sp, 264 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 272
+; CHECK-NEXT:    ld.d $ra, $sp, 280 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 288
 ; CHECK-NEXT:    ret
 entry:
   %s = alloca %struct.S, align 2
diff --git a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
index ccc5c703e71e..15ac95dfc6c5 100644
--- a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
+++ b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
@@ -28,12 +28,12 @@ define void @func() {
 ; CHECK-NEXT:    ld.w $a3, $a1, 0
 ; CHECK-NEXT:    ld.w $a2, $a1, 0
 ; CHECK-NEXT:    ld.w $a0, $a1, 0
-; CHECK-NEXT:    st.d $fp, $sp, 0
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
 ; CHECK-NEXT:    lu12i.w $fp, 1
 ; CHECK-NEXT:    ori $fp, $fp, 12
 ; CHECK-NEXT:    add.d $fp, $sp, $fp
 ; CHECK-NEXT:    st.w $t8, $fp, 0
-; CHECK-NEXT:    ld.d $fp, $sp, 0
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
 ; CHECK-NEXT:    st.w $t8, $a1, 0
 ; CHECK-NEXT:    st.w $t7, $a1, 0
 ; CHECK-NEXT:    st.w $t6, $a1, 0
diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll
index 048703029d8c..b29d8634854f 100644
--- a/llvm/test/CodeGen/LoongArch/frame.ll
+++ b/llvm/test/CodeGen/LoongArch/frame.ll
@@ -1,5 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 -mattr=+d < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d,-lsx < %s | FileCheck %s --check-prefixes=CHECK,NOLSX
+; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LSX
 
 %struct.key_t = type { i32, [16 x i8] }
 
@@ -7,20 +8,35 @@ declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
 declare void @test1(ptr)
 
 define i32 @test() nounwind {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -32
-; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; CHECK-NEXT:    st.w $zero, $sp, 16
-; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 4
-; CHECK-NEXT:    pcaddu18i $ra, %call36(test1)
-; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    move $a0, $zero
-; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 32
-; CHECK-NEXT:    ret
+; NOLSX-LABEL: test:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -32
+; NOLSX-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NOLSX-NEXT:    st.w $zero, $sp, 16
+; NOLSX-NEXT:    st.d $zero, $sp, 8
+; NOLSX-NEXT:    st.d $zero, $sp, 0
+; NOLSX-NEXT:    addi.d $a0, $sp, 4
+; NOLSX-NEXT:    pcaddu18i $ra, %call36(test1)
+; NOLSX-NEXT:    jirl $ra, $ra, 0
+; NOLSX-NEXT:    move $a0, $zero
+; NOLSX-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NOLSX-NEXT:    addi.d $sp, $sp, 32
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -32
+; LSX-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LSX-NEXT:    st.w $zero, $sp, 16
+; LSX-NEXT:    vrepli.b $vr0, 0
+; LSX-NEXT:    vst $vr0, $sp, 0
+; LSX-NEXT:    addi.d $a0, $sp, 4
+; LSX-NEXT:    pcaddu18i $ra, %call36(test1)
+; LSX-NEXT:    jirl $ra, $ra, 0
+; LSX-NEXT:    move $a0, $zero
+; LSX-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LSX-NEXT:    addi.d $sp, $sp, 32
+; LSX-NEXT:    ret
   %key = alloca %struct.key_t, align 4
   call void @llvm.memset.p0.i64(ptr %key, i8 0, i64 20, i1 false)
   %1 = getelementptr inbounds %struct.key_t, ptr %key, i64 0, i32 1, i64 0
@@ -98,3 +114,62 @@ define void @test_large_frame_size_1234576() "frame-pointer"="all" {
   %1 = alloca i8, i32 1234567
   ret void
 }
+
+;; Note: will create an emergency spill slot, if (!isInt<7>(StackSize)).
+;; Should involve only one SP-adjusting addi per adjustment.
+;; LSX: 112 + 16 (emergency spill slot) = 128
+define void @test_frame_size_112() {
+; NOLSX-LABEL: test_frame_size_112:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -112
+; NOLSX-NEXT:    .cfi_def_cfa_offset 112
+; NOLSX-NEXT:    addi.d $sp, $sp, 112
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_112:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -128
+; LSX-NEXT:    .cfi_def_cfa_offset 128
+; LSX-NEXT:    addi.d $sp, $sp, 128
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 112
+  ret void
+}
+
+;; LSX: 128 + 16 (emergency spill slot) = 144
+define void @test_frame_size_128() {
+; NOLSX-LABEL: test_frame_size_128:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -128
+; NOLSX-NEXT:    .cfi_def_cfa_offset 128
+; NOLSX-NEXT:    addi.d $sp, $sp, 128
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_128:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -144
+; LSX-NEXT:    .cfi_def_cfa_offset 144
+; LSX-NEXT:    addi.d $sp, $sp, 144
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 128
+  ret void
+}
+
+;; LSX: 144 + 16 (emergency spill slot) = 160
+define void @test_frame_size_144() {
+; NOLSX-LABEL: test_frame_size_144:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -144
+; NOLSX-NEXT:    .cfi_def_cfa_offset 144
+; NOLSX-NEXT:    addi.d $sp, $sp, 144
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_144:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -160
+; LSX-NEXT:    .cfi_def_cfa_offset 160
+; LSX-NEXT:    addi.d $sp, $sp, 160
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 144
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
index 402ddb9ad941..5a55b253c77b 100644
--- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
@@ -6,11 +6,11 @@
 define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) {
 ; CHECK-LABEL: box:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    addi.d $sp, $sp, -112
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
 ; CHECK-NEXT:    slli.d $a2, $a1, 5
 ; CHECK-NEXT:    alsl.d $a1, $a1, $a2, 4
-; CHECK-NEXT:    addi.d $a2, $sp, 0
+; CHECK-NEXT:    addi.d $a2, $sp, 16
 ; CHECK-NEXT:    add.d $a3, $a2, $a1
 ; CHECK-NEXT:    vldx $vr0, $a1, $a2
 ; CHECK-NEXT:    vld $vr1, $a3, 32
@@ -18,7 +18,7 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr1, $a0, 32
 ; CHECK-NEXT:    vst $vr2, $a0, 16
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    addi.d $sp, $sp, 112
 ; CHECK-NEXT:    ret
   %1 = alloca [2 x %Box], align 16
   %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
index 789b51d9b5e5..9528280d181a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
@@ -6,10 +6,10 @@ declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32)
 define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
 ; CHECK-LABEL: powi_v8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; CHECK-NEXT:    addi.w $fp, $a0, 0
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 0
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
@@ -18,79 +18,79 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 1
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 2
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 2
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 3
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 4
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 4
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 5
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 6
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 6
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 7
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
-; CHECK-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b)
@@ -102,10 +102,10 @@ declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32)
 define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
 ; CHECK-LABEL: powi_v4f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; CHECK-NEXT:    addi.w $fp, $a0, 0
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
@@ -114,39 +114,39 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 1
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 2
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 3
-; CHECK-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index 04214f5dfa9d..2e1618748688 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -76,21 +76,21 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind {
 define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_32xi8_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 0
 ; CHECK-NEXT:    ld.b $a0, $a0, 0
 ; CHECK-NEXT:    st.b $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <32 x i8>, ptr %src
   %e = extractelement <32 x i8> %v, i32 %idx
@@ -101,21 +101,21 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_16xi16_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 1
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
 ; CHECK-NEXT:    st.h $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <16 x i16>, ptr %src
   %e = extractelement <16 x i16> %v, i32 %idx
@@ -126,21 +126,21 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
 ; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %e = extractelement <8 x i32> %v, i32 %idx
@@ -151,21 +151,21 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_4xi64_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
 ; CHECK-NEXT:    st.d $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i64>, ptr %src
   %e = extractelement <4 x i64> %v, i32 %idx
@@ -176,21 +176,21 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    fld.s $fa0, $a0, 0
 ; CHECK-NEXT:    fst.s $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %e = extractelement <8 x float> %v, i32 %idx
@@ -201,21 +201,21 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_4xdouble_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    fld.d $fa0, $a0, 0
 ; CHECK-NEXT:    fst.d $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x double>, ptr %src
   %e = extractelement <4 x double> %v, i32 %idx
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index 3fdc439e6867..c3d09953fbc4 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -4,18 +4,18 @@
 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.b $a0, $sp, 31
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.b $a0, $sp, 63
 ; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 1
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <32 x i8> %a, i32 31
@@ -26,18 +26,18 @@ entry:
 define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.h $a0, $sp, 30
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.h $a0, $sp, 62
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 1
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <16 x i16> %a, i32 15
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
index 88c3e4367ffa..f2d8dda1850b 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
@@ -4,23 +4,23 @@
 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    vpickve2gr.b $a0, $vr0, 15
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.b $a1, $sp, 31
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.b $a1, $sp, 63
 ; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 1
 ; CHECK-NEXT:    xvori.b $xr1, $xr0, 0
 ; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 1
 ; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b_lo = extractelement <32 x i8> %a, i32 15
@@ -33,23 +33,23 @@ entry:
 define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 7
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.h $a1, $sp, 30
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.h $a1, $sp, 62
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 1
 ; CHECK-NEXT:    xvori.b $xr1, $xr0, 0
 ; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 1
 ; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b_lo = extractelement <16 x i16> %a, i32 7
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 25106b456d2f..a6c0b332abcb 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -116,22 +116,22 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind {
 define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_32xi8_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 0
 ; CHECK-NEXT:    st.b $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <32 x i8>, ptr %src
   %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx
@@ -142,22 +142,22 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
 define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_16xi16_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 1
 ; CHECK-NEXT:    st.h $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <16 x i16>, ptr %src
   %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx
@@ -168,22 +168,22 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
 define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 2
 ; CHECK-NEXT:    st.w $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx
@@ -194,22 +194,22 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
 define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_4xi64_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 3
 ; CHECK-NEXT:    st.d $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i64>, ptr %src
   %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx
@@ -220,22 +220,22 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
 define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
-; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr1, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    fst.s $fa0, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %v_new = insertelement <8 x float> %v, float %in, i32 %idx
@@ -246,22 +246,22 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
 define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_4xdouble_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
-; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr1, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    fst.d $fa0, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x double>, ptr %src
   %v_new = insertelement <4 x double> %v, double %in, i32 %idx
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index ffedd7f9e943..648c19d50971 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -347,42 +347,42 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    vpackev.w $vr0, $vr0, $vr1
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vpackev.w $vr1, $vr0, $vr1
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
   %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a)
   ret { <2 x float>, <2 x float> } %result
@@ -439,48 +439,48 @@ define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v3f32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -96
-; LA64-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -112
+; LA64-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 2
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 72
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 88
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 68
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 84
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 64
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 80
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 56
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 72
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 52
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 68
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 48
-; LA64-NEXT:    vld $vr0, $sp, 64
-; LA64-NEXT:    vld $vr1, $sp, 48
-; LA64-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 96
+; LA64-NEXT:    fst.s $fa0, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 80
+; LA64-NEXT:    vld $vr1, $sp, 64
+; LA64-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 112
 ; LA64-NEXT:    ret
   %result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a)
   ret { <3 x float>, <3 x float> } %result
@@ -568,44 +568,44 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f64:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
 ; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
+; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sin)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.d $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sin)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 1
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cos)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cos)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    vinsgr2vr.d $vr1, $a0, 1
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
   %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)
   ret { <2 x double>, <2 x double> } %result
@@ -801,17 +801,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f128:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -80
-; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s4, $sp, 24 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s5, $sp, 16 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s6, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s7, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -96
+; LA64-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s2, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s3, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s4, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s5, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s6, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s7, $sp, 16 # 8-byte Folded Spill
 ; LA64-NEXT:    ld.d $fp, $a1, 16
 ; LA64-NEXT:    ld.d $s0, $a1, 24
 ; LA64-NEXT:    ld.d $s1, $a1, 0
@@ -847,17 +847,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
 ; LA64-NEXT:    st.d $s6, $s3, 16
 ; LA64-NEXT:    st.d $s5, $s3, 8
 ; LA64-NEXT:    st.d $s4, $s3, 0
-; LA64-NEXT:    ld.d $s7, $sp, 0 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s6, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s5, $sp, 16 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s4, $sp, 24 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 80
+; LA64-NEXT:    ld.d $s7, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s6, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s5, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s4, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s3, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s2, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 96
 ; LA64-NEXT:    ret
   %result = call { <2 x fp128>, <2 x fp128> } @llvm.sincos.v2f128(<2 x fp128> %a)
   ret { <2 x fp128>, <2 x fp128> } %result
diff --git a/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll
new file mode 100644
index 000000000000..96159e5884d3
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx --verify-machineinstrs < %s | FileCheck %s
+define void @eliminate_frame_index(<16 x i8> %a) nounwind {
+; CHECK-LABEL: eliminate_frame_index:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.d $sp, $sp, -240
+; CHECK-NEXT:    st.d $ra, $sp, 232 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 224 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 216 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s1, $sp, 208 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s2, $sp, 200 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s3, $sp, 192 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s4, $sp, 184 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s5, $sp, 176 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s6, $sp, 168 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s7, $sp, 160 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s8, $sp, 152 # 8-byte Folded Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $zero, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $ra, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $tp, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t8, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $fp, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s8, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    st.d $a0, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    vstelm.b $vr0, $a0, 0, 0
+; CHECK-NEXT:    ld.d $a0, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $zero
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $ra
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $tp
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t8
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $fp
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s8
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ld.d $s8, $sp, 152 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s7, $sp, 160 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s6, $sp, 168 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s5, $sp, 176 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s4, $sp, 184 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s3, $sp, 192 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s2, $sp, 200 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s1, $sp, 208 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s0, $sp, 216 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 224 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 232 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 240
+; CHECK-NEXT:    ret
+  %s = alloca [16 x i8]  ; destination of the volatile lane store below
+  %ss = alloca [128 x i8]  ; never referenced below; presumably only there to enlarge the frame - TODO confirm
+  ;; Write r0-r2, r4-r20 and r22-r31 via inline asm; each result is read again after the store.
+  %zero =  call i64 asm sideeffect "addi.d $$zero, $$zero, 1", "={r0}"()
+  %ra =  call i64 asm sideeffect "addi.d $$ra, $$zero, 1", "={r1}"()
+  %tp =  call i64 asm sideeffect "addi.d $$tp, $$zero, 1", "={r2}"()
+  %a0 =  call i64 asm sideeffect "addi.d $$a0, $$zero, 1", "={r4}"()
+  %a1 =  call i64 asm sideeffect "addi.d $$a1, $$zero, 1", "={r5}"()
+  %a2 =  call i64 asm sideeffect "addi.d $$a2, $$zero, 1", "={r6}"()
+  %a3 =  call i64 asm sideeffect "addi.d $$a3, $$zero, 1", "={r7}"()
+  %a4 =  call i64 asm sideeffect "addi.d $$a4, $$zero, 1", "={r8}"()
+  %a5 =  call i64 asm sideeffect "addi.d $$a5, $$zero, 1", "={r9}"()
+  %a6 =  call i64 asm sideeffect "addi.d $$a6, $$zero, 1", "={r10}"()
+  %a7 =  call i64 asm sideeffect "addi.d $$a7, $$zero, 1", "={r11}"()
+  %t0 =  call i64 asm sideeffect "addi.d $$t0, $$zero, 1", "={r12}"()
+  %t1 =  call i64 asm sideeffect "addi.d $$t1, $$zero, 1", "={r13}"()
+  %t2 =  call i64 asm sideeffect "addi.d $$t2, $$zero, 1", "={r14}"()
+  %t3 =  call i64 asm sideeffect "addi.d $$t3, $$zero, 1", "={r15}"()
+  %t4 =  call i64 asm sideeffect "addi.d $$t4, $$zero, 1", "={r16}"()
+  %t5 =  call i64 asm sideeffect "addi.d $$t5, $$zero, 1", "={r17}"()
+  %t6 =  call i64 asm sideeffect "addi.d $$t6, $$zero, 1", "={r18}"()
+  %t7 =  call i64 asm sideeffect "addi.d $$t7, $$zero, 1", "={r19}"()
+  %t8 =  call i64 asm sideeffect "addi.d $$t8, $$zero, 1", "={r20}"()
+  ;; r21 Reserved (Non-allocatable)
+  %s9 =  call i64 asm sideeffect "addi.d $$s9, $$zero, 1", "={r22}"()
+  %s0 =  call i64 asm sideeffect "addi.d $$s0, $$zero, 1", "={r23}"()
+  %s1 =  call i64 asm sideeffect "addi.d $$s1, $$zero, 1", "={r24}"()
+  %s2 =  call i64 asm sideeffect "addi.d $$s2, $$zero, 1", "={r25}"()
+  %s3 =  call i64 asm sideeffect "addi.d $$s3, $$zero, 1", "={r26}"()
+  %s4 =  call i64 asm sideeffect "addi.d $$s4, $$zero, 1", "={r27}"()
+  %s5 =  call i64 asm sideeffect "addi.d $$s5, $$zero, 1", "={r28}"()
+  %s6 =  call i64 asm sideeffect "addi.d $$s6, $$zero, 1", "={r29}"()
+  %s7 =  call i64 asm sideeffect "addi.d $$s7, $$zero, 1", "={r30}"()
+  %s8 =  call i64 asm sideeffect "addi.d $$s8, $$zero, 1", "={r31}"()
+  ;; Take lane 0 of %a while every asm-defined value above is still live.
+  %e = extractelement <16 x i8> %a, i64 0
+  ;; Expected lowering: spill $a0 to $sp+0, use it as the vstelm.b address ($sp+136), reload it.
+  store volatile i8 %e, ptr %s
+  ;; Consume each asm-defined value in the register it was written to, after the store.
+  call void asm sideeffect "# reg use $0", "{r0}"(i64 %zero)
+  call void asm sideeffect "# reg use $0", "{r1}"(i64 %ra)
+  call void asm sideeffect "# reg use $0", "{r2}"(i64 %tp)
+  call void asm sideeffect "# reg use $0", "{r4}"(i64 %a0)
+  call void asm sideeffect "# reg use $0", "{r5}"(i64 %a1)
+  call void asm sideeffect "# reg use $0", "{r6}"(i64 %a2)
+  call void asm sideeffect "# reg use $0", "{r7}"(i64 %a3)
+  call void asm sideeffect "# reg use $0", "{r8}"(i64 %a4)
+  call void asm sideeffect "# reg use $0", "{r9}"(i64 %a5)
+  call void asm sideeffect "# reg use $0", "{r10}"(i64 %a6)
+  call void asm sideeffect "# reg use $0", "{r11}"(i64 %a7)
+  call void asm sideeffect "# reg use $0", "{r12}"(i64 %t0)
+  call void asm sideeffect "# reg use $0", "{r13}"(i64 %t1)
+  call void asm sideeffect "# reg use $0", "{r14}"(i64 %t2)
+  call void asm sideeffect "# reg use $0", "{r15}"(i64 %t3)
+  call void asm sideeffect "# reg use $0", "{r16}"(i64 %t4)
+  call void asm sideeffect "# reg use $0", "{r17}"(i64 %t5)
+  call void asm sideeffect "# reg use $0", "{r18}"(i64 %t6)
+  call void asm sideeffect "# reg use $0", "{r19}"(i64 %t7)
+  call void asm sideeffect "# reg use $0", "{r20}"(i64 %t8)
+  ;; r21 Reserved (Non-allocatable)
+  call void asm sideeffect "# reg use $0", "{r22}"(i64 %s9)
+  call void asm sideeffect "# reg use $0", "{r23}"(i64 %s0)
+  call void asm sideeffect "# reg use $0", "{r24}"(i64 %s1)
+  call void asm sideeffect "# reg use $0", "{r25}"(i64 %s2)
+  call void asm sideeffect "# reg use $0", "{r26}"(i64 %s3)
+  call void asm sideeffect "# reg use $0", "{r27}"(i64 %s4)
+  call void asm sideeffect "# reg use $0", "{r28}"(i64 %s5)
+  call void asm sideeffect "# reg use $0", "{r29}"(i64 %s6)
+  call void asm sideeffect "# reg use $0", "{r30}"(i64 %s7)
+  call void asm sideeffect "# reg use $0", "{r31}"(i64 %s8)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index 0ee30120f77a..ad57bbf9ee5c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -588,3 +588,18 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
   %res = bitcast <2 x i1> %y to i2
   ret i2 %res
 }
+
+define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: vmsk_eq_allzeros_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vseqi.b $vr0, $vr0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
+; CHECK-NEXT:    vmskltz.w $vr0, $vr0
+; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
+; CHECK-NEXT:    ret
+  %1 = icmp eq <4 x i8> %a, zeroinitializer
+  %2 = bitcast <4 x i1> %1 to i4
+  ret i4 %2
+}
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
index 9f15604fcca6..69995a0721f8 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
@@ -36,15 +36,15 @@ define void @caller(i32 %n) {
 ;
 ; LA64-LABEL: caller:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s8, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -128
+; LA64-NEXT:    .cfi_def_cfa_offset 128
+; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s8, $sp, 104 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
 ; LA64-NEXT:    .cfi_offset 31, -24
-; LA64-NEXT:    addi.d $fp, $sp, 64
+; LA64-NEXT:    addi.d $fp, $sp, 128
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 5, 0
 ; LA64-NEXT:    move $s8, $sp
@@ -54,14 +54,14 @@ define void @caller(i32 %n) {
 ; LA64-NEXT:    slli.d $a0, $a0, 4
 ; LA64-NEXT:    sub.d $a0, $sp, $a0
 ; LA64-NEXT:    move $sp, $a0
-; LA64-NEXT:    addi.d $a1, $s8, 0
+; LA64-NEXT:    addi.d $a1, $s8, 64
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -64
-; LA64-NEXT:    ld.d $s8, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    addi.d $sp, $fp, -128
+; LA64-NEXT:    ld.d $s8, $sp, 104 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 128
 ; LA64-NEXT:    ret
   %1 = alloca i8, i32 %n
   %2 = alloca i32, align 64
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
index 0645339358b6..0188884543ad 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -28,22 +28,22 @@ define void @caller32() {
 ;
 ; LA64-LABEL: caller32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -32
-; LA64-NEXT:    .cfi_def_cfa_offset 32
-; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -64
+; LA64-NEXT:    .cfi_def_cfa_offset 64
+; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 32
+; LA64-NEXT:    addi.d $fp, $sp, 64
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 4, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 32
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -32
-; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    addi.d $sp, $fp, -64
+; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 64
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 32
   call void @callee(ptr %1)
@@ -102,22 +102,22 @@ define void @caller64() {
 ;
 ; LA64-LABEL: caller64:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -128
+; LA64-NEXT:    .cfi_def_cfa_offset 128
+; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 64
+; LA64-NEXT:    addi.d $fp, $sp, 128
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 5, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 64
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -64
-; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    addi.d $sp, $fp, -128
+; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 128
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 64
   call void @callee(ptr %1)
@@ -176,22 +176,22 @@ define void @caller128() {
 ;
 ; LA64-LABEL: caller128:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -128
-; LA64-NEXT:    .cfi_def_cfa_offset 128
-; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -256
+; LA64-NEXT:    .cfi_def_cfa_offset 256
+; LA64-NEXT:    st.d $ra, $sp, 248 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 240 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 128
+; LA64-NEXT:    addi.d $fp, $sp, 256
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 6, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 128
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -128
-; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 128
+; LA64-NEXT:    addi.d $sp, $fp, -256
+; LA64-NEXT:    ld.d $fp, $sp, 240 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 248 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 256
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 128
   call void @callee(ptr %1)
@@ -250,22 +250,22 @@ define void @caller256() {
 ;
 ; LA64-LABEL: caller256:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -256
-; LA64-NEXT:    .cfi_def_cfa_offset 256
-; LA64-NEXT:    st.d $ra, $sp, 248 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 240 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -512
+; LA64-NEXT:    .cfi_def_cfa_offset 512
+; LA64-NEXT:    st.d $ra, $sp, 504 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 496 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 256
+; LA64-NEXT:    addi.d $fp, $sp, 512
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 7, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 256
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -256
-; LA64-NEXT:    ld.d $fp, $sp, 240 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 248 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 256
+; LA64-NEXT:    addi.d $sp, $fp, -512
+; LA64-NEXT:    ld.d $fp, $sp, 496 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 504 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 512
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 256
   call void @callee(ptr %1)
diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
index eb656ad94e28..6e9d26ab362d 100644
--- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
+++ b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
@@ -24,9 +24,9 @@
 ; NO-WARNING-NOT:  warning: triple-implied ABI conflicts with provided target-abi 'lp64d', using target-abi
 
 ;; Check that ILP32-on-LA64 and LP64-on-LA32 combinations are handled properly.
-; RUN: llc --mtriple=loongarch64 --target-abi=ilp32d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch64-linux-gnu --target-abi=ilp32d --mattr=+d < %s 2>&1 \
 ; RUN:   | FileCheck %s --check-prefixes=LP64D,32ON64
-; RUN: llc --mtriple=loongarch32 --target-abi=lp64d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch32-linux-gnu --target-abi=lp64d --mattr=+d < %s 2>&1 \
 ; RUN:   | FileCheck %s --check-prefixes=ILP32D,64ON32
 
 ; 32ON64: warning: 32-bit ABIs are not supported for 64-bit targets, ignoring and using triple-implied ABI
@@ -49,12 +49,6 @@
 
 ; LP64D-LP64F-NOF: warning: both target-abi and the triple-implied ABI are invalid, ignoring and using feature-implied ABI
 
-;; Check that triple-implied ABI are invalid, use feature-implied ABI
-; RUN: llc --mtriple=loongarch64 --mattr=-f < %s 2>&1 \
-; RUN:   | FileCheck %s --check-prefixes=LP64S,LP64D-NONE-NOF
-
-; LP64D-NONE-NOF: warning: the triple-implied ABI is invalid, ignoring and using feature-implied ABI
-
 define float @f(float %a) {
 ; ILP32D-LABEL: f:
 ; ILP32D:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
index 925fdf3d6064..0d441e66a0c8 100644
--- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
+++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
@@ -121,19 +121,19 @@ define void @t3() {
 ;
 ; LA64-LABEL: t3:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    .cfi_def_cfa_offset 80
 ; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.L.str)
 ; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(.L.str)
 ; LA64-NEXT:    ld.h $a1, $a0, 20
 ; LA64-NEXT:    ld.w $a2, $a0, 16
 ; LA64-NEXT:    ld.d $a3, $a0, 8
 ; LA64-NEXT:    ld.d $a0, $a0, 0
-; LA64-NEXT:    st.h $a1, $sp, 20
-; LA64-NEXT:    st.w $a2, $sp, 16
-; LA64-NEXT:    st.d $a3, $sp, 8
-; LA64-NEXT:    st.d $a0, $sp, 0
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    st.h $a1, $sp, 36
+; LA64-NEXT:    st.w $a2, $sp, 32
+; LA64-NEXT:    st.d $a3, $sp, 24
+; LA64-NEXT:    st.d $a0, $sp, 16
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
 entry:
   %msgbuf = alloca [64 x i8], align 1
diff --git a/llvm/test/CodeGen/LoongArch/vararg.ll b/llvm/test/CodeGen/LoongArch/vararg.ll
index 939cd2015c5b..bc4b8a77c7e1 100644
--- a/llvm/test/CodeGen/LoongArch/vararg.ll
+++ b/llvm/test/CodeGen/LoongArch/vararg.ll
@@ -47,7 +47,7 @@ define i64 @va1(ptr %fmt, ...) {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a1, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a1, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $a1, $fp, -32
 ; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
@@ -94,7 +94,7 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a1, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a1, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $a1, $fp, -32
 ; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
@@ -112,11 +112,11 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind {
 define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-LABEL: va1_va_arg_alloca:
 ; LA64-FPELIM:       # %bb.0:
-; LA64-FPELIM-NEXT:    addi.d $sp, $sp, -96
-; LA64-FPELIM-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    addi.d $fp, $sp, 32
+; LA64-FPELIM-NEXT:    addi.d $sp, $sp, -112
+; LA64-FPELIM-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    addi.d $fp, $sp, 48
 ; LA64-FPELIM-NEXT:    move $s0, $a1
 ; LA64-FPELIM-NEXT:    st.d $a7, $fp, 56
 ; LA64-FPELIM-NEXT:    st.d $a6, $fp, 48
@@ -126,7 +126,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-NEXT:    st.d $a2, $fp, 16
 ; LA64-FPELIM-NEXT:    st.d $a1, $fp, 8
 ; LA64-FPELIM-NEXT:    addi.d $a0, $fp, 16
-; LA64-FPELIM-NEXT:    st.d $a0, $fp, -32
+; LA64-FPELIM-NEXT:    st.d $a0, $fp, -40
 ; LA64-FPELIM-NEXT:    addi.d $a0, $a1, 15
 ; LA64-FPELIM-NEXT:    bstrins.d $a0, $zero, 3, 0
 ; LA64-FPELIM-NEXT:    sub.d $a0, $sp, $a0
@@ -134,20 +134,20 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-NEXT:    pcaddu18i $ra, %call36(notdead)
 ; LA64-FPELIM-NEXT:    jirl $ra, $ra, 0
 ; LA64-FPELIM-NEXT:    move $a0, $s0
-; LA64-FPELIM-NEXT:    addi.d $sp, $fp, -32
-; LA64-FPELIM-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    addi.d $sp, $sp, 96
+; LA64-FPELIM-NEXT:    addi.d $sp, $fp, -48
+; LA64-FPELIM-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    addi.d $sp, $sp, 112
 ; LA64-FPELIM-NEXT:    ret
 ;
 ; LA64-WITHFP-LABEL: va1_va_arg_alloca:
 ; LA64-WITHFP:       # %bb.0:
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -96
-; LA64-WITHFP-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 32
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -112
+; LA64-WITHFP-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 48
 ; LA64-WITHFP-NEXT:    move $s0, $a1
 ; LA64-WITHFP-NEXT:    st.d $a7, $fp, 56
 ; LA64-WITHFP-NEXT:    st.d $a6, $fp, 48
@@ -157,7 +157,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a0, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a0, $fp, -32
+; LA64-WITHFP-NEXT:    st.d $a0, $fp, -40
 ; LA64-WITHFP-NEXT:    addi.d $a0, $a1, 15
 ; LA64-WITHFP-NEXT:    bstrins.d $a0, $zero, 3, 0
 ; LA64-WITHFP-NEXT:    sub.d $a0, $sp, $a0
@@ -165,11 +165,11 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    pcaddu18i $ra, %call36(notdead)
 ; LA64-WITHFP-NEXT:    jirl $ra, $ra, 0
 ; LA64-WITHFP-NEXT:    move $a0, $s0
-; LA64-WITHFP-NEXT:    addi.d $sp, $fp, -32
-; LA64-WITHFP-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
+; LA64-WITHFP-NEXT:    addi.d $sp, $fp, -48
+; LA64-WITHFP-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 112
 ; LA64-WITHFP-NEXT:    ret
   %va = alloca ptr, align 8
   call void @llvm.va_start(ptr %va)
@@ -314,10 +314,10 @@ define void @va_aligned_stack_caller() nounwind {
 ;
 ; LA64-WITHFP-LABEL: va_aligned_stack_caller:
 ; LA64-WITHFP:       # %bb.0:
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -112
-; LA64-WITHFP-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $fp, $sp, 96 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 112
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -128
+; LA64-WITHFP-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 128
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 17
 ; LA64-WITHFP-NEXT:    st.d $a0, $sp, 48
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 16
@@ -336,23 +336,23 @@ define void @va_aligned_stack_caller() nounwind {
 ; LA64-WITHFP-NEXT:    lu32i.d $a0, 335544
 ; LA64-WITHFP-NEXT:    lu52i.d $a0, $a0, -328
 ; LA64-WITHFP-NEXT:    st.d $a0, $sp, 16
-; LA64-WITHFP-NEXT:    st.d $zero, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $zero, $fp, -40
 ; LA64-WITHFP-NEXT:    vrepli.b $vr0, 0
-; LA64-WITHFP-NEXT:    vst $vr0, $fp, -40
+; LA64-WITHFP-NEXT:    vst $vr0, $fp, -56
 ; LA64-WITHFP-NEXT:    ori $a5, $zero, 1000
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 1
 ; LA64-WITHFP-NEXT:    ori $a1, $zero, 11
-; LA64-WITHFP-NEXT:    addi.d $a2, $fp, -48
+; LA64-WITHFP-NEXT:    addi.d $a2, $fp, -64
 ; LA64-WITHFP-NEXT:    ori $a3, $zero, 12
 ; LA64-WITHFP-NEXT:    ori $a4, $zero, 13
 ; LA64-WITHFP-NEXT:    ori $a7, $zero, 1
-; LA64-WITHFP-NEXT:    st.d $a5, $fp, -48
+; LA64-WITHFP-NEXT:    st.d $a5, $fp, -64
 ; LA64-WITHFP-NEXT:    move $a6, $zero
 ; LA64-WITHFP-NEXT:    pcaddu18i $ra, %call36(va_aligned_stack_callee)
 ; LA64-WITHFP-NEXT:    jirl $ra, $ra, 0
-; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 96 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 112
+; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 128
 ; LA64-WITHFP-NEXT:    ret
   %1 = call i32 (i32, ...) @va_aligned_stack_callee(i32 1, i32 11,
     i256 1000, i32 12, i32 13, i128 18446744073709551616, i32 14,
diff --git a/llvm/test/CodeGen/Mips/abiflags-soft-float.ll b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
new file mode 100644
index 000000000000..01821f2d9b6c
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
@@ -0,0 +1,12 @@
+; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o tmp.o
+; RUN: llvm-readobj -A tmp.o | FileCheck %s -check-prefix=OBJ
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | \
+; RUN: FileCheck %s -check-prefix=ASM
+
+; OBJ: FP ABI: Soft float
+; ASM: .module	softfloat 
+
+define dso_local void @asm_is_null() "use-soft-float"="true" {
+  call void asm sideeffect "", ""()
+  ret void
+}
diff --git a/llvm/test/CodeGen/Mips/nan_lowering.ll b/llvm/test/CodeGen/Mips/nan_lowering.ll
new file mode 100644
index 000000000000..2a11278e14b6
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/nan_lowering.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=mips-linux-gnu -mattr=-nan2008 < %s | FileCheck %s
+; RUN: llc -mtriple=mips-linux-gnu -mattr=+nan2008 < %s | FileCheck %s
+
+; Make sure that lowering does not corrupt the value of NaN values,
+; regardless of what the NaN mode is.
+
+define float @test1() {
+; CHECK: .4byte 0x7fc00000
+  ret float bitcast (i32 u0x7fc00000 to float)
+}
+
+define float @test2() {
+; CHECK: .4byte 0x7fc00001
+  ret float bitcast (i32 u0x7fc00001 to float)
+}
+
+define float @test3() {
+; CHECK: .4byte 0x7f800000
+  ret float bitcast (i32 u0x7f800000 to float)
+}
+
+define float @test4() {
+; CHECK: .4byte 0x7f800001
+  ret float bitcast (i32 u0x7f800001 to float)
+}
diff --git a/llvm/test/CodeGen/Mips/qnan.ll b/llvm/test/CodeGen/Mips/qnan.ll
deleted file mode 100644
index e5b4aa1b42ee..000000000000
--- a/llvm/test/CodeGen/Mips/qnan.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu < %s -o - | FileCheck %s -check-prefixes=MIPS_Legacy
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu -mattr=+nan2008 < %s -o - | FileCheck %s -check-prefixes=MIPS_NaN2008
-
-define dso_local float @nan(float noundef %a, float noundef %b) local_unnamed_addr #0 {
-; MIPS_Legacy: $CPI0_0:
-; MIPS_Legacy-NEXT: .4byte  0x7fa00000 # float NaN
-
-; MIPS_NaN2008: $CPI0_0:
-; MIPS_NaN2008-NEXT: .4byte  0x7fc00000 # float NaN
-
-entry:
-  %0 = tail call float @llvm.minimum.f32(float %a, float %b)
-  ret float %0
-}
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc3489c0d..9a051b3fd8bb 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
 ; CHECK-LABEL: test_select_i1_basic_folding(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<12>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .pred %p<13>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
 ; CHECK-NEXT:    setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT:    setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT:    setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT:    setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT:    ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT:    setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT:    setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT:    setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
 ; CHECK-NEXT:    xor.pred %p6, %p1, %p3;
-; CHECK-NEXT:    ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
 ; CHECK-NEXT:    and.pred %p7, %p6, %p4;
-; CHECK-NEXT:    and.pred %p8, %p2, %p4;
-; CHECK-NEXT:    and.pred %p9, %p3, %p7;
-; CHECK-NEXT:    or.pred %p10, %p9, %p8;
-; CHECK-NEXT:    xor.pred %p11, %p10, %p3;
-; CHECK-NEXT:    selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    and.pred %p9, %p2, %p4;
+; CHECK-NEXT:    and.pred %p10, %p3, %p7;
+; CHECK-NEXT:    or.pred %p11, %p10, %p9;
+; CHECK-NEXT:    xor.pred %p12, %p11, %p3;
+; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb1c0b8..44d85589b505 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
 define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: srem_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<22>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<126>;
+; CHECK-NEXT:    .reg .b64 %rd<127>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd62, %r4;
 ; CHECK-NEXT:    add.s64 %rd63, %rd62, 64;
 ; CHECK-NEXT:    selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT:    mov.b64 %rd116, 0;
+; CHECK-NEXT:    mov.b64 %rd117, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT:    setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT:    and.pred %p10, %p8, %p8;
-; CHECK-NEXT:    setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT:    setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT:    and.pred %p13, %p11, %p12;
-; CHECK-NEXT:    or.pred %p14, %p13, %p10;
-; CHECK-NEXT:    or.pred %p15, %p5, %p14;
-; CHECK-NEXT:    xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT:    or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT:    selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT:    selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT:    or.pred %p17, %p15, %p16;
-; CHECK-NEXT:    @%p17 bra $L__BB0_5;
+; CHECK-NEXT:    subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT:    and.pred %p10, %p9, %p8;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT:    or.pred %p12, %p10, %p11;
+; CHECK-NEXT:    or.pred %p13, %p5, %p12;
+; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT:    setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT:    or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd72, 0;
 ; CHECK-NEXT:    cvt.u32.u64 %r5, %rd66;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT:    or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT:    shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT:    mov.b64 %rd113, %rd116;
-; CHECK-NEXT:    @%p18 bra $L__BB0_4;
+; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT:    mov.b64 %rd114, %rd117;
+; CHECK-NEXT:    @%p16 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT:    shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT:    or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT:    shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.b64 %rd113, 0;
-; CHECK-NEXT:    mov.b64 %rd116, %rd113;
+; CHECK-NEXT:    mov.b64 %rd114, 0;
+; CHECK-NEXT:    mov.b64 %rd117, %rd114;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT:    shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT:    or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT:    shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT:    shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT:    or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT:    shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT:    shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT:    shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT:    or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT:    sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT:    subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT:    shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT:    and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT:    and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT:    and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT:    subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT:    add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT:    or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT:    setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT:    @%p21 bra $L__BB0_4;
+; CHECK-NEXT:    shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT:    shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT:    shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT:    shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT:    shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT:    and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT:    or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT:    shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT:    shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT:    or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT:    shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT:    shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd101;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT:    mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT:    mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT:    mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT:    sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT:    subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT:    xor.b64 %rd108, %rd106, %rd2;
+; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT:    mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT:    mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
 ; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd111, %rd112};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<111>;
+; CHECK-NEXT:    .reg .b64 %rd<113>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd101, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.b64 %rd103, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT:    shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd98, %rd101;
+; CHECK-NEXT:    selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd100, %rd103;
 ; CHECK-NEXT:    @%p14 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT:    shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT:    selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd98, 0;
-; CHECK-NEXT:    mov.b64 %rd101, %rd98;
+; CHECK-NEXT:    mov.b64 %rd100, 0;
+; CHECK-NEXT:    mov.b64 %rd103, %rd100;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT:    or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT:    or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT:    or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT:    or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd91;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT:    mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT:    mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT:    mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT:    sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT:    subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT:    mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT:    mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd97, %rd98};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
 define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: sdiv_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<22>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<121>;
+; CHECK-NEXT:    .reg .b64 %rd<122>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
 ; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
 ; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    mov.b64 %rd111, 0;
+; CHECK-NEXT:    mov.b64 %rd112, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT:    setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT:    and.pred %p10, %p8, %p8;
-; CHECK-NEXT:    setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT:    setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT:    and.pred %p13, %p11, %p12;
-; CHECK-NEXT:    or.pred %p14, %p13, %p10;
-; CHECK-NEXT:    or.pred %p15, %p5, %p14;
-; CHECK-NEXT:    xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT:    selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT:    selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT:    or.pred %p17, %p15, %p16;
-; CHECK-NEXT:    @%p17 bra $L__BB4_5;
+; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT:    and.pred %p10, %p9, %p8;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT:    or.pred %p12, %p10, %p11;
+; CHECK-NEXT:    or.pred %p13, %p5, %p12;
+; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT:    setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd73, 0;
 ; CHECK-NEXT:    cvt.u32.u64 %r5, %rd67;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT:    shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT:    mov.b64 %rd108, %rd111;
-; CHECK-NEXT:    @%p18 bra $L__BB4_4;
+; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT:    mov.b64 %rd109, %rd112;
+; CHECK-NEXT:    @%p16 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT:    shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT:    shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd108, 0;
-; CHECK-NEXT:    mov.b64 %rd111, %rd108;
+; CHECK-NEXT:    mov.b64 %rd109, 0;
+; CHECK-NEXT:    mov.b64 %rd112, %rd109;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT:    shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT:    shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT:    shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT:    shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT:    or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT:    and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT:    add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT:    or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT:    setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT:    @%p21 bra $L__BB4_4;
+; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT:    shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT:    shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT:    or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd103, %rd119, %rd5;
 ; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd106, %rd107};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<105>;
+; CHECK-NEXT:    .reg .b64 %rd<107>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd95, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.b64 %rd97, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT:    shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd92, %rd95;
+; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd94, %rd97;
 ; CHECK-NEXT:    @%p14 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT:    shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.b64 %rd92, 0;
-; CHECK-NEXT:    mov.b64 %rd95, %rd92;
+; CHECK-NEXT:    mov.b64 %rd94, 0;
+; CHECK-NEXT:    mov.b64 %rd97, %rd94;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT:    or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT:    or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
index 41e21248a3f0..2796cdb3ae87 100644
--- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
+++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
@@ -1,6 +1,12 @@
 # RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
 # RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
 # RUN:   -o - | FileCheck %s
+# RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
+# RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck %s
+# RUN: llc -mcpu=pwr10 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
+# RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck %s
 
 --- |
   ; ModuleID = 'a.ll'
@@ -30,7 +36,7 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #1
   
-  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
   attributes #1 = { nounwind }
   
   !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll b/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll
new file mode 100644
index 000000000000..c119da6e050a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64
+
+define ptr @lower_args(ptr %_0, i32 %0, i32 %1, i32 %2, i32 %3, ptr %4, ptr %5, i64 %6, i24 %7) {
+; CHECK-LABEL: lower_args:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  ret ptr %_0
+}
+
+define i32 @lower_args_withops_zeroext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i24 %i) {
+; CHECK32-LABEL: lower_args_withops_zeroext:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    lwz r3, 56(r1)
+; CHECK32-NEXT:    addi r3, r3, 255
+; CHECK32-NEXT:    clrlwi r3, r3, 8
+; CHECK32-NEXT:    blr
+;
+; CHECK64-LABEL: lower_args_withops_zeroext:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    lwz r3, 116(r1)
+; CHECK64-NEXT:    addi r3, r3, 255
+; CHECK64-NEXT:    clrldi r3, r3, 40
+; CHECK64-NEXT:    blr
+entry:
+  %0 = add i24 %i, 255
+  %1 = zext i24 %0 to i32
+  ret i32 %1
+}
+
+define i32 @lower_args_withops_signext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i24 signext %i) {
+; CHECK32-LABEL: lower_args_withops_signext:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    lwz r3, 56(r1)
+; CHECK32-NEXT:    slwi r3, r3, 8
+; CHECK32-NEXT:    srawi r3, r3, 8
+; CHECK32-NEXT:    slwi r3, r3, 8
+; CHECK32-NEXT:    addi r3, r3, 22272
+; CHECK32-NEXT:    srawi r3, r3, 8
+; CHECK32-NEXT:    blr
+;
+; CHECK64-LABEL: lower_args_withops_signext:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    lwz r3, 116(r1)
+; CHECK64-NEXT:    slwi r3, r3, 8
+; CHECK64-NEXT:    srawi r3, r3, 8
+; CHECK64-NEXT:    addi r3, r3, 87
+; CHECK64-NEXT:    sldi r3, r3, 40
+; CHECK64-NEXT:    sradi r3, r3, 40
+; CHECK64-NEXT:    blr
+entry:
+  %0 = add i24 %i, 87
+  %1 = sext i24 %0 to i32
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
new file mode 100644
index 000000000000..238e200bfc78
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le < %s | FileCheck %s
+
+define void @test(ptr %p1, ptr %p2) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -224(1)
+; CHECK-NEXT:    li 5, 48
+; CHECK-NEXT:    std 0, 240(1)
+; CHECK-NEXT:    std 27, 184(1) # 8-byte Folded Spill
+; CHECK-NEXT:    li 27, 16
+; CHECK-NEXT:    std 28, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT:    li 29, 32
+; CHECK-NEXT:    li 28, 48
+; CHECK-NEXT:    stxvd2x 56, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 64
+; CHECK-NEXT:    std 30, 208(1) # 8-byte Folded Spill
+; CHECK-NEXT:    mr 30, 4
+; CHECK-NEXT:    stxvd2x 57, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 80
+; CHECK-NEXT:    stxvd2x 58, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 96
+; CHECK-NEXT:    lxvd2x 58, 0, 3
+; CHECK-NEXT:    stxvd2x 59, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 112
+; CHECK-NEXT:    lxvd2x 59, 3, 27
+; CHECK-NEXT:    stxvd2x 60, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 128
+; CHECK-NEXT:    stxvd2x 61, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 144
+; CHECK-NEXT:    stxvd2x 62, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 160
+; CHECK-NEXT:    lxvd2x 62, 3, 28
+; CHECK-NEXT:    stxvd2x 63, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    lxvd2x 63, 3, 29
+; CHECK-NEXT:    xxswapd 57, 58
+; CHECK-NEXT:    xxswapd 1, 59
+; CHECK-NEXT:    xxswapd 60, 62
+; CHECK-NEXT:    xxswapd 61, 63
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 56, 1
+; CHECK-NEXT:    xxlor 1, 59, 59
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 60, 60
+; CHECK-NEXT:    xxmrgld 59, 0, 56
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 60, 1
+; CHECK-NEXT:    xxlor 1, 62, 62
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 61, 61
+; CHECK-NEXT:    xxmrgld 62, 0, 60
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 61, 1
+; CHECK-NEXT:    xxlor 1, 63, 63
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 57, 57
+; CHECK-NEXT:    xxmrgld 63, 0, 61
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 61, 1
+; CHECK-NEXT:    xxlor 1, 58, 58
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    li 3, 160
+; CHECK-NEXT:    stxvd2x 63, 30, 29
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    stxvd2x 62, 30, 28
+; CHECK-NEXT:    stxvd2x 59, 30, 27
+; CHECK-NEXT:    ld 29, 200(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 28, 192(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 27, 184(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 144
+; CHECK-NEXT:    xxmrgld 0, 0, 61
+; CHECK-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 128
+; CHECK-NEXT:    stxvd2x 0, 0, 30
+; CHECK-NEXT:    ld 30, 208(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 112
+; CHECK-NEXT:    lxvd2x 60, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 96
+; CHECK-NEXT:    lxvd2x 59, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 80
+; CHECK-NEXT:    lxvd2x 58, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 64
+; CHECK-NEXT:    lxvd2x 57, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 48
+; CHECK-NEXT:    lxvd2x 56, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 224
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+  %v = load <8 x double>, ptr %p1, align 64
+  %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %v)
+  store <8 x double> %res, ptr %p2, align 64
+  ret void
+}
+
+declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
index 232014db9a01..a9503f77c309 100644
--- a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -2,22 +2,87 @@
 ; Verify whether the generated assembly for the following function includes the mtvsrbmi instruction.
 ; vector unsigned char v00FF()
 ; {
-; vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
-; return x;
+;   vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
+;   return x;
+; }
+; vector unsigned short short00FF()
+; {
+;   vector unsigned short x = { 0xFF, 0,0,0, 0,0,0,0};
+;   return x;
+; }
+; vector unsigned int int00FF()
+; {
+;   vector unsigned int x = { 0xFF, 0,0,0};
+;   return x;
+; }
+; vector unsigned long long  longlong00FF()
+; {
+;   vector unsigned long long x = { 0xFF, 0};
+;   return x;
 ; }
 
 ; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc-ibm-aix -mcpu=pwr10  -verify-machineinstrs \
-; RUN:   | FileCheck %s --check-prefix=CHECK
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+
+; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr10  -verify-machineinstrs \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+
+; CHECK-NOT:   .byte   255
+; CHECK-NOT:   .byte   0
 
 define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
-; CHECK-NOT:      L..CPI0_0:
-; CHECK-NOT:   .byte   255                             # 0xff
-; CHECK-NOT:   .byte   0                               # 0x0
-
-; CHECK-LABEL: _Z5v00FFv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mtvsrbmi v2, 1
-; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: _Z5v00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 32768
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z5v00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+
 entry:
   ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
 }
+
+define dso_local noundef range(i16 0, 256) <8 x i16> @_Z9short00FFv() {
+; CHECK-BE-LABEL: _Z9short00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 16384
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z9short00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <8 x i16> <i16 255, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+define dso_local noundef range(i32 0, 256) <4 x i32> @_Z7int00FFv() {
+; CHECK-BE-LABEL: _Z7int00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 4096
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z7int00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <4 x i32> <i32 255, i32 0, i32 0, i32 0>
+}
+
+define dso_local noundef range(i64 0, 256) <2 x i64> @_Z12longlong00FFv() {
+; CHECK-BE-LABEL: _Z12longlong00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 256
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z12longlong00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <2 x i64> <i64 255, i64 0>
+}
diff --git a/llvm/test/CodeGen/PowerPC/nofpclass.ll b/llvm/test/CodeGen/PowerPC/nofpclass.ll
new file mode 100644
index 000000000000..b08e810cd1cc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/nofpclass.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-ibm-aix-xcoff < %s | FileCheck %s
+
+; TODO: Update this test after adding the proper expansion of nofpclass for
+; ppc_fp128 to test with more masks and to demonstrate preserving nofpclass
+; after legalization.
+
+define ppc_fp128 @f(ppc_fp128 nofpclass(nan) %s) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  ret ppc_fp128 %s
+}
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 821cfd00dcd0..b540948b20f7 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,8 +764,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ;
 ; CHECK-PWR7-LABEL: sub_absv_8_ext:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    stdu r1, -448(r1)
-; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 448
+; CHECK-PWR7-NEXT:    stdu r1, -512(r1)
+; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT:    .cfi_offset r14, -144
+; CHECK-PWR7-NEXT:    .cfi_offset r15, -136
+; CHECK-PWR7-NEXT:    .cfi_offset r16, -128
+; CHECK-PWR7-NEXT:    .cfi_offset r17, -120
+; CHECK-PWR7-NEXT:    .cfi_offset r18, -112
 ; CHECK-PWR7-NEXT:    .cfi_offset r19, -104
 ; CHECK-PWR7-NEXT:    .cfi_offset r20, -96
 ; CHECK-PWR7-NEXT:    .cfi_offset r21, -88
@@ -778,258 +783,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ; CHECK-PWR7-NEXT:    .cfi_offset r28, -32
 ; CHECK-PWR7-NEXT:    .cfi_offset r29, -24
 ; CHECK-PWR7-NEXT:    .cfi_offset r30, -16
-; CHECK-PWR7-NEXT:    addi r3, r1, 304
-; CHECK-PWR7-NEXT:    std r19, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r20, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r21, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r22, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r23, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r24, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r25, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r26, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r27, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r28, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r29, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r30, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    .cfi_offset r31, -8
+; CHECK-PWR7-NEXT:    .cfi_offset r2, -152
 ; CHECK-PWR7-NEXT:    addi r3, r1, 320
-; CHECK-PWR7-NEXT:    lbz r7, 304(r1)
-; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    lbz r8, 320(r1)
-; CHECK-PWR7-NEXT:    lbz r9, 305(r1)
-; CHECK-PWR7-NEXT:    lbz r10, 321(r1)
-; CHECK-PWR7-NEXT:    lbz r26, 325(r1)
-; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT:    clrlwi r8, r8, 24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT:    lbz r11, 306(r1)
-; CHECK-PWR7-NEXT:    lbz r12, 322(r1)
-; CHECK-PWR7-NEXT:    lbz r23, 314(r1)
-; CHECK-PWR7-NEXT:    clrlwi r22, r26, 24
-; CHECK-PWR7-NEXT:    lbz r26, 330(r1)
-; CHECK-PWR7-NEXT:    sub r8, r7, r8
-; CHECK-PWR7-NEXT:    lbz r7, 315(r1)
-; CHECK-PWR7-NEXT:    sub r20, r9, r10
-; CHECK-PWR7-NEXT:    lbz r9, 331(r1)
-; CHECK-PWR7-NEXT:    lbz r0, 307(r1)
-; CHECK-PWR7-NEXT:    lbz r30, 323(r1)
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r12, r12, 24
-; CHECK-PWR7-NEXT:    clrlwi r23, r23, 24
-; CHECK-PWR7-NEXT:    clrlwi r21, r26, 24
-; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r0, r0, 24
-; CHECK-PWR7-NEXT:    clrlwi r30, r30, 24
-; CHECK-PWR7-NEXT:    lbz r29, 308(r1)
-; CHECK-PWR7-NEXT:    lbz r28, 324(r1)
-; CHECK-PWR7-NEXT:    lbz r27, 309(r1)
-; CHECK-PWR7-NEXT:    lbz r25, 310(r1)
-; CHECK-PWR7-NEXT:    lbz r24, 326(r1)
-; CHECK-PWR7-NEXT:    sub r19, r11, r12
-; CHECK-PWR7-NEXT:    sub r11, r23, r21
-; CHECK-PWR7-NEXT:    sub r9, r7, r9
-; CHECK-PWR7-NEXT:    sub r26, r0, r30
-; CHECK-PWR7-NEXT:    srawi r12, r11, 31
-; CHECK-PWR7-NEXT:    srawi r0, r9, 31
-; CHECK-PWR7-NEXT:    lbz r3, 312(r1)
-; CHECK-PWR7-NEXT:    clrlwi r29, r29, 24
-; CHECK-PWR7-NEXT:    clrlwi r28, r28, 24
-; CHECK-PWR7-NEXT:    clrlwi r27, r27, 24
-; CHECK-PWR7-NEXT:    clrlwi r25, r25, 24
-; CHECK-PWR7-NEXT:    clrlwi r24, r24, 24
-; CHECK-PWR7-NEXT:    xor r11, r11, r12
-; CHECK-PWR7-NEXT:    xor r9, r9, r0
-; CHECK-PWR7-NEXT:    sub r28, r29, r28
-; CHECK-PWR7-NEXT:    sub r30, r27, r22
-; CHECK-PWR7-NEXT:    sub r29, r25, r24
-; CHECK-PWR7-NEXT:    sub r27, r11, r12
-; CHECK-PWR7-NEXT:    sub r24, r9, r0
-; CHECK-PWR7-NEXT:    lbz r9, 316(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 332(r1)
-; CHECK-PWR7-NEXT:    lbz r4, 328(r1)
-; CHECK-PWR7-NEXT:    lbz r5, 311(r1)
-; CHECK-PWR7-NEXT:    lbz r6, 327(r1)
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r3, r3, 24
-; CHECK-PWR7-NEXT:    clrlwi r4, r4, 24
-; CHECK-PWR7-NEXT:    clrlwi r5, r5, 24
-; CHECK-PWR7-NEXT:    clrlwi r6, r6, 24
-; CHECK-PWR7-NEXT:    sub r3, r3, r4
+; CHECK-PWR7-NEXT:    std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    lbz r3, 320(r1)
+; CHECK-PWR7-NEXT:    addi r4, r1, 336
+; CHECK-PWR7-NEXT:    stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT:    lbz r15, 334(r1)
+; CHECK-PWR7-NEXT:    lbz r14, 350(r1)
+; CHECK-PWR7-NEXT:    lbz r31, 335(r1)
+; CHECK-PWR7-NEXT:    lbz r2, 351(r1)
+; CHECK-PWR7-NEXT:    sub r15, r15, r14
+; CHECK-PWR7-NEXT:    sub r14, r31, r2
+; CHECK-PWR7-NEXT:    srawi r2, r14, 31
+; CHECK-PWR7-NEXT:    xor r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r3, 333(r1)
+; CHECK-PWR7-NEXT:    lbz r19, 331(r1)
+; CHECK-PWR7-NEXT:    lbz r18, 347(r1)
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    lbz r17, 332(r1)
+; CHECK-PWR7-NEXT:    lbz r16, 348(r1)
+; CHECK-PWR7-NEXT:    sub r17, r17, r16
+; CHECK-PWR7-NEXT:    lbz r23, 329(r1)
+; CHECK-PWR7-NEXT:    sub r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r2, 349(r1)
+; CHECK-PWR7-NEXT:    lbz r22, 345(r1)
+; CHECK-PWR7-NEXT:    lbz r4, 336(r1)
+; CHECK-PWR7-NEXT:    lbz r5, 321(r1)
+; CHECK-PWR7-NEXT:    lbz r6, 337(r1)
+; CHECK-PWR7-NEXT:    lbz r7, 322(r1)
+; CHECK-PWR7-NEXT:    lbz r8, 338(r1)
+; CHECK-PWR7-NEXT:    lbz r9, 323(r1)
+; CHECK-PWR7-NEXT:    lbz r10, 339(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 324(r1)
+; CHECK-PWR7-NEXT:    lbz r12, 340(r1)
+; CHECK-PWR7-NEXT:    lbz r0, 325(r1)
+; CHECK-PWR7-NEXT:    lbz r30, 341(r1)
+; CHECK-PWR7-NEXT:    lbz r29, 326(r1)
+; CHECK-PWR7-NEXT:    lbz r28, 342(r1)
+; CHECK-PWR7-NEXT:    lbz r27, 327(r1)
+; CHECK-PWR7-NEXT:    lbz r26, 343(r1)
+; CHECK-PWR7-NEXT:    sub r3, r3, r2
+; CHECK-PWR7-NEXT:    lbz r25, 328(r1)
+; CHECK-PWR7-NEXT:    lbz r24, 344(r1)
+; CHECK-PWR7-NEXT:    lbz r21, 330(r1)
+; CHECK-PWR7-NEXT:    lbz r20, 346(r1)
 ; CHECK-PWR7-NEXT:    sub r5, r5, r6
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    srawi r4, r3, 31
+; CHECK-PWR7-NEXT:    srawi r18, r3, 31
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
+; CHECK-PWR7-NEXT:    sub r0, r0, r30
+; CHECK-PWR7-NEXT:    sub r29, r29, r28
+; CHECK-PWR7-NEXT:    sub r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r25, r25, r24
+; CHECK-PWR7-NEXT:    srawi r31, r15, 31
+; CHECK-PWR7-NEXT:    ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    xor r3, r3, r18
 ; CHECK-PWR7-NEXT:    srawi r6, r5, 31
-; CHECK-PWR7-NEXT:    xor r3, r3, r4
-; CHECK-PWR7-NEXT:    sldi r27, r27, 56
-; CHECK-PWR7-NEXT:    xor r5, r5, r6
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    sub r3, r3, r4
-; CHECK-PWR7-NEXT:    sldi r24, r24, 56
+; CHECK-PWR7-NEXT:    srawi r8, r7, 31
+; CHECK-PWR7-NEXT:    srawi r10, r9, 31
+; CHECK-PWR7-NEXT:    srawi r12, r11, 31
+; CHECK-PWR7-NEXT:    srawi r30, r0, 31
+; CHECK-PWR7-NEXT:    sub r3, r3, r18
+; CHECK-PWR7-NEXT:    srawi r18, r19, 31
+; CHECK-PWR7-NEXT:    srawi r28, r29, 31
+; CHECK-PWR7-NEXT:    ld r16, 384(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r27, 208(r1)
-; CHECK-PWR7-NEXT:    sub r4, r5, r6
-; CHECK-PWR7-NEXT:    std r27, 216(r1)
-; CHECK-PWR7-NEXT:    srawi r27, r29, 31
-; CHECK-PWR7-NEXT:    lbz r10, 313(r1)
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    std r24, 224(r1)
-; CHECK-PWR7-NEXT:    lbz r22, 329(r1)
-; CHECK-PWR7-NEXT:    std r24, 232(r1)
-; CHECK-PWR7-NEXT:    srawi r24, r30, 31
-; CHECK-PWR7-NEXT:    ld r21, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r23, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 317(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 333(r1)
-; CHECK-PWR7-NEXT:    xor r29, r29, r27
-; CHECK-PWR7-NEXT:    std r3, 176(r1)
-; CHECK-PWR7-NEXT:    std r3, 184(r1)
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    sldi r23, r23, 56
-; CHECK-PWR7-NEXT:    xor r30, r30, r24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    sub r4, r30, r24
-; CHECK-PWR7-NEXT:    ld r30, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r3, 160(r1)
-; CHECK-PWR7-NEXT:    std r3, 168(r1)
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    sub r3, r29, r27
-; CHECK-PWR7-NEXT:    std r23, 240(r1)
-; CHECK-PWR7-NEXT:    ld r29, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r23, 248(r1)
-; CHECK-PWR7-NEXT:    ld r27, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r23, r28, 31
+; CHECK-PWR7-NEXT:    srawi r26, r27, 31
+; CHECK-PWR7-NEXT:    srawi r24, r25, 31
+; CHECK-PWR7-NEXT:    xor r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r5, r5, r6
+; CHECK-PWR7-NEXT:    std r3, 272(r1)
+; CHECK-PWR7-NEXT:    std r3, 280(r1)
+; CHECK-PWR7-NEXT:    srawi r3, r17, 31
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r17, r17, r3
+; CHECK-PWR7-NEXT:    xor r9, r9, r10
+; CHECK-PWR7-NEXT:    xor r11, r11, r12
+; CHECK-PWR7-NEXT:    xor r0, r0, r30
+; CHECK-PWR7-NEXT:    xor r29, r29, r28
+; CHECK-PWR7-NEXT:    xor r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r3, r17, r3
+; CHECK-PWR7-NEXT:    xor r25, r25, r24
+; CHECK-PWR7-NEXT:    sub r25, r25, r24
+; CHECK-PWR7-NEXT:    sub r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r29, r29, r28
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    xor r28, r28, r23
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    std r3, 144(r1)
-; CHECK-PWR7-NEXT:    ld r24, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r3, 152(r1)
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    sub r25, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 318(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 334(r1)
-; CHECK-PWR7-NEXT:    std r3, 128(r1)
+; CHECK-PWR7-NEXT:    sub r0, r0, r30
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r5, r5, r6
+; CHECK-PWR7-NEXT:    sldi r14, r14, 56
+; CHECK-PWR7-NEXT:    sldi r15, r15, 56
+; CHECK-PWR7-NEXT:    ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 256(r1)
+; CHECK-PWR7-NEXT:    std r3, 264(r1)
+; CHECK-PWR7-NEXT:    sldi r3, r19, 56
 ; CHECK-PWR7-NEXT:    sldi r25, r25, 56
-; CHECK-PWR7-NEXT:    std r3, 136(r1)
-; CHECK-PWR7-NEXT:    sub r3, r28, r23
+; CHECK-PWR7-NEXT:    sldi r27, r27, 56
+; CHECK-PWR7-NEXT:    std r3, 240(r1)
+; CHECK-PWR7-NEXT:    std r3, 248(r1)
+; CHECK-PWR7-NEXT:    sub r3, r23, r22
+; CHECK-PWR7-NEXT:    srawi r23, r3, 31
+; CHECK-PWR7-NEXT:    sub r22, r21, r20
+; CHECK-PWR7-NEXT:    srawi r21, r22, 31
+; CHECK-PWR7-NEXT:    sldi r29, r29, 56
+; CHECK-PWR7-NEXT:    sldi r0, r0, 56
+; CHECK-PWR7-NEXT:    sldi r11, r11, 56
+; CHECK-PWR7-NEXT:    xor r3, r3, r23
+; CHECK-PWR7-NEXT:    xor r22, r22, r21
+; CHECK-PWR7-NEXT:    sldi r9, r9, 56
+; CHECK-PWR7-NEXT:    sldi r7, r7, 56
+; CHECK-PWR7-NEXT:    sldi r5, r5, 56
+; CHECK-PWR7-NEXT:    ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r3, r3, r23
+; CHECK-PWR7-NEXT:    sub r22, r22, r21
+; CHECK-PWR7-NEXT:    std r14, 304(r1)
+; CHECK-PWR7-NEXT:    ld r26, 464(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    std r3, 112(r1)
-; CHECK-PWR7-NEXT:    ld r28, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT:    std r25, 256(r1)
-; CHECK-PWR7-NEXT:    std r25, 264(r1)
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    srawi r25, r26, 31
-; CHECK-PWR7-NEXT:    xor r26, r26, r25
-; CHECK-PWR7-NEXT:    ld r23, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r3, 120(r1)
-; CHECK-PWR7-NEXT:    sub r4, r26, r25
-; CHECK-PWR7-NEXT:    clrlwi r22, r22, 24
-; CHECK-PWR7-NEXT:    srawi r7, r8, 31
-; CHECK-PWR7-NEXT:    sub r10, r10, r22
-; CHECK-PWR7-NEXT:    ld r26, 400(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    srawi r22, r10, 31
-; CHECK-PWR7-NEXT:    xor r8, r8, r7
-; CHECK-PWR7-NEXT:    xor r10, r10, r22
-; CHECK-PWR7-NEXT:    sub r10, r10, r22
-; CHECK-PWR7-NEXT:    ld r25, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r12, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 319(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 335(r1)
-; CHECK-PWR7-NEXT:    std r3, 96(r1)
-; CHECK-PWR7-NEXT:    sldi r12, r12, 56
-; CHECK-PWR7-NEXT:    std r3, 104(r1)
-; CHECK-PWR7-NEXT:    ld r22, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sldi r10, r10, 56
-; CHECK-PWR7-NEXT:    std r10, 192(r1)
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    std r12, 272(r1)
-; CHECK-PWR7-NEXT:    std r12, 280(r1)
-; CHECK-PWR7-NEXT:    srawi r12, r19, 31
-; CHECK-PWR7-NEXT:    xor r0, r19, r12
-; CHECK-PWR7-NEXT:    ld r19, 344(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r3, r0, r12
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r10, 200(r1)
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
+; CHECK-PWR7-NEXT:    sldi r22, r22, 56
+; CHECK-PWR7-NEXT:    ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r14, 312(r1)
+; CHECK-PWR7-NEXT:    std r15, 288(r1)
+; CHECK-PWR7-NEXT:    std r3, 208(r1)
+; CHECK-PWR7-NEXT:    std r3, 216(r1)
+; CHECK-PWR7-NEXT:    lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r15, 296(r1)
+; CHECK-PWR7-NEXT:    ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r22, 224(r1)
+; CHECK-PWR7-NEXT:    std r22, 232(r1)
+; CHECK-PWR7-NEXT:    sub r4, r3, r4
+; CHECK-PWR7-NEXT:    std r25, 192(r1)
+; CHECK-PWR7-NEXT:    ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r3, r4, 31
+; CHECK-PWR7-NEXT:    std r25, 200(r1)
+; CHECK-PWR7-NEXT:    ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r27, 176(r1)
+; CHECK-PWR7-NEXT:    std r27, 184(r1)
+; CHECK-PWR7-NEXT:    xor r4, r4, r3
+; CHECK-PWR7-NEXT:    std r29, 160(r1)
+; CHECK-PWR7-NEXT:    ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r29, 168(r1)
+; CHECK-PWR7-NEXT:    std r0, 144(r1)
+; CHECK-PWR7-NEXT:    sub r3, r4, r3
+; CHECK-PWR7-NEXT:    std r0, 152(r1)
+; CHECK-PWR7-NEXT:    ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r18, 400(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    std r3, 80(r1)
-; CHECK-PWR7-NEXT:    std r3, 88(r1)
-; CHECK-PWR7-NEXT:    sldi r9, r9, 56
-; CHECK-PWR7-NEXT:    std r9, 288(r1)
-; CHECK-PWR7-NEXT:    std r9, 296(r1)
-; CHECK-PWR7-NEXT:    srawi r9, r20, 31
-; CHECK-PWR7-NEXT:    xor r11, r20, r9
-; CHECK-PWR7-NEXT:    ld r20, 352(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r4, r11, r9
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
+; CHECK-PWR7-NEXT:    std r11, 128(r1)
+; CHECK-PWR7-NEXT:    ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r11, 136(r1)
+; CHECK-PWR7-NEXT:    std r9, 112(r1)
 ; CHECK-PWR7-NEXT:    std r3, 64(r1)
 ; CHECK-PWR7-NEXT:    std r3, 72(r1)
-; CHECK-PWR7-NEXT:    sub r3, r8, r7
-; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    std r3, 48(r1)
-; CHECK-PWR7-NEXT:    std r3, 56(r1)
-; CHECK-PWR7-NEXT:    addi r3, r1, 288
+; CHECK-PWR7-NEXT:    addi r3, r1, 304
+; CHECK-PWR7-NEXT:    std r9, 120(r1)
+; CHECK-PWR7-NEXT:    ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r7, 96(r1)
+; CHECK-PWR7-NEXT:    std r7, 104(r1)
+; CHECK-PWR7-NEXT:    std r5, 80(r1)
+; CHECK-PWR7-NEXT:    std r5, 88(r1)
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    addi r3, r1, 288
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 256
+; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    ld r14, 368(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 240
+; CHECK-PWR7-NEXT:    addi r3, r1, 256
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 224
+; CHECK-PWR7-NEXT:    addi r3, r1, 240
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 208
+; CHECK-PWR7-NEXT:    addi r3, r1, 224
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 192
+; CHECK-PWR7-NEXT:    addi r3, r1, 208
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 176
+; CHECK-PWR7-NEXT:    addi r3, r1, 192
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 160
+; CHECK-PWR7-NEXT:    addi r3, r1, 176
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs0, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 144
+; CHECK-PWR7-NEXT:    addi r3, r1, 160
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 128
+; CHECK-PWR7-NEXT:    addi r3, r1, 144
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 112
+; CHECK-PWR7-NEXT:    addi r3, r1, 128
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT:    addi r3, r1, 112
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 80
+; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 64
+; CHECK-PWR7-NEXT:    addi r3, r1, 80
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 48
+; CHECK-PWR7-NEXT:    addi r3, r1, 64
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs1, v3, v2
 ; CHECK-PWR7-NEXT:    xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT:    addi r1, r1, 448
+; CHECK-PWR7-NEXT:    addi r1, r1, 512
 ; CHECK-PWR7-NEXT:    blr
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/tls-picgot.ll b/llvm/test/CodeGen/PowerPC/tls-picgot.ll
new file mode 100644
index 000000000000..6562d864d1ba
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/tls-picgot.ll
@@ -0,0 +1,31 @@
+; RUN: llc -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s
+
+target triple = "powerpc-unknown-linux-gnu"
+
+; Test that LR is preserved when PPC32PICGOT clobbers it with a local "bl".
+
+@TLS = external thread_local global i8
+
+; CHECK-LABEL: tls_addr:
+; CHECK:        mflr [[SAVED_REG:[0-9]+]]
+
+; CHECK:        bl [[JUMP:\.L[[:alnum:]_]+]]
+; CHECK-NEXT:   [[OFFSET:\.L[[:alnum:]_]+]]:
+; CHECK-NEXT:   .long _GLOBAL_OFFSET_TABLE_-[[OFFSET]]
+; CHECK-NEXT:   [[JUMP]]
+; CHECK-NEXT:   mflr {{[0-9]+}}
+
+; CHECK:        mtlr [[SAVED_REG]]
+; CHECK-NEXT:   blr
+
+define ptr @tls_addr() unnamed_addr {
+  %1 = call ptr @llvm.threadlocal.address.p0(ptr @TLS)
+  ret ptr %1
+}
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 8, !"PIC Level", i32 2}
+!1 = !{i32 7, !"PIE Level", i32 2}
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a614d6a..117e3e4aac45 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti
-; RV32IF-NEXT:    lw a0, 8(sp)
-; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a2, 20(sp)
+; RV32IF-NEXT:    lw a0, 20(sp)
+; RV32IF-NEXT:    lw a1, 8(sp)
+; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 16(sp)
-; RV32IF-NEXT:    beqz a2, .LBB47_2
+; RV32IF-NEXT:    beqz a0, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a2, 0
+; RV32IF-NEXT:    slti a4, a0, 0
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
 ; RV32IF-NEXT:  .LBB47_3: # %entry
 ; RV32IF-NEXT:    xori a3, a3, 1
-; RV32IF-NEXT:    or a3, a3, a2
+; RV32IF-NEXT:    or a3, a3, a0
 ; RV32IF-NEXT:    seqz a3, a3
 ; RV32IF-NEXT:    addi a3, a3, -1
 ; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    neg a3, a3
+; RV32IF-NEXT:    and a2, a3, a2
 ; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
-; RV32IF-NEXT:    and a2, a3, a2
-; RV32IF-NEXT:    slti a2, a2, 0
-; RV32IF-NEXT:    addi a2, a2, -1
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    and a1, a2, a1
+; RV32IF-NEXT:    slti a0, a0, 0
+; RV32IF-NEXT:    addi a3, a0, -1
+; RV32IF-NEXT:    and a0, a3, a1
+; RV32IF-NEXT:    and a1, a3, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti
-; RV32IFD-NEXT:    lw a0, 8(sp)
-; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 20(sp)
+; RV32IFD-NEXT:    lw a0, 20(sp)
+; RV32IFD-NEXT:    lw a1, 8(sp)
+; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 16(sp)
-; RV32IFD-NEXT:    beqz a2, .LBB47_2
+; RV32IFD-NEXT:    beqz a0, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a2, 0
+; RV32IFD-NEXT:    slti a4, a0, 0
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
 ; RV32IFD-NEXT:  .LBB47_3: # %entry
 ; RV32IFD-NEXT:    xori a3, a3, 1
-; RV32IFD-NEXT:    or a3, a3, a2
+; RV32IFD-NEXT:    or a3, a3, a0
 ; RV32IFD-NEXT:    seqz a3, a3
 ; RV32IFD-NEXT:    addi a3, a3, -1
 ; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a2, a3, a2
 ; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    and a2, a3, a2
-; RV32IFD-NEXT:    slti a2, a2, 0
-; RV32IFD-NEXT:    addi a2, a2, -1
-; RV32IFD-NEXT:    and a0, a2, a0
-; RV32IFD-NEXT:    and a1, a2, a1
+; RV32IFD-NEXT:    slti a0, a0, 0
+; RV32IFD-NEXT:    addi a3, a0, -1
+; RV32IFD-NEXT:    and a0, a3, a1
+; RV32IFD-NEXT:    and a1, a3, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB50_2
+; RV32-NEXT:    beqz a0, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB50_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB53_2
+; RV32-NEXT:    beqz a0, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB53_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index b1a6d163664e..97d102561129 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
 define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV32-LABEL: ctz_nxv4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    vmv.v.i v11, -1
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT:    vid.v v10
+; RV32-NEXT:    li a1, -1
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vmacc.vv v8, v10, v11
-; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vmadd.vx v10, a1, v8
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    sub a0, a0, a1
@@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ;
 ; RV64-LABEL: ctz_nxv4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    vmv.v.i v11, -1
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmacc.vv v8, v10, v11
-; RV64-NEXT:    vmv.v.i v9, 0
-; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vmadd.vx v10, a1, v8
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    subw a0, a0, a1
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ;
 ; RV64-LABEL: ctz_nxv8i1_no_range:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vid.v v16
-; RV64-NEXT:    vmv.v.i v24, -1
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vid.v v16
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmacc.vv v8, v16, v24
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT:    vmadd.vx v16, a1, v8
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/pr148084.ll b/llvm/test/CodeGen/RISCV/pr148084.ll
new file mode 100644
index 000000000000..9fa26c74021c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr148084.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+source_filename = "external/libaom/av1/encoder/tx_search.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-android10000"
+
+define fastcc void @search_tx_type() #0 {
+; CHECK-LABEL: search_tx_type:
+; CHECK:       # %bb.0: # %._crit_edge.i
+; CHECK-NEXT:  # %bb.1: # %bb
+; CHECK-NEXT:    lbu a1, 0(zero)
+; CHECK-NEXT:    lw a0, 0(zero)
+; CHECK-NEXT:    lh a2, 0(zero)
+; CHECK-NEXT:    seqz a1, a1
+; CHECK-NEXT:    srai a3, a0, 63
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    andi a2, a1, 1
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    or a3, a3, a0
+; CHECK-NEXT:    or a2, a2, a3
+; CHECK-NEXT:    bgez a2, .LBB0_3
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    bexti a3, a1, 1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:  .LBB0_3: # %bb
+; CHECK-NEXT:    andi a4, a1, 4
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a4, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_5: # %bb
+; CHECK-NEXT:    blt a2, a0, .LBB0_7
+; CHECK-NEXT:  # %bb.6: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_7: # %bb
+; CHECK-NEXT:    andi a5, a1, 8
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_9
+; CHECK-NEXT:  # %bb.8: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_9: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_11
+; CHECK-NEXT:  # %bb.10: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_11: # %bb
+; CHECK-NEXT:    andi a5, a1, 16
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_13
+; CHECK-NEXT:  # %bb.12: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_13: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_15
+; CHECK-NEXT:  # %bb.14: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_15: # %bb
+; CHECK-NEXT:    andi a5, a1, 32
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_17
+; CHECK-NEXT:  # %bb.16: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_17: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_19
+; CHECK-NEXT:  # %bb.18: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_19: # %bb
+; CHECK-NEXT:    andi a5, a1, 64
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_21
+; CHECK-NEXT:  # %bb.20: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_21: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_23
+; CHECK-NEXT:  # %bb.22: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_23: # %bb
+; CHECK-NEXT:    andi a5, a1, 128
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_25
+; CHECK-NEXT:  # %bb.24: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_25: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_27
+; CHECK-NEXT:  # %bb.26: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_27: # %bb
+; CHECK-NEXT:    andi a5, a1, 256
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_29
+; CHECK-NEXT:  # %bb.28: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_29: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_31
+; CHECK-NEXT:  # %bb.30: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_31: # %bb
+; CHECK-NEXT:    andi a5, a1, 512
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_33
+; CHECK-NEXT:  # %bb.32: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_33: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_35
+; CHECK-NEXT:  # %bb.34: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_35: # %bb
+; CHECK-NEXT:    andi a5, a1, 1024
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_37
+; CHECK-NEXT:  # %bb.36: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_37: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_39
+; CHECK-NEXT:  # %bb.38: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_39: # %bb
+; CHECK-NEXT:    slli a5, a1, 52
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    bgez a5, .LBB0_41
+; CHECK-NEXT:  # %bb.40: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_41: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_43
+; CHECK-NEXT:  # %bb.42: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_43: # %bb
+; CHECK-NEXT:    slli a4, a1, 51
+; CHECK-NEXT:    sext.w a3, a2
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    bltz a4, .LBB0_49
+; CHECK-NEXT:  # %bb.44: # %bb
+; CHECK-NEXT:    bge a3, a0, .LBB0_50
+; CHECK-NEXT:  .LBB0_45: # %bb
+; CHECK-NEXT:    sext.w a2, a1
+; CHECK-NEXT:    blt a2, a0, .LBB0_47
+; CHECK-NEXT:  .LBB0_46: # %bb
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB0_47: # %bb
+; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:  # %bb.48: # %get_tx_mask.exit
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_49: # %bb
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    blt a3, a0, .LBB0_45
+; CHECK-NEXT:  .LBB0_50: # %bb
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    sext.w a2, a2
+; CHECK-NEXT:    bge a2, a0, .LBB0_46
+; CHECK-NEXT:    j .LBB0_47
+._crit_edge.i:
+  %.in196.i = load i16, ptr null, align 2
+  %i2 = load i16, ptr null, align 2
+  %i3 = and i16 %i2, %.in196.i
+  %i9 = trunc nuw i8 0 to i1
+  br i1 %i9, label %get_tx_mask.exit, label %bb
+
+bb:                                               ; preds = %._crit_edge.i
+  %i13 = load i8, ptr null, align 1
+  %i14 = icmp eq i8 %i13, 0
+  %spec.select211.i = select i1 %i14, i16 0, i16 %i3
+  %i19 = load i32, ptr null, align 4
+  %i20 = zext i16 %spec.select211.i to i32
+  %i21 = load i32, ptr null, align 4
+  %i22 = icmp sgt i32 %i21, -1
+  %i23 = and i32 %i20, 1
+  %.not203.i = icmp eq i32 %i23, 0
+  %spec.select212.i = select i1 %.not203.i, i32 -1, i32 %i21
+  %.1174.i = select i1 %i22, i32 %spec.select212.i, i32 -1
+  %i28 = icmp sgt i32 0, %.1174.i
+  %i29 = and i32 %i20, 2
+  %.not203.1.not.i = icmp eq i32 %i29, 0
+  %spec.select212.1.i = select i1 %.not203.1.not.i, i32 %.1174.i, i32 0
+  %.1174.1.i = select i1 %i28, i32 %spec.select212.1.i, i32 %.1174.i
+  %i30 = load i32, ptr null, align 4
+  %i31 = icmp sgt i32 %i30, %.1174.1.i
+  %i32 = and i32 %i20, 4
+  %.not203.2.i = icmp eq i32 %i32, 0
+  %spec.select212.2.i = select i1 %.not203.2.i, i32 %.1174.1.i, i32 %i30
+  %.1174.2.i = select i1 %i31, i32 %spec.select212.2.i, i32 %.1174.1.i
+  %i36 = load i32, ptr null, align 4
+  %i37 = icmp sgt i32 %i36, %.1174.2.i
+  %i38 = and i32 %i20, 8
+  %.not203.3.i = icmp eq i32 %i38, 0
+  %spec.select212.3.i = select i1 %.not203.3.i, i32 %.1174.2.i, i32 %i36
+  %.1174.3.i = select i1 %i37, i32 %spec.select212.3.i, i32 %.1174.2.i
+  %i42 = load i32, ptr null, align 4
+  %i43 = icmp sgt i32 %i42, %.1174.3.i
+  %i44 = and i32 %i20, 16
+  %.not203.4.i = icmp eq i32 %i44, 0
+  %spec.select212.4.i = select i1 %.not203.4.i, i32 %.1174.3.i, i32 %i42
+  %.1174.4.i = select i1 %i43, i32 %spec.select212.4.i, i32 %.1174.3.i
+  %i48 = load i32, ptr null, align 4
+  %i49 = icmp sgt i32 %i48, %.1174.4.i
+  %i50 = and i32 %i20, 32
+  %.not203.5.i = icmp eq i32 %i50, 0
+  %spec.select212.5.i = select i1 %.not203.5.i, i32 %.1174.4.i, i32 %i48
+  %.1174.5.i = select i1 %i49, i32 %spec.select212.5.i, i32 %.1174.4.i
+  %i51 = load i32, ptr null, align 4
+  %i52 = icmp sgt i32 %i51, %.1174.5.i
+  %i53 = and i32 %i20, 64
+  %.not203.6.i = icmp eq i32 %i53, 0
+  %spec.select212.6.i = select i1 %.not203.6.i, i32 %.1174.5.i, i32 %i51
+  %.1174.6.i = select i1 %i52, i32 %spec.select212.6.i, i32 %.1174.5.i
+  %i56 = load i32, ptr null, align 4
+  %i57 = icmp sgt i32 %i56, %.1174.6.i
+  %i58 = and i32 %i20, 128
+  %.not203.7.i = icmp eq i32 %i58, 0
+  %spec.select212.7.i = select i1 %.not203.7.i, i32 %.1174.6.i, i32 %i56
+  %.1174.7.i = select i1 %i57, i32 %spec.select212.7.i, i32 %.1174.6.i
+  %i60 = load i32, ptr null, align 4
+  %i61 = icmp sgt i32 %i60, %.1174.7.i
+  %i62 = and i32 %i20, 256
+  %.not203.8.i = icmp eq i32 %i62, 0
+  %spec.select212.8.i = select i1 %.not203.8.i, i32 %.1174.7.i, i32 %i60
+  %.1174.8.i = select i1 %i61, i32 %spec.select212.8.i, i32 %.1174.7.i
+  %i63 = load i32, ptr null, align 4
+  %i64 = icmp sgt i32 %i63, %.1174.8.i
+  %i65 = and i32 %i20, 512
+  %.not203.9.i = icmp eq i32 %i65, 0
+  %spec.select212.9.i = select i1 %.not203.9.i, i32 %.1174.8.i, i32 %i63
+  %.1174.9.i = select i1 %i64, i32 %spec.select212.9.i, i32 %.1174.8.i
+  %i67 = load i32, ptr null, align 4
+  %i68 = icmp sgt i32 %i67, %.1174.9.i
+  %i69 = and i32 %i20, 1024
+  %.not203.10.i = icmp eq i32 %i69, 0
+  %spec.select212.10.i = select i1 %.not203.10.i, i32 %.1174.9.i, i32 %i67
+  %.1174.10.i = select i1 %i68, i32 %spec.select212.10.i, i32 %.1174.9.i
+  %i70 = load i32, ptr null, align 4
+  %i71 = icmp sgt i32 %i70, %.1174.10.i
+  %i72 = and i32 %i20, 2048
+  %.not203.11.i = icmp eq i32 %i72, 0
+  %spec.select212.11.i = select i1 %.not203.11.i, i32 %.1174.10.i, i32 %i70
+  %.1174.11.i = select i1 %i71, i32 %spec.select212.11.i, i32 %.1174.10.i
+  %i75 = load i32, ptr null, align 4
+  %i76 = icmp sgt i32 %i75, %.1174.11.i
+  %i77 = and i32 %i20, 4096
+  %.not203.12.i = icmp eq i32 %i77, 0
+  %spec.select212.12.i = select i1 %.not203.12.i, i32 %.1174.11.i, i32 %i75
+  %.1174.12.i = select i1 %i76, i32 %spec.select212.12.i, i32 %.1174.11.i
+  %i80 = load i32, ptr null, align 4
+  %i81 = icmp sgt i32 %i80, %.1174.12.i
+  %spec.select212.13.i = select i1 false, i32 %.1174.12.i, i32 %i80
+  %.1174.13.i = select i1 %i81, i32 %spec.select212.13.i, i32 %.1174.12.i
+  %.1172.13.i = select i1 %i81, i32 13, i32 0
+  %i84 = icmp sgt i32 0, %.1174.13.i
+  %.1172.14.i = select i1 %i84, i32 14, i32 %.1172.13.i
+  %i88 = icmp slt i32 0, %i19
+  %i89 = select i1 %i88, i16 -32768, i16 0
+  %i90 = zext i16 %i89 to i32
+  %i91 = shl nuw nsw i32 1, %.1172.14.i
+  %i92 = and i32 %i91, %i90
+  %.not200.i = icmp eq i32 %i92, 0
+  %i93 = trunc nuw i32 %i91 to i16
+  %i94 = xor i16 %i93, -1
+  %i95 = select i1 %.not200.i, i16 -1, i16 %i94
+  %.2177.i = and i16 %i95, %i89
+  %i96 = xor i16 %.2177.i, -1
+  %i97 = and i16 %spec.select211.i, %i96
+  br label %get_tx_mask.exit
+
+get_tx_mask.exit:                                 ; preds = %._crit_edge.i, %bb
+  %.1261.i = phi i16 [ %i97, %bb ], [ 0, %._crit_edge.i ]
+  %i99 = icmp eq i16 %.1261.i, 0
+  %.2262.i = select i1 %i99, i16 0, i16 %.1261.i
+  ret void
+}
+
+attributes #0 = { noimplicitfloat nounwind sspstrong uwtable vscale_range(2,1024) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+unaligned-scalar-mem,+unaligned-vector-mem,+v,+zaamo,+zalrsc,+zba,+zbb,+zbs,+zca,+zcd,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadm
emidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zabha,-zacas,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zcb,-zce,-zcf,-zclsd,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccamoc,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zilsd,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
diff --git a/llvm/test/CodeGen/RISCV/pr153598.mir b/llvm/test/CodeGen/RISCV/pr153598.mir
new file mode 100644
index 000000000000..a084197fe83c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr153598.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -mattr=+zcmp -run-pass=riscv-move-merge -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: mov-merge
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x8, $x9
+    ; CHECK-LABEL: name: mov-merge
+    ; CHECK: liveins: $x8, $x9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $x12 = ADDI $x0, -3
+    ; CHECK-NEXT: SW renamable $x9, $x2, 56
+    ; CHECK-NEXT: CM_MVA01S killed renamable $x9, renamable $x8, implicit-def $x10, implicit-def $x11
+    ; CHECK-NEXT: SW renamable $x8, $x2, 60
+    ; CHECK-NEXT: PseudoRET
+    $x12 = ADDI $x0, -3
+    SW renamable $x9, $x2, 56
+    $x10 = ADDI killed renamable $x9, 0
+    SW renamable $x8, $x2, 60
+    $x11 = ADDI killed renamable $x8, 0
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a050034c6316..a7eaf3979323 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -78,12 +78,12 @@ body: |
     ; CHECK-NEXT: %false:vrnov0 = COPY $v9
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
     ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 4, 5 /* e32 */, 0 /* tu, mu */
     %pt:vrnov0 = COPY $v8
     %false:vrnov0 = COPY $v9
     %mask:vmv0 = COPY $v0
-    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 8, 5 /* e32 */, 0 /* tu, mu */
+    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 4, 5 /* e32 */
 ...
 ---
 # Shouldn't be converted because false operands are different
@@ -163,3 +163,47 @@ body: |
     %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
   bb.1:
     %5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
+...
+---
+# Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+name: preserve_false
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-LABEL: name: preserve_false
+    ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %pt:vrnov0 = COPY $v8
+    ; CHECK-NEXT: %false:vr = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
+    ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %avl1:gprnox0 = COPY $x8
+    %avl2:gprnox0 = COPY $x9
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+...
+---
+# But we can convert this one because vmerge's avl being <= true's means we don't lose any false elements past avl.
+name: preserve_false_avl_known_le
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0
+    ; CHECK-LABEL: name: preserve_false_avl_known_le
+    ; CHECK: liveins: $v8, $v9, $v0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %pt:vr = COPY $v8
+    ; CHECK-NEXT: %false:vrnov0 = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
index 3aeb4e864627..9ffc84a8a0e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
@@ -71,10 +71,31 @@ define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64>
   ret <vscale x 8 x i64> %1
 }
 
-declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+; Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+define <vscale x 2 x i32> @preserve_false(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask, i64 %avl1, i64 %avl2) {
+; CHECK-LABEL: preserve_false:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vle32.v v10, (a0), v0.t
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
+; CHECK-NEXT:    ret
+  %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 %avl1, i64 3)
+  %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 %avl2)
+  ret <vscale x 2 x i32> %res
+}
+
+; Can fold this because its avl is known to be <= than true, so no elements from false need to be introduced past avl.
+define <vscale x 2 x i32> @preserve_false_avl_known_le(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: preserve_false_avl_known_le:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a0), v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 2, i64 3)
+  %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 1)
+  ret <vscale x 2 x i32> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
index f29c74ae69bf..697c582dcb38 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
@@ -21,7 +21,7 @@ define <vscale x 4 x i32> @intrinsic_vsha2cl_vv_nxv4i32_nxv4i32(<vscale x 4 x i3
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v10, v12
+; CHECK-NEXT:    vsha2cl.vv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vsha2cl.nxv4i32.nxv4i32(
@@ -45,7 +45,7 @@ define <vscale x 8 x i32> @intrinsic_vsha2cl_vv_nxv8i32_nxv8i32(<vscale x 8 x i3
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v12, v16
+; CHECK-NEXT:    vsha2cl.vv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vsha2cl.nxv8i32.nxv8i32(
@@ -70,7 +70,7 @@ define <vscale x 16 x i32> @intrinsic_vsha2cl_vv_nxv16i32_nxv16i32(<vscale x 16
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v16, v24
+; CHECK-NEXT:    vsha2cl.vv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vsha2cl.nxv16i32.nxv16i32(
@@ -94,7 +94,7 @@ define <vscale x 4 x i64> @intrinsic_vsha2cl_vv_nxv4i64_nxv4i64(<vscale x 4 x i6
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v12, v16
+; CHECK-NEXT:    vsha2cl.vv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsha2cl.nxv4i64.nxv4i64(
@@ -119,7 +119,7 @@ define <vscale x 8 x i64> @intrinsic_vsha2cl_vv_nxv8i64_nxv8i64(<vscale x 8 x i6
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v16, v24
+; CHECK-NEXT:    vsha2cl.vv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsha2cl.nxv8i64.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir
new file mode 100644
index 000000000000..9ab7f41045c4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir
@@ -0,0 +1,198 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -x mir -run-pass=prologepilog  -verify-machineinstrs < %s \
+# RUN:  | FileCheck %s -check-prefixes=RV64I
+# RUN: llc -mtriple=riscv32 -x mir -run-pass=prologepilog  -verify-machineinstrs < %s \
+# RUN:  | FileCheck %s -check-prefixes=RV32I
+--- |
+  ; Function Attrs: uwtable
+  define void @no_reserved_call_frame(i64 %n) #0 {
+  entry:
+    %v = alloca i32, i64 %n, align 4
+    call void @callee_stack_args(ptr %v, [518 x i64] poison)
+    ret void
+  }
+
+  declare void @callee_stack_args(ptr, [518 x i64]) #1
+
+  attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+m" }
+  attributes #1 = { "target-features"="+m" }
+...
+---
+name:            no_reserved_call_frame
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$x10', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: v, type: variable-sized, offset: 0, alignment: 1, stack-id: default,
+      callee-saved-register: '', callee-saved-restored: true, local-offset: 0,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  varArgsFrameIndex: 0
+  varArgsSaveSize: 0
+body:             |
+  ; RV64I-LABEL: name: no_reserved_call_frame
+  ; RV64I: bb.0.entry:
+  ; RV64I-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64I-NEXT:   liveins: $x10, $x1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = frame-setup ADDI $x2, -16
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; RV64I-NEXT:   frame-setup SD killed $x1, $x2, 8 :: (store (s64) into %stack.1)
+  ; RV64I-NEXT:   frame-setup SD killed $x8, $x2, 0 :: (store (s64) into %stack.2)
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION offset $x1, -8
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION offset $x8, -16
+  ; RV64I-NEXT:   $x8 = frame-setup ADDI $x2, 16
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $x8, 0
+  ; RV64I-NEXT:   renamable $x10 = SLLI killed renamable $x10, 2
+  ; RV64I-NEXT:   renamable $x10 = nuw ADDI killed renamable $x10, 15
+  ; RV64I-NEXT:   renamable $x10 = ANDI killed renamable $x10, -16
+  ; RV64I-NEXT:   renamable $x10 = SUB $x2, killed renamable $x10
+  ; RV64I-NEXT:   renamable $x11 = LUI 1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT: bb.1.entry:
+  ; RV64I-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; RV64I-NEXT:   liveins: $x10, $x11
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = SUB $x2, renamable $x11
+  ; RV64I-NEXT:   SD $x0, $x2, 0
+  ; RV64I-NEXT:   BLT renamable $x10, $x2, %bb.1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT: bb.2.entry:
+  ; RV64I-NEXT:   liveins: $x10
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = ADDI renamable $x10, 0
+  ; RV64I-NEXT:   $x11 = LUI 1
+  ; RV64I-NEXT:   $x2 = SUB $x2, killed $x11
+  ; RV64I-NEXT:   PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+  ; RV64I-NEXT:   $x10 = LUI 1
+  ; RV64I-NEXT:   $x2 = ADD $x2, killed $x10
+  ; RV64I-NEXT:   $x2 = frame-destroy ADDI $x8, -16
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa $x2, 16
+  ; RV64I-NEXT:   $x1 = frame-destroy LD $x2, 8 :: (load (s64) from %stack.1)
+  ; RV64I-NEXT:   $x8 = frame-destroy LD $x2, 0 :: (load (s64) from %stack.2)
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x1
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x8
+  ; RV64I-NEXT:   $x2 = frame-destroy ADDI $x2, 16
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; RV64I-NEXT:   PseudoRET
+  ;
+  ; RV32I-LABEL: name: no_reserved_call_frame
+  ; RV32I: bb.0.entry:
+  ; RV32I-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32I-NEXT:   liveins: $x10, $x1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = frame-setup ADDI $x2, -16
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; RV32I-NEXT:   frame-setup SW killed $x1, $x2, 12 :: (store (s32) into %stack.1)
+  ; RV32I-NEXT:   frame-setup SW killed $x8, $x2, 8 :: (store (s32) into %stack.2)
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION offset $x1, -4
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION offset $x8, -8
+  ; RV32I-NEXT:   $x8 = frame-setup ADDI $x2, 16
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $x8, 0
+  ; RV32I-NEXT:   renamable $x10 = SLLI killed renamable $x10, 2
+  ; RV32I-NEXT:   renamable $x10 = nuw ADDI killed renamable $x10, 15
+  ; RV32I-NEXT:   renamable $x10 = ANDI killed renamable $x10, -16
+  ; RV32I-NEXT:   renamable $x10 = SUB $x2, killed renamable $x10
+  ; RV32I-NEXT:   renamable $x11 = LUI 1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT: bb.1.entry:
+  ; RV32I-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; RV32I-NEXT:   liveins: $x10, $x11
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = SUB $x2, renamable $x11
+  ; RV32I-NEXT:   SD $x0, $x2, 0
+  ; RV32I-NEXT:   BLT renamable $x10, $x2, %bb.1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT: bb.2.entry:
+  ; RV32I-NEXT:   liveins: $x10
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = ADDI renamable $x10, 0
+  ; RV32I-NEXT:   $x11 = LUI 1
+  ; RV32I-NEXT:   $x2 = SUB $x2, killed $x11
+  ; RV32I-NEXT:   PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+  ; RV32I-NEXT:   $x10 = LUI 1
+  ; RV32I-NEXT:   $x2 = ADD $x2, killed $x10
+  ; RV32I-NEXT:   $x2 = frame-destroy ADDI $x8, -16
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa $x2, 16
+  ; RV32I-NEXT:   $x1 = frame-destroy LW $x2, 12 :: (load (s32) from %stack.1)
+  ; RV32I-NEXT:   $x8 = frame-destroy LW $x2, 8 :: (load (s32) from %stack.2)
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x1
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x8
+  ; RV32I-NEXT:   $x2 = frame-destroy ADDI $x2, 16
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; RV32I-NEXT:   PseudoRET
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $x10
+
+    renamable $x10 = SLLI killed renamable $x10, 2
+    renamable $x10 = nuw ADDI killed renamable $x10, 15
+    renamable $x10 = ANDI killed renamable $x10, -16
+    renamable $x10 = SUB $x2, killed renamable $x10
+    renamable $x11 = LUI 1
+
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x10, $x11
+
+    $x2 = SUB $x2, renamable $x11
+    SD $x0, $x2, 0
+    BLT renamable $x10, $x2, %bb.1
+
+  bb.2.entry:
+    liveins: $x10
+
+    $x2 = ADDI renamable $x10, 0
+    ADJCALLSTACKDOWN 4088, 0, implicit-def dead $x2, implicit $x2
+    PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+    ADJCALLSTACKUP 4088, 0, implicit-def dead $x2, implicit $x2
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 32753ca382fc..cd7f30d8f589 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
-; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t2, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    mv t0, sp
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srli t3, a0, 3
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    andi a5, a0, 31
-; RV32I-NEXT:    andi t3, t3, 12
-; RV32I-NEXT:    xori a5, a5, 31
-; RV32I-NEXT:    or a3, t1, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    add t0, t0, t3
-; RV32I-NEXT:    sw a4, 0(sp)
-; RV32I-NEXT:    sw a3, 4(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw a1, 12(sp)
-; RV32I-NEXT:    lw a1, 4(t0)
-; RV32I-NEXT:    lw a3, 8(t0)
-; RV32I-NEXT:    lw a4, 0(t0)
-; RV32I-NEXT:    lw a6, 12(t0)
-; RV32I-NEXT:    srl a7, a1, a0
-; RV32I-NEXT:    slli t0, a3, 1
-; RV32I-NEXT:    srl a4, a4, a0
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli t1, a6, 1
-; RV32I-NEXT:    srl a0, a6, a0
-; RV32I-NEXT:    sll a6, t0, a5
-; RV32I-NEXT:    sll a1, a1, a5
-; RV32I-NEXT:    sll a5, t1, a5
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    add a0, t2, a0
+; RV32I-NEXT:    lw a4, 4(a0)
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a6, 0(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    srl a7, a4, a1
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a4, a4, a3
+; RV32I-NEXT:    sll a3, t1, a3
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 8
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    srli t1, a1, 8
-; RV32I-NEXT:    srli t2, a6, 16
-; RV32I-NEXT:    srli t3, a6, 24
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
-; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t2, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    addi t0, sp, 16
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srli t3, a0, 3
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    andi a5, a0, 31
-; RV32I-NEXT:    andi t3, t3, 12
-; RV32I-NEXT:    or a3, t1, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    sub a7, t0, t3
-; RV32I-NEXT:    sw a4, 16(sp)
-; RV32I-NEXT:    sw a3, 20(sp)
-; RV32I-NEXT:    sw a6, 24(sp)
-; RV32I-NEXT:    sw a1, 28(sp)
-; RV32I-NEXT:    lw a1, 0(a7)
-; RV32I-NEXT:    lw a3, 4(a7)
-; RV32I-NEXT:    lw a4, 8(a7)
-; RV32I-NEXT:    lw a6, 12(a7)
-; RV32I-NEXT:    xori a5, a5, 31
-; RV32I-NEXT:    sll a7, a3, a0
-; RV32I-NEXT:    srli t0, a1, 1
-; RV32I-NEXT:    sll a6, a6, a0
-; RV32I-NEXT:    srli t1, a4, 1
-; RV32I-NEXT:    sll a4, a4, a0
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    sll a0, a1, a0
-; RV32I-NEXT:    srl a1, t0, a5
-; RV32I-NEXT:    srl t0, t1, a5
-; RV32I-NEXT:    srl a3, a3, a5
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    srli t1, a0, 24
-; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    addi t2, sp, 16
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    sub a0, t2, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    sll a7, a5, a1
+; RV32I-NEXT:    srli t0, a4, 1
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    sll a6, a6, a1
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    srl a4, t0, a3
+; RV32I-NEXT:    srl t0, t1, a3
+; RV32I-NEXT:    srl a3, a5, a3
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t2, 1(a2)
 ; RV32I-NEXT:    sb a5, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 8
-; RV32I-NEXT:    srli a7, a6, 16
-; RV32I-NEXT:    srli t0, a6, 24
-; RV32I-NEXT:    srli t1, a6, 8
-; RV32I-NEXT:    srli t2, a1, 16
-; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a0, 16
+; RV32I-NEXT:    srli t0, a0, 24
+; RV32I-NEXT:    srli t1, a0, 8
+; RV32I-NEXT:    srli t2, a4, 16
+; RV32I-NEXT:    srli t3, a4, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a1, 8
-; RV32I-NEXT:    sb a6, 12(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a4, 8
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t1, 13(a2)
 ; RV32I-NEXT:    sb a7, 14(a2)
 ; RV32I-NEXT:    sb t0, 15(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
@@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
 ; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    mv t0, sp
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srli a4, a0, 3
-; RV32I-NEXT:    or a5, t1, a5
-; RV32I-NEXT:    andi t1, a0, 31
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srai t3, t4, 31
-; RV32I-NEXT:    andi a4, a4, 12
-; RV32I-NEXT:    xori t1, t1, 31
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or t3, t5, t4
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    mv a5, sp
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t2, a0, t2
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    sw t3, 16(sp)
-; RV32I-NEXT:    sw t3, 20(sp)
-; RV32I-NEXT:    sw t3, 24(sp)
-; RV32I-NEXT:    sw t3, 28(sp)
-; RV32I-NEXT:    add a4, t0, a4
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or a7, t2, t0
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a5, 4(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw a1, 12(sp)
-; RV32I-NEXT:    lw a1, 4(a4)
-; RV32I-NEXT:    lw a3, 8(a4)
-; RV32I-NEXT:    lw a5, 0(a4)
-; RV32I-NEXT:    lw a4, 12(a4)
-; RV32I-NEXT:    srl a6, a1, a0
-; RV32I-NEXT:    slli a7, a3, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli t0, a4, 1
-; RV32I-NEXT:    sra a0, a4, a0
-; RV32I-NEXT:    sll a4, a7, t1
-; RV32I-NEXT:    sll a1, a1, t1
-; RV32I-NEXT:    sll a7, t0, t1
+; RV32I-NEXT:    sw a6, 4(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a7, 12(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    add a0, a5, a0
+; RV32I-NEXT:    lw a4, 4(a0)
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a6, 0(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    srl a7, a4, a1
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a4, a4, a3
+; RV32I-NEXT:    sll a3, t1, a3
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
@@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a3, 16
 ; RV32I-NEXT:    srli a5, a3, 24
 ; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    srli t1, a1, 8
-; RV32I-NEXT:    srli t2, a4, 16
-; RV32I-NEXT:    srli t3, a4, 24
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
 ; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
 ; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 8
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    mv s2, sp
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    xori s5, s5, 63
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    add s2, s2, s3
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    mv a6, sp
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a1, 8(s2)
-; RV64I-NEXT:    ld a3, 16(s2)
-; RV64I-NEXT:    ld a4, 0(s2)
-; RV64I-NEXT:    ld a5, 24(s2)
-; RV64I-NEXT:    srl a6, a1, a0
-; RV64I-NEXT:    slli a7, a3, 1
-; RV64I-NEXT:    srl a4, a4, a0
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    srl a3, a3, a0
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    add a0, a6, a0
+; RV64I-NEXT:    ld a4, 8(a0)
+; RV64I-NEXT:    ld a5, 16(a0)
+; RV64I-NEXT:    ld a6, 0(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a4, a1
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    srl a5, a5, a0
-; RV64I-NEXT:    sll a0, a7, s5
-; RV64I-NEXT:    sll a1, a1, s5
-; RV64I-NEXT:    sll a7, t0, s5
-; RV64I-NEXT:    srli t0, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
-; RV64I-NEXT:    srli t4, a5, 24
-; RV64I-NEXT:    srli t5, a5, 16
-; RV64I-NEXT:    srli t6, a5, 8
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    or a3, a3, a7
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    srl t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a4, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a4, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t2, 29(a2)
-; RV64I-NEXT:    sb t1, 30(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a5, 24(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
@@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s7, 12(a0)
-; RV32I-NEXT:    lbu s8, 13(a0)
-; RV32I-NEXT:    lbu s9, 14(a0)
-; RV32I-NEXT:    lbu s10, 15(a0)
-; RV32I-NEXT:    lbu s11, 16(a0)
-; RV32I-NEXT:    lbu ra, 17(a0)
-; RV32I-NEXT:    lbu t4, 18(a0)
-; RV32I-NEXT:    lbu s0, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t6, t3
-; RV32I-NEXT:    or a7, s3, s2
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s5, 25(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    slli ra, ra, 8
-; RV32I-NEXT:    or s7, s8, s7
-; RV32I-NEXT:    or s2, s10, s9
-; RV32I-NEXT:    or s3, ra, s11
-; RV32I-NEXT:    lbu s4, 28(a0)
-; RV32I-NEXT:    lbu s8, 29(a0)
-; RV32I-NEXT:    lbu s9, 30(a0)
-; RV32I-NEXT:    lbu s10, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
 ; RV32I-NEXT:    sw zero, 64(sp)
@@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 44(sp)
 ; RV32I-NEXT:    sw zero, 48(sp)
 ; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 8
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t4, s0, t4
-; RV32I-NEXT:    addi s0, sp, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t3, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s5, t3
-; RV32I-NEXT:    or t6, t6, s6
-; RV32I-NEXT:    or s1, s8, s4
-; RV32I-NEXT:    or s4, s10, s9
-; RV32I-NEXT:    andi s5, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, s2, s7
-; RV32I-NEXT:    or a7, t4, s3
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, t6, t3
-; RV32I-NEXT:    or t2, s4, s1
-; RV32I-NEXT:    add s0, s0, s5
-; RV32I-NEXT:    sw a7, 24(sp)
-; RV32I-NEXT:    sw t0, 28(sp)
-; RV32I-NEXT:    sw t1, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a3, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
-; RV32I-NEXT:    sw a5, 16(sp)
-; RV32I-NEXT:    sw a6, 20(sp)
-; RV32I-NEXT:    lw a3, 0(s0)
-; RV32I-NEXT:    lw a4, 4(s0)
-; RV32I-NEXT:    lw a5, 8(s0)
-; RV32I-NEXT:    lw a6, 12(s0)
-; RV32I-NEXT:    lw a7, 16(s0)
-; RV32I-NEXT:    lw t0, 20(s0)
-; RV32I-NEXT:    lw t1, 24(s0)
-; RV32I-NEXT:    lw t2, 28(s0)
-; RV32I-NEXT:    srl t3, a4, a0
-; RV32I-NEXT:    slli t4, a5, 1
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s3, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl t5, a6, a0
-; RV32I-NEXT:    slli t6, a7, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    srl s0, t0, a0
-; RV32I-NEXT:    slli s1, t1, 1
-; RV32I-NEXT:    srl a7, a7, a0
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    slli s2, t2, 1
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
 ; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    srl s3, a4, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll t4, t6, a1
-; RV32I-NEXT:    sll a6, a6, a1
-; RV32I-NEXT:    sll t6, s1, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll s1, s2, a1
-; RV32I-NEXT:    srli s2, t2, 24
-; RV32I-NEXT:    srli s3, t2, 16
-; RV32I-NEXT:    srli s4, t2, 8
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, t4
-; RV32I-NEXT:    or a4, a5, a6
-; RV32I-NEXT:    or a5, s0, t6
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, s1
-; RV32I-NEXT:    sb t2, 28(a2)
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s3, 30(a2)
-; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
@@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    addi s2, sp, 32
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    sub t2, s2, s3
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    addi a6, sp, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a1, 56(sp)
-; RV64I-NEXT:    ld a1, 0(t2)
-; RV64I-NEXT:    ld a3, 8(t2)
-; RV64I-NEXT:    ld a4, 16(t2)
-; RV64I-NEXT:    ld a5, 24(t2)
-; RV64I-NEXT:    xori a6, s5, 63
-; RV64I-NEXT:    sll a7, a3, a0
-; RV64I-NEXT:    srli t0, a1, 1
-; RV64I-NEXT:    sll a5, a5, a0
-; RV64I-NEXT:    srli t1, a4, 1
-; RV64I-NEXT:    sll a4, a4, a0
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    sll t2, a1, a0
-; RV64I-NEXT:    srl a0, t0, a6
-; RV64I-NEXT:    srl a1, t1, a6
-; RV64I-NEXT:    srl a3, a3, a6
-; RV64I-NEXT:    srli a6, t2, 56
-; RV64I-NEXT:    srli t0, t2, 48
-; RV64I-NEXT:    srli t1, t2, 40
-; RV64I-NEXT:    srli t3, t2, 32
-; RV64I-NEXT:    srli t4, t2, 24
-; RV64I-NEXT:    srli t5, t2, 16
-; RV64I-NEXT:    srli t6, t2, 8
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a1, a5, a1
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    sb t3, 4(a2)
-; RV64I-NEXT:    sb t1, 5(a2)
-; RV64I-NEXT:    sb t0, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t6, 1(a2)
-; RV64I-NEXT:    sb t5, 2(a2)
-; RV64I-NEXT:    sb t4, 3(a2)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a5, 56(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    sub a0, a6, a0
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a5, 8(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    sll a7, a5, a1
+; RV64I-NEXT:    srli t0, a4, 1
+; RV64I-NEXT:    sll t1, a0, a1
+; RV64I-NEXT:    srli a0, a6, 1
+; RV64I-NEXT:    sll a6, a6, a1
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    sll a4, a4, a1
+; RV64I-NEXT:    srl a1, t0, a3
+; RV64I-NEXT:    srl t0, a0, a3
+; RV64I-NEXT:    srl a3, a5, a3
+; RV64I-NEXT:    srli a5, a4, 56
+; RV64I-NEXT:    srli t2, a4, 48
+; RV64I-NEXT:    srli t3, a4, 40
+; RV64I-NEXT:    srli t4, a4, 32
+; RV64I-NEXT:    srli t5, a4, 24
+; RV64I-NEXT:    srli t6, a4, 16
+; RV64I-NEXT:    srli s0, a4, 8
+; RV64I-NEXT:    or a0, a7, a1
+; RV64I-NEXT:    or a1, t1, t0
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    sb t4, 4(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 6(a2)
+; RV64I-NEXT:    sb a5, 7(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb s0, 1(a2)
+; RV64I-NEXT:    sb t6, 2(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
@@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
@@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s7, 12(a0)
-; RV32I-NEXT:    lbu s8, 13(a0)
-; RV32I-NEXT:    lbu s9, 14(a0)
-; RV32I-NEXT:    lbu s10, 15(a0)
-; RV32I-NEXT:    lbu s11, 16(a0)
-; RV32I-NEXT:    lbu ra, 17(a0)
-; RV32I-NEXT:    lbu t4, 18(a0)
-; RV32I-NEXT:    lbu s0, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t6, t3
-; RV32I-NEXT:    or a7, s3, s2
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s5, 25(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    slli ra, ra, 8
-; RV32I-NEXT:    or s7, s8, s7
-; RV32I-NEXT:    or s2, s10, s9
-; RV32I-NEXT:    or s3, ra, s11
-; RV32I-NEXT:    lbu s4, 28(a0)
-; RV32I-NEXT:    lbu s8, 29(a0)
-; RV32I-NEXT:    lbu s9, 30(a0)
-; RV32I-NEXT:    lbu s10, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
 ; RV32I-NEXT:    sw zero, 32(sp)
@@ -1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 12(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 40
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t4, s0, t4
-; RV32I-NEXT:    addi s0, sp, 40
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw t3, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s5, t3
-; RV32I-NEXT:    or t6, t6, s6
-; RV32I-NEXT:    or s1, s8, s4
-; RV32I-NEXT:    or s4, s10, s9
-; RV32I-NEXT:    andi s5, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, s2, s7
-; RV32I-NEXT:    or a7, t4, s3
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, t6, t3
-; RV32I-NEXT:    or t2, s4, s1
-; RV32I-NEXT:    sub t3, s0, s5
-; RV32I-NEXT:    sw a7, 56(sp)
-; RV32I-NEXT:    sw t0, 60(sp)
-; RV32I-NEXT:    sw t1, 64(sp)
-; RV32I-NEXT:    sw t2, 68(sp)
-; RV32I-NEXT:    sw a3, 40(sp)
-; RV32I-NEXT:    sw a4, 44(sp)
-; RV32I-NEXT:    sw a5, 48(sp)
-; RV32I-NEXT:    sw a6, 52(sp)
-; RV32I-NEXT:    lw a3, 0(t3)
-; RV32I-NEXT:    lw a4, 4(t3)
-; RV32I-NEXT:    lw a5, 8(t3)
-; RV32I-NEXT:    lw a6, 12(t3)
-; RV32I-NEXT:    lw a7, 16(t3)
-; RV32I-NEXT:    lw t0, 20(t3)
-; RV32I-NEXT:    lw t1, 24(t3)
-; RV32I-NEXT:    lw t2, 28(t3)
-; RV32I-NEXT:    sll t3, a4, a0
-; RV32I-NEXT:    srli t4, a3, 1
-; RV32I-NEXT:    sll t5, a6, a0
-; RV32I-NEXT:    srli t6, a5, 1
-; RV32I-NEXT:    sll a5, a5, a0
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    sll s0, t0, a0
-; RV32I-NEXT:    srli s1, a7, 1
-; RV32I-NEXT:    sll a7, a7, a0
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    sub a3, s3, a4
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    lw a5, 4(a3)
+; RV32I-NEXT:    lw a6, 8(a3)
+; RV32I-NEXT:    lw a7, 12(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw t1, 20(a3)
+; RV32I-NEXT:    lw t2, 24(a3)
+; RV32I-NEXT:    lw a3, 28(a3)
+; RV32I-NEXT:    sll t3, a5, a0
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    sll t5, a7, a0
+; RV32I-NEXT:    srli t6, a6, 1
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll s0, t1, a0
+; RV32I-NEXT:    srli s1, t0, 1
+; RV32I-NEXT:    sll t0, t0, a0
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    sll s2, a3, a0
+; RV32I-NEXT:    srli a3, t2, 1
 ; RV32I-NEXT:    sll t2, t2, a0
-; RV32I-NEXT:    srli s2, t1, 1
-; RV32I-NEXT:    sll t1, t1, a0
-; RV32I-NEXT:    srli t0, t0, 1
-; RV32I-NEXT:    sll s3, a3, a0
+; RV32I-NEXT:    srli t1, t1, 1
+; RV32I-NEXT:    sll s3, a4, a0
 ; RV32I-NEXT:    srl a0, t4, a1
-; RV32I-NEXT:    srl a3, t6, a1
-; RV32I-NEXT:    srl a4, a4, a1
+; RV32I-NEXT:    srl a4, t6, a1
+; RV32I-NEXT:    srl a5, a5, a1
 ; RV32I-NEXT:    srl t4, s1, a1
-; RV32I-NEXT:    srl a6, a6, a1
-; RV32I-NEXT:    srl t6, s2, a1
-; RV32I-NEXT:    srl t0, t0, a1
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    srl t6, a3, a1
+; RV32I-NEXT:    srl t1, t1, a1
 ; RV32I-NEXT:    srli s1, s3, 24
-; RV32I-NEXT:    srli s2, s3, 16
-; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    srli s4, s3, 16
+; RV32I-NEXT:    srli s5, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
-; RV32I-NEXT:    or a1, t5, a3
-; RV32I-NEXT:    or a3, a5, a4
+; RV32I-NEXT:    or a1, t5, a4
+; RV32I-NEXT:    or a3, a6, a5
 ; RV32I-NEXT:    or a4, s0, t4
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a6, t2, t6
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, s2, t6
+; RV32I-NEXT:    or a7, t2, t1
 ; RV32I-NEXT:    sb s3, 0(a2)
-; RV32I-NEXT:    sb s4, 1(a2)
-; RV32I-NEXT:    sb s2, 2(a2)
+; RV32I-NEXT:    sb s5, 1(a2)
+; RV32I-NEXT:    sb s4, 2(a2)
 ; RV32I-NEXT:    sb s1, 3(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
@@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    mv s2, sp
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
 ; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    xori s5, s5, 63
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    or a6, t2, t1
 ; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    add s2, s2, s3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t2, t1, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t3, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    sraiw t1, t1, 31
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t2, a1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t3, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd t1, 32(sp)
 ; RV64I-NEXT:    sd t1, 40(sp)
 ; RV64I-NEXT:    sd t1, 48(sp)
 ; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a1, 8(s2)
-; RV64I-NEXT:    ld a3, 16(s2)
-; RV64I-NEXT:    ld a4, 0(s2)
-; RV64I-NEXT:    ld a5, 24(s2)
-; RV64I-NEXT:    srl a6, a1, a0
-; RV64I-NEXT:    slli a7, a3, 1
-; RV64I-NEXT:    srl a4, a4, a0
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    srl a3, a3, a0
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    add a0, s6, a0
+; RV64I-NEXT:    ld a4, 8(a0)
+; RV64I-NEXT:    ld a5, 16(a0)
+; RV64I-NEXT:    ld a6, 0(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a4, a1
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    sra a5, a5, a0
-; RV64I-NEXT:    sll a0, a7, s5
-; RV64I-NEXT:    sll a1, a1, s5
-; RV64I-NEXT:    sll a7, t0, s5
-; RV64I-NEXT:    srli t0, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
-; RV64I-NEXT:    srli t4, a5, 24
-; RV64I-NEXT:    srli t5, a5, 16
-; RV64I-NEXT:    srli t6, a5, 8
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    or a3, a3, a7
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    sra t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a4, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a4, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t2, 29(a2)
-; RV64I-NEXT:    sb t1, 30(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a5, 24(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli s3, a0, 56
 ; RV64I-NEXT:    srli s4, a0, 48
 ; RV64I-NEXT:    srli s5, a0, 40
+; RV64I-NEXT:    srli s6, a0, 32
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a0, 32
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a3, 16(a2)
 ; RV64I-NEXT:    sb t2, 17(a2)
 ; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb t0, 19(a2)
-; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    srli a3, a0, 16
 ; RV64I-NEXT:    sb t6, 4(a2)
 ; RV64I-NEXT:    sb t5, 5(a2)
 ; RV64I-NEXT:    sb t4, 6(a2)
 ; RV64I-NEXT:    sb t3, 7(a2)
-; RV64I-NEXT:    srli a5, a0, 16
+; RV64I-NEXT:    srli a5, a0, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
 ; RV64I-NEXT:    sb s2, 1(a2)
 ; RV64I-NEXT:    sb s1, 2(a2)
 ; RV64I-NEXT:    sb s0, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a4, 12(a2)
+; RV64I-NEXT:    sb s6, 12(a2)
 ; RV64I-NEXT:    sb s5, 13(a2)
 ; RV64I-NEXT:    sb s4, 14(a2)
 ; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a5, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    sb a5, 9(a2)
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    sb a4, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
@@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t6, 8(a0)
-; RV32I-NEXT:    lbu s0, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s6, 12(a0)
-; RV32I-NEXT:    lbu s7, 13(a0)
-; RV32I-NEXT:    lbu s8, 14(a0)
-; RV32I-NEXT:    lbu s9, 15(a0)
-; RV32I-NEXT:    lbu s10, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu s2, 18(a0)
-; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
 ; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t4, t3
-; RV32I-NEXT:    or a7, s0, t6
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s4, 25(a0)
-; RV32I-NEXT:    lbu s5, 26(a0)
-; RV32I-NEXT:    lbu ra, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t6, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    lbu s6, 28(a0)
-; RV32I-NEXT:    lbu s7, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu s1, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s2, s7, s6
+; RV32I-NEXT:    or s3, s9, s8
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    lbu s5, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, ra
+; RV32I-NEXT:    addi s8, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or s1, a0, s1
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, s2, t3
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t4, a3
 ; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s4, t3
-; RV32I-NEXT:    or s1, ra, s5
-; RV32I-NEXT:    or s4, s7, s6
-; RV32I-NEXT:    or s5, s9, s8
-; RV32I-NEXT:    srai s6, s9, 31
-; RV32I-NEXT:    andi s7, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t6, t4
-; RV32I-NEXT:    or a7, s2, s0
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, s1, t3
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    sw s6, 56(sp)
-; RV32I-NEXT:    sw s6, 60(sp)
-; RV32I-NEXT:    sw s6, 64(sp)
-; RV32I-NEXT:    sw s6, 68(sp)
-; RV32I-NEXT:    sw s6, 40(sp)
-; RV32I-NEXT:    sw s6, 44(sp)
-; RV32I-NEXT:    sw s6, 48(sp)
-; RV32I-NEXT:    sw s6, 52(sp)
-; RV32I-NEXT:    add s3, s3, s7
-; RV32I-NEXT:    sw a7, 24(sp)
-; RV32I-NEXT:    sw t0, 28(sp)
-; RV32I-NEXT:    sw t1, 32(sp)
+; RV32I-NEXT:    or a0, a1, t6
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
 ; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a3, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
-; RV32I-NEXT:    sw a5, 16(sp)
-; RV32I-NEXT:    sw a6, 20(sp)
-; RV32I-NEXT:    lw a3, 0(s3)
-; RV32I-NEXT:    lw a4, 4(s3)
-; RV32I-NEXT:    lw a5, 8(s3)
-; RV32I-NEXT:    lw a6, 12(s3)
-; RV32I-NEXT:    lw a7, 16(s3)
-; RV32I-NEXT:    lw t0, 20(s3)
-; RV32I-NEXT:    lw t1, 24(s3)
-; RV32I-NEXT:    lw t2, 28(s3)
-; RV32I-NEXT:    srl t3, a4, a0
-; RV32I-NEXT:    slli t4, a5, 1
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s8, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl t5, a6, a0
-; RV32I-NEXT:    slli t6, a7, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    srl s0, t0, a0
-; RV32I-NEXT:    slli s1, t1, 1
-; RV32I-NEXT:    srl a7, a7, a0
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    slli s2, t2, 1
-; RV32I-NEXT:    sra t2, t2, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
+; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    sra s3, a4, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll t4, t6, a1
-; RV32I-NEXT:    sll a6, a6, a1
-; RV32I-NEXT:    sll t6, s1, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll s1, s2, a1
-; RV32I-NEXT:    srli s2, t2, 24
-; RV32I-NEXT:    srli s3, t2, 16
-; RV32I-NEXT:    srli s4, t2, 8
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, t4
-; RV32I-NEXT:    or a4, a5, a6
-; RV32I-NEXT:    or a5, s0, t6
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, s1
-; RV32I-NEXT:    sb t2, 28(a2)
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s3, 30(a2)
-; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll
index a3b4e7829a51..6fdc63fddbc3 100644
--- a/llvm/test/CodeGen/RISCV/xqciac.ll
+++ b/llvm/test/CodeGen/RISCV/xqciac.ll
@@ -231,12 +231,12 @@ define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: pow2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: pow2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %mul = mul nsw i32 %b, 32
@@ -276,12 +276,12 @@ define dso_local i32 @shladd(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladd:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladd:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shl = shl nsw i32 %b, 31
@@ -305,9 +305,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladd64:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    srli a4, a2, 1
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a2, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a2, a0, 31
 ; RV32IMXQCIAC-NEXT:    slli a2, a2, 31
-; RV32IMXQCIAC-NEXT:    qc.shladd a3, a4, a3, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a3, a3, a4, 31
 ; RV32IMXQCIAC-NEXT:    sltu a2, a0, a2
 ; RV32IMXQCIAC-NEXT:    add a1, a1, a3
 ; RV32IMXQCIAC-NEXT:    add a1, a1, a2
@@ -316,9 +316,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IZBAMXQCIAC-LABEL: shladd64:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    srli a4, a2, 1
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a2, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a2, a0, 31
 ; RV32IZBAMXQCIAC-NEXT:    slli a2, a2, 31
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a3, a4, a3, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a3, a3, a4, 31
 ; RV32IZBAMXQCIAC-NEXT:    sltu a2, a0, a2
 ; RV32IZBAMXQCIAC-NEXT:    add a1, a1, a3
 ; RV32IZBAMXQCIAC-NEXT:    add a1, a1, a2
@@ -338,12 +338,12 @@ define dso_local i32 @shladd_ordisjoint(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladd_ordisjoint:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 22
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 22
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladd_ordisjoint:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 22
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 22
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shl = shl nsw i32 %b, 22
@@ -361,13 +361,13 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladdc1c2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
 ; RV32IMXQCIAC-NEXT:    slli a0, a0, 26
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1c2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
 ; RV32IZBAMXQCIAC-NEXT:    slli a0, a0, 26
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
@@ -388,7 +388,7 @@ define dso_local i32 @shxaddc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shxaddc1c2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    slli a1, a1, 28
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shxaddc1c2:
@@ -417,18 +417,18 @@ define dso_local i64 @shladdc1c264(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladdc1c264:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    srli a1, a2, 12
-; RV32IMXQCIAC-NEXT:    qc.shladd a1, a1, a3, 20
+; RV32IMXQCIAC-NEXT:    qc.shladd a1, a3, a1, 20
 ; RV32IMXQCIAC-NEXT:    slli a2, a2, 20
-; RV32IMXQCIAC-NEXT:    qc.shladd a1, a1, a0, 23
+; RV32IMXQCIAC-NEXT:    qc.shladd a1, a0, a1, 23
 ; RV32IMXQCIAC-NEXT:    mv a0, a2
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1c264:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    srli a1, a2, 12
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a1, a3, 20
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a3, a1, 20
 ; RV32IZBAMXQCIAC-NEXT:    slli a2, a2, 20
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a1, a0, 23
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a0, a1, 23
 ; RV32IZBAMXQCIAC-NEXT:    mv a0, a2
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
@@ -449,13 +449,13 @@ define dso_local i32 @shladdc1equalc2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladdc1equalc2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    slli a1, a1, 12
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 12
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 12
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1equalc2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    slli a1, a1, 12
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 12
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 12
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shlc1 = shl nsw i32 %a, 12
@@ -463,3 +463,30 @@ entry:
   %add = add nsw i32 %shlc1, %shlc2
   ret i32 %add
 }
+
+define i32 @testmuliaddnegimm(i32 %a) {
+; RV32IM-LABEL: testmuliaddnegimm:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    slli a1, a0, 1
+; RV32IM-NEXT:    add a0, a1, a0
+; RV32IM-NEXT:    li a1, 3
+; RV32IM-NEXT:    sub a0, a1, a0
+; RV32IM-NEXT:    ret
+;
+; RV32IMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IMXQCIAC:       # %bb.0:
+; RV32IMXQCIAC-NEXT:    li a1, 3
+; RV32IMXQCIAC-NEXT:    qc.muliadd a1, a0, -3
+; RV32IMXQCIAC-NEXT:    mv a0, a1
+; RV32IMXQCIAC-NEXT:    ret
+;
+; RV32IZBAMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IZBAMXQCIAC:       # %bb.0:
+; RV32IZBAMXQCIAC-NEXT:    li a1, 3
+; RV32IZBAMXQCIAC-NEXT:    qc.muliadd a1, a0, -3
+; RV32IZBAMXQCIAC-NEXT:    mv a0, a1
+; RV32IZBAMXQCIAC-NEXT:    ret
+  %mul = mul i32 %a, -3
+  %add = add i32 %mul, 3
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
index f227fa9aa423..2fa06517508c 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
@@ -105,6 +105,7 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ;
 ; RV32ZBBXQCIBM-LABEL: test_cttz_i16:
 ; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
 ; RV32ZBBXQCIBM-NEXT:    qc.insbi a0, -1, 1, 16
 ; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
 ; RV32ZBBXQCIBM-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
index 6b7f9ae85662..88054a691bad 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
@@ -47,6 +47,29 @@ define i32 @test_insbi_mask(i32 %a) nounwind {
   ret i32 %or
 }
 
+define i32 @test_insbi_mask_mv(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: test_insbi_mask_mv:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 16
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32IXQCIBM-LABEL: test_insbi_mask_mv:
+; RV32IXQCIBM:       # %bb.0:
+; RV32IXQCIBM-NEXT:    mv a0, a1
+; RV32IXQCIBM-NEXT:    qc.insbi a0, -1, 16, 0
+; RV32IXQCIBM-NEXT:    ret
+;
+; RV32IXQCIBMZBS-LABEL: test_insbi_mask_mv:
+; RV32IXQCIBMZBS:       # %bb.0:
+; RV32IXQCIBMZBS-NEXT:    mv a0, a1
+; RV32IXQCIBMZBS-NEXT:    qc.insbi a0, -1, 16, 0
+; RV32IXQCIBMZBS-NEXT:    ret
+  %or = or i32 %b, 65535
+  ret i32 %or
+}
+
 define i32 @test_insbi_shifted_mask(i32 %a) nounwind {
 ; RV32I-LABEL: test_insbi_shifted_mask:
 ; RV32I:       # %bb.0:
@@ -67,6 +90,36 @@ define i32 @test_insbi_shifted_mask(i32 %a) nounwind {
   ret i32 %or
 }
 
+define i32 @test_insbi_shifted_mask_multiple_uses(i32 %a) nounwind {
+; RV32I-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 15
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    addi a0, a0, 10
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32IXQCIBM-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32IXQCIBM:       # %bb.0:
+; RV32IXQCIBM-NEXT:    lui a1, 15
+; RV32IXQCIBM-NEXT:    or a1, a1, a0
+; RV32IXQCIBM-NEXT:    addi a0, a0, 10
+; RV32IXQCIBM-NEXT:    xor a0, a0, a1
+; RV32IXQCIBM-NEXT:    ret
+;
+; RV32IXQCIBMZBS-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32IXQCIBMZBS:       # %bb.0:
+; RV32IXQCIBMZBS-NEXT:    lui a1, 15
+; RV32IXQCIBMZBS-NEXT:    or a1, a1, a0
+; RV32IXQCIBMZBS-NEXT:    addi a0, a0, 10
+; RV32IXQCIBMZBS-NEXT:    xor a0, a0, a1
+; RV32IXQCIBMZBS-NEXT:    ret
+  %or = or i32 %a, 61440
+  %add = add i32 %a, 10
+  %xor = xor i32 %or, %add
+  ret i32 %xor
+}
+
 define i32 @test_single_bit_set(i32 %a) nounwind {
 ; RV32I-LABEL: test_single_bit_set:
 ; RV32I:       # %bb.0:
diff --git a/llvm/test/CodeGen/SPARC/float-ua2007.ll b/llvm/test/CodeGen/SPARC/float-ua2007.ll
new file mode 100644
index 000000000000..252b47943fe4
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/float-ua2007.ll
@@ -0,0 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc64 --fp-contract=fast -mattr=-ua2007 < %s | FileCheck %s -check-prefix=NO-UA2007
+; RUN: llc -mtriple=sparc64 --fp-contract=fast -mattr=+ua2007 < %s | FileCheck %s -check-prefix=UA2007
+
+define float @fmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+;
+; UA2007-LABEL: fmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmadds %f1, %f3, %f5, %f0
+  %ret = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  ret float %ret
+}
+
+define double @fmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+;
+; UA2007-LABEL: fmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmaddd %f0, %f2, %f4, %f0
+  %ret = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  ret double %ret
+}
+
+define float @fmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+;
+; UA2007-LABEL: fmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubs %f1, %f3, %f5, %f0
+  %neg = fneg float %c
+  %ret = call float @llvm.fmuladd.f32(float %a, float %b, float %neg)
+  ret float %ret
+}
+
+define double @fmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+;
+; UA2007-LABEL: fmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubd %f0, %f2, %f4, %f0
+  %neg = fneg double %c
+  %ret = call double @llvm.fmuladd.f64(double %a, double %b, double %neg)
+  ret double %ret
+}
+
+define float @fnmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fnmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: fnmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmadds %f1, %f3, %f5, %f0
+  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  %ret = fneg float %fma
+  ret float %ret
+}
+
+define double @fnmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fnmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: fnmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmaddd %f0, %f2, %f4, %f0
+  %fma = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  %ret = fneg double %fma
+  ret double %ret
+}
+
+define float @fnmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fnmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: fnmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubs %f1, %f3, %f5, %f0
+  %neg = fneg float %c
+  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %neg)
+  %ret = fneg float %fma
+  ret float %ret
+}
+
+define double @fnmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fnmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: fnmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubd %f0, %f2, %f4, %f0
+  %neg = fneg double %c
+  %fma = call double @llvm.fmuladd.f64(double %a, double %b, double %neg)
+  %ret = fneg double %fma
+  ret double %ret
+}
+
+
+define float @combine_madds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_madds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+;
+; UA2007-LABEL: combine_madds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmadds %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %add = fadd float %mul, %c
+  ret float %add
+}
+
+define double @combine_maddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_maddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+;
+; UA2007-LABEL: combine_maddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmaddd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %add = fadd double %mul, %c
+  ret double %add
+}
+
+define float @combine_msubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_msubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+;
+; UA2007-LABEL: combine_msubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubs %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c
+  ret float %sub
+}
+
+define double @combine_msubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_msubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+;
+; UA2007-LABEL: combine_msubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c
+  ret double %sub
+}
+
+define float @combine_nmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_nmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: combine_nmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmadds %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %add = fadd float %mul, %c
+  %neg = fneg float %add
+  ret float %neg
+}
+
+define double @combine_nmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_nmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: combine_nmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmaddd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %add = fadd double %mul, %c
+  %neg = fneg double %add
+  ret double %neg
+}
+
+define float @combine_nmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_nmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: combine_nmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubs %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c
+  %neg = fneg float %sub
+  ret float %neg
+}
+
+define double @combine_nmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_nmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: combine_nmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c
+  %neg = fneg double %sub
+  ret double %neg
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/llvm/test/CodeGen/SPARC/tls-sp.ll b/llvm/test/CodeGen/SPARC/tls-sp.ll
new file mode 100644
index 000000000000..de9af01398d2
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/tls-sp.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc -relocation-model=pic < %s | FileCheck --check-prefix=SPARC %s
+; RUN: llc -mtriple=sparc64 -relocation-model=pic < %s | FileCheck --check-prefix=SPARC64 %s
+
+@x = external thread_local global i8
+
+;; Test that we don't over-allocate stack space when calling __tls_get_addr
+;; with the call frame pseudos able to be eliminated.
+define ptr @no_alloca() nounwind {
+; SPARC-LABEL: no_alloca:
+; SPARC:       ! %bb.0: ! %entry
+; SPARC-NEXT:    save %sp, -96, %sp
+; SPARC-NEXT:  .Ltmp0:
+; SPARC-NEXT:    call .Ltmp1
+; SPARC-NEXT:  .Ltmp2:
+; SPARC-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT:  .Ltmp1:
+; SPARC-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT:    add %i0, %o7, %i0
+; SPARC-NEXT:    sethi %tgd_hi22(x), %i1
+; SPARC-NEXT:    add %i1, %tgd_lo10(x), %i1
+; SPARC-NEXT:    add %i0, %i1, %o0, %tgd_add(x)
+; SPARC-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ret
+; SPARC-NEXT:    restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: no_alloca:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    save %sp, -128, %sp
+; SPARC64-NEXT:  .Ltmp0:
+; SPARC64-NEXT:    rd %pc, %o7
+; SPARC64-NEXT:  .Ltmp2:
+; SPARC64-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT:  .Ltmp1:
+; SPARC64-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT:    add %i0, %o7, %i0
+; SPARC64-NEXT:    sethi %tgd_hi22(x), %i1
+; SPARC64-NEXT:    add %i1, %tgd_lo10(x), %i1
+; SPARC64-NEXT:    add %i0, %i1, %o0, %tgd_add(x)
+; SPARC64-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:    ret
+; SPARC64-NEXT:    restore %g0, %o0, %o0
+entry:
+  %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  ret ptr %0
+}
+
+;; Test that %sp is valid for the call to __tls_get_addr. We store to a dynamic
+;; alloca in order to prevent eliminating any call frame pseudos from the call.
+define ptr @dynamic_alloca(i64 %n) nounwind {
+; SPARC-LABEL: dynamic_alloca:
+; SPARC:       ! %bb.0: ! %entry
+; SPARC-NEXT:    save %sp, -96, %sp
+; SPARC-NEXT:  .Ltmp3:
+; SPARC-NEXT:    call .Ltmp4
+; SPARC-NEXT:  .Ltmp5:
+; SPARC-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i0
+; SPARC-NEXT:  .Ltmp4:
+; SPARC-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i0
+; SPARC-NEXT:    add %i0, %o7, %i0
+; SPARC-NEXT:    sethi %tgd_hi22(x), %i2
+; SPARC-NEXT:    add %i2, %tgd_lo10(x), %i2
+; SPARC-NEXT:    add %i0, %i2, %o0, %tgd_add(x)
+; SPARC-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    add %i1, 7, %i0
+; SPARC-NEXT:    and %i0, -8, %i0
+; SPARC-NEXT:    sub %sp, %i0, %i0
+; SPARC-NEXT:    add %i0, -8, %sp
+; SPARC-NEXT:    mov 1, %i1
+; SPARC-NEXT:    stb %i1, [%i0+88]
+; SPARC-NEXT:    ret
+; SPARC-NEXT:    restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: dynamic_alloca:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    save %sp, -128, %sp
+; SPARC64-NEXT:  .Ltmp3:
+; SPARC64-NEXT:    rd %pc, %o7
+; SPARC64-NEXT:  .Ltmp5:
+; SPARC64-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i1
+; SPARC64-NEXT:  .Ltmp4:
+; SPARC64-NEXT:    or %i1, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i1
+; SPARC64-NEXT:    add %i1, %o7, %i1
+; SPARC64-NEXT:    sethi %tgd_hi22(x), %i2
+; SPARC64-NEXT:    add %i2, %tgd_lo10(x), %i2
+; SPARC64-NEXT:    add %i1, %i2, %o0, %tgd_add(x)
+; SPARC64-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:    add %i0, 15, %i0
+; SPARC64-NEXT:    and %i0, -16, %i0
+; SPARC64-NEXT:    sub %sp, %i0, %i0
+; SPARC64-NEXT:    mov %i0, %sp
+; SPARC64-NEXT:    mov 1, %i1
+; SPARC64-NEXT:    stb %i1, [%i0+2175]
+; SPARC64-NEXT:    ret
+; SPARC64-NEXT:    restore %g0, %o0, %o0
+entry:
+  %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %1 = alloca i8, i64 %n
+  store i8 1, ptr %1
+  ret ptr %0
+}
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index bbf4d50bd716..8a6a30318ae5 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
 ; CHECK-NEXT:    lhrl %r1, f+4
+; CHECK-NEXT:    sll %r1, 8
 ; CHECK-NEXT:    larl %r2, f
-; CHECK-NEXT:    llc %r2, 6(%r2)
-; CHECK-NEXT:    larl %r3, e
-; CHECK-NEXT:    lb %r0, 3(%r3)
-; CHECK-NEXT:    rosbg %r2, %r1, 32, 55, 8
-; CHECK-NEXT:    vlvgp %v0, %r2, %r0
-; CHECK-NEXT:    vlvgf %v0, %r2, 0
-; CHECK-NEXT:    vlvgf %v0, %r2, 2
-; CHECK-NEXT:    vlvgp %v1, %r0, %r2
-; CHECK-NEXT:    vlvgp %v2, %r2, %r2
-; CHECK-NEXT:    lr %r1, %r2
+; CHECK-NEXT:    ic %r1, 6(%r2)
+; CHECK-NEXT:    larl %r2, e
+; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlvgf %v1, %r1, 0
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
 ; CHECK-NEXT:    nilh %r1, 255
 ; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
+; CHECK-NEXT:    vlvgf %v0, %r0, 0
+; CHECK-NEXT:    vlvgf %v0, %r0, 2
 ; CHECK-NEXT:    vgbm %v3, 30583
 ; CHECK-NEXT:    vn %v0, %v0, %v3
-; CHECK-NEXT:    vlvgf %v1, %r0, 0
-; CHECK-NEXT:    vlvgf %v1, %r0, 2
 ; CHECK-NEXT:    vn %v1, %v1, %v3
 ; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vn %v2, %v2, %v3
 ; CHECK-NEXT:    vrepif %v3, 127
-; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    vchlf %v1, %v1, %v3
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
 ; CHECK-NEXT:    vchlf %v2, %v2, %v3
 ; CHECK-NEXT:    vlgvf %r3, %v2, 1
 ; CHECK-NEXT:    nilf %r3, 1
@@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    nilf %r14, 1
 ; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
 ; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
 ; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
 ; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
 ; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT:    vchlf %v0, %v1, %v3
+; CHECK-NEXT:    vchlf %v0, %v0, %v3
 ; CHECK-NEXT:    vlgvf %r13, %v0, 0
 ; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
 ; CHECK-NEXT:    vlgvf %r13, %v0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/removed-terminator.ll b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
new file mode 100644
index 000000000000..188f6f67eee8
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define void @test(i1 %x) {
+; CHECK-LABEL: test:
+; CHECK:         .functype test (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.xor
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    drop
+; CHECK-NEXT:  # %bb.1: # %exit
+; CHECK-NEXT:    return
+  %y = xor i1 %x, true
+  ; This br_if's operand (%y) is stackified in RegStackify. But this terminator
+  ; will be removed in CFGSort after that. We need to make sure we unstackify %y
+  ; so that it can be dropped in ExplicitLocals.
+  br i1 %y, label %exit, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index f6d66ab47ce0..d9064c684cb2 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -367,44 +367,49 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    cmovll %ebx, %edi
-; X86-NEXT:    cmovll %ebp, %edx
-; X86-NEXT:    cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    sbbl 32(%ebp), %edx
+; X86-NEXT:    sbbl 36(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %ebx, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl %eax, 8(%edx)
+; X86-NEXT:    movl %edi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -438,44 +443,49 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    cmovll %ebx, %edi
-; X86-NEXT:    cmovll %ebp, %edx
-; X86-NEXT:    cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    sbbl 32(%ebp), %edx
+; X86-NEXT:    sbbl 36(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %ebx, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl %eax, 8(%edx)
+; X86-NEXT:    movl %edi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -639,55 +649,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovll %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    cmovll %esi, %edx
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    sbbl 44(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmovll 28(%ebp), %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl 32(%ebp), %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovll 36(%ebp), %ebx
+; X86-NEXT:    cmovll 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmovll 28(%ebp), %edi
+; X86-NEXT:    cmovll 24(%ebp), %esi
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %edi, 8(%edx)
+; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -848,37 +862,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovgel (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovgel %ebx, %esi
-; X86-NEXT:    cmovgel %ebp, %ecx
-; X86-NEXT:    cmovgel %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %ebx
+; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovgel %edi, %esi
+; X86-NEXT:    cmovgel %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1058,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1089,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1118,35 +1136,39 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    subl %edi, %ebp
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %ecx
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1175,35 +1197,39 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    subl %edi, %ebp
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %ecx
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll
index 0356c2702a41..a1a4ba81ae49 100644
--- a/llvm/test/CodeGen/X86/abds.ll
+++ b/llvm/test/CodeGen/X86/abds.ll
@@ -343,37 +343,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -404,37 +408,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -585,37 +593,41 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -768,37 +780,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1027,35 +1043,38 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128:
@@ -1079,35 +1098,38 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128_undef:
@@ -1282,37 +1304,41 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_select_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index 6bda99c89a37..b7c34070f1af 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -355,39 +355,43 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -423,39 +427,43 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -621,55 +629,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovbl %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    cmovbl %esi, %edx
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    sbbl 44(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovbl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmovbl 28(%ebp), %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    cmovbl %edi, %ecx
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl 32(%ebp), %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovbl 36(%ebp), %ebx
+; X86-NEXT:    cmovbl 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    cmovbl 24(%ebp), %esi
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %edi, 8(%edx)
+; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -827,39 +839,43 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index 27acec32fd34..043c9155f52f 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -326,35 +326,38 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_ext_i128:
@@ -381,35 +384,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_ext_i128_undef:
@@ -548,35 +554,38 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_minmax_i128:
@@ -717,35 +726,38 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_cmp_i128:
@@ -887,35 +899,38 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_select_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_select_i128:
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index bae140abdf6b..e252d5953e60 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -144,31 +144,34 @@ define i128 @test_i128(i128 %a) nounwind {
 ;
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call i128 @llvm.abs.i128(i128 %a, i1 false)
   ret i128 %r
@@ -688,13 +691,17 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 ;
 ; X86-LABEL: test_sextinreg_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    subl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ecx
@@ -702,7 +709,9 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %shl = shl i128 %a, 64
   %ashr = ashr exact i128 %shl, 64
diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index c2bfcf57185e..1df284fb9fe2 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -104,18 +104,21 @@ define i24 @test_i24_add_add_idx(i24 %x, i24 %y, i24 %z) nounwind {
 define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-LABEL: test_i128_add_add_idx:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    btl $5, {{[0-9]+}}(%esp)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    addl 24(%ebp), %esi
+; X86-NEXT:    adcl 28(%ebp), %edi
+; X86-NEXT:    adcl 32(%ebp), %ecx
+; X86-NEXT:    adcl 36(%ebp), %edx
+; X86-NEXT:    btl $5, 64(%ebp)
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ecx
@@ -124,8 +127,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_i128_add_add_idx:
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index 0eb2c630e681..f13627b55856 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -188,11 +188,11 @@ define void @split_i128(ptr %sret, i128 %x) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $48, %esp
-; CHECK-NEXT:    movl 12(%ebp), %eax
+; CHECK-NEXT:    movl 24(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%ebp), %ebx
-; CHECK-NEXT:    movl 20(%ebp), %esi
-; CHECK-NEXT:    movl 24(%ebp), %edi
+; CHECK-NEXT:    movl 28(%ebp), %ebx
+; CHECK-NEXT:    movl 32(%ebp), %esi
+; CHECK-NEXT:    movl 36(%ebp), %edi
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 217ccebdfb77..0de308a9e073 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    movdqa (%rsi), %xmm2
+; SSE2-NEXT:    movdqa (%rsi), %xmm0
 ; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm4
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
+; SSE2-NEXT:    movd %eax, %xmm10
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm9
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm12
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm10
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm13
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm15
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE2-NEXT:    movapd %xmm4, %xmm5
 ; SSE2-NEXT:    andpd %xmm1, %xmm5
 ; SSE2-NEXT:    xorpd %xmm4, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    paddw %xmm5, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT:    movapd %xmm0, %xmm3
-; SSE2-NEXT:    andpd %xmm2, %xmm3
-; SSE2-NEXT:    xorpd %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $1, %xmm2
-; SSE2-NEXT:    paddw %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT:    movapd %xmm2, %xmm3
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    paddw %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    packuswb %xmm0, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    retq
 ;
@@ -1829,74 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT:    vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT:    vpextrd $2, %xmm4, %eax
-; AVX1-NEXT:    vpextrw $3, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $7, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT:    vpextrw $5, %xmm3, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm4
-; AVX1-NEXT:    vpextrw $2, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm5
-; AVX1-NEXT:    vpextrw $1, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm6
-; AVX1-NEXT:    vpextrw $0, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $4, %xmm3, %edx
+; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm5
+; AVX1-NEXT:    vpextrw $1, %xmm3, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm6
+; AVX1-NEXT:    vpextrw $0, %xmm3, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm7
-; AVX1-NEXT:    vpextrw $3, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm8
-; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
+; AVX1-NEXT:    vpextrw $3, %xmm3, %edx
+; AVX1-NEXT:    decq %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm8
+; AVX1-NEXT:    vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT:    decq %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpextrw $7, %xmm2, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm9
-; AVX1-NEXT:    vpextrw $1, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm10
-; AVX1-NEXT:    vpextrw $0, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm11
-; AVX1-NEXT:    vpextrw $5, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $6, %xmm2, %edx
+; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm10
+; AVX1-NEXT:    vpextrw $5, %xmm2, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm11
+; AVX1-NEXT:    vpextrw $4, %xmm2, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm12
-; AVX1-NEXT:    vpextrw $4, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm13
-; AVX1-NEXT:    vpextrw $5, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm14
-; AVX1-NEXT:    vpextrw $4, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm15
-; AVX1-NEXT:    vpextrw $7, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $1, %xmm2, %edx
 ; AVX1-NEXT:    decl %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm3
-; AVX1-NEXT:    vpextrw $7, %xmm2, %ecx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm2
+; AVX1-NEXT:    vmovd %ecx, %xmm13
+; AVX1-NEXT:    vpextrw $0, %xmm2, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm14
+; AVX1-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX1-NEXT:    decq %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm15
+; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
+; AVX1-NEXT:    decq %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm2
 ; AVX1-NEXT:    decl %eax
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX1-NEXT:    vmovd %eax, %xmm5
-; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT:    vmovd %ecx, %xmm7
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT:    vmovd %edx, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm1
 ; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index f66f0c0ceabc..cc58bc1e44f3 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -628,13 +628,19 @@ define half @s128_to_half(i128 %x) {
 ;
 ; X86-LABEL: s128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __floattihf
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %a = sitofp i128 %x to half
   ret half %a
@@ -713,13 +719,19 @@ define half @u128_to_half(i128 %x) {
 ;
 ; X86-LABEL: u128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __floatuntihf
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %a = uitofp i128 %x to half
   ret half %a
@@ -1020,11 +1032,15 @@ define half @f128_to_half(fp128 %x) nounwind {
 ;
 ; X86-LABEL: f128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __trunctfhf2
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %a = fptrunc fp128 %x to half
   ret half %a
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 4fc0827ac4dd..33381313d3c1 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -146,37 +146,40 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind {
 define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ; X86-LABEL: bitselect_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    andl 56(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    andl 60(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl 48(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    andl 64(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    andl 68(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-NOBMI-LABEL: bitselect_i128:
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 312f94c04123..143e10e6909e 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -263,70 +263,78 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebp, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    rep bsfl %edi, %esi
-; X86-NEXT:    addl $32, %esi
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB8_7
-; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    rep bsfl %eax, %edx
-; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:    rep bsfl %esi, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    jmp .LBB8_5
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    rep bsfl %ecx, %esi
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:  .LBB8_5: # %cond.false
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB8_6
-; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    rep bsfl %ebp, %edx
+; X86-NEXT:  # %bb.7: # %cond.false
+; X86-NEXT:    rep bsfl %ebx, %edx
 ; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    rep bsfl %edi, %edx
 ; X86-NEXT:  .LBB8_8: # %cond.false
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  # %bb.9: # %cond.false
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    jne .LBB8_13
-; X86-NEXT:  # %bb.12:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:  .LBB8_13: # %cond.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB8_12
+; X86-NEXT:  # %bb.13: # %cond.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB8_14
+; X86-NEXT:  .LBB8_12:
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:  .LBB8_14: # %cond.end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -361,46 +369,49 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    orl %ebx, %ebp
-; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    je .LBB9_11
 ; X86-NEXT:  # %bb.1: # %select.true.sink
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_2
 ; X86-NEXT:  # %bb.3: # %select.true.sink
-; X86-NEXT:    rep bsfl %ecx, %edi
-; X86-NEXT:    addl $32, %edi
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    rep bsfl %ecx, %ebx
+; X86-NEXT:    addl $32, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    je .LBB9_6
 ; X86-NEXT:  .LBB9_5:
-; X86-NEXT:    rep bsfl %ebx, %esi
+; X86-NEXT:    rep bsfl %edi, %esi
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    je .LBB9_8
 ; X86-NEXT:    jmp .LBB9_9
 ; X86-NEXT:  .LBB9_11: # %select.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    movl 40(%ebp), %edi
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    jmp .LBB9_10
 ; X86-NEXT:  .LBB9_2:
-; X86-NEXT:    rep bsfl %edx, %edi
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    rep bsfl %edx, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB9_5
 ; X86-NEXT:  .LBB9_6: # %select.true.sink
 ; X86-NEXT:    rep bsfl %esi, %esi
@@ -409,13 +420,14 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    jne .LBB9_9
 ; X86-NEXT:  .LBB9_8: # %select.true.sink
 ; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:  .LBB9_9: # %select.true.sink
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
 ; X86-NEXT:  .LBB9_10: # %select.true.sink
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index fbca4af425ea..ab0478a4e944 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -291,79 +291,80 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebp, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    bsrl %ebx, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    je .LBB8_7
 ; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    jmp .LBB8_8
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl $128, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl $128, %esi
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    bsrl %ebp, %edx
-; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB8_6
 ; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    bsrl %ecx, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:  .LBB8_8: # %cond.false
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    orl 36(%ebp), %edx
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  # %bb.9: # %cond.false
-; X86-NEXT:    orl $64, %esi
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl 36(%ebp), %edi
 ; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    je .LBB8_12
 ; X86-NEXT:  # %bb.13: # %cond.end
-; X86-NEXT:    xorl $127, %edx
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $127, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    jmp .LBB8_14
 ; X86-NEXT:  .LBB8_12:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %esi
 ; X86-NEXT:  .LBB8_14: # %cond.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -398,62 +399,67 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    bsrl %esi, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
-; X86-NEXT:    bsrl %edi, %ecx
-; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    bsrl %ebx, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    orl $32, %ebp
-; X86-NEXT:    jmp .LBB9_6
+; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    je .LBB9_7
+; X86-NEXT:    jmp .LBB9_8
 ; X86-NEXT:  .LBB9_4:
-; X86-NEXT:    bsrl %edx, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:  .LBB9_6:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    jne .LBB9_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    orl $64, %ebp
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:  .LBB9_7:
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:  .LBB9_8:
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl 32(%ebp), %ebx
 ; X86-NEXT:    orl %edx, %ebx
 ; X86-NEXT:    jne .LBB9_9
 ; X86-NEXT:  # %bb.10:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %ecx
 ; X86-NEXT:    jmp .LBB9_11
 ; X86-NEXT:  .LBB9_9:
-; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    xorl $127, %esi
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll
index 6d5e995a6d57..673b7f16de75 100644
--- a/llvm/test/CodeGen/X86/bswap-wide-int.ll
+++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll
@@ -41,13 +41,16 @@ define i64 @bswap_i64(i64 %a0) nounwind {
 define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-LABEL: bswap_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    bswapl %edi
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    bswapl %edx
@@ -56,25 +59,32 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-MOVBE-LABEL: bswap_i128:
 ; X86-MOVBE:       # %bb.0:
+; X86-MOVBE-NEXT:    pushl %ebp
+; X86-MOVBE-NEXT:    movl %esp, %ebp
 ; X86-MOVBE-NEXT:    pushl %edi
 ; X86-MOVBE-NEXT:    pushl %esi
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-MOVBE-NEXT:    andl $-16, %esp
+; X86-MOVBE-NEXT:    movl 8(%ebp), %eax
+; X86-MOVBE-NEXT:    movl 32(%ebp), %ecx
+; X86-MOVBE-NEXT:    movl 36(%ebp), %edx
+; X86-MOVBE-NEXT:    movl 24(%ebp), %esi
+; X86-MOVBE-NEXT:    movl 28(%ebp), %edi
 ; X86-MOVBE-NEXT:    movbel %esi, 12(%eax)
 ; X86-MOVBE-NEXT:    movbel %edi, 8(%eax)
 ; X86-MOVBE-NEXT:    movbel %ecx, 4(%eax)
 ; X86-MOVBE-NEXT:    movbel %edx, (%eax)
+; X86-MOVBE-NEXT:    leal -8(%ebp), %esp
 ; X86-MOVBE-NEXT:    popl %esi
 ; X86-MOVBE-NEXT:    popl %edi
+; X86-MOVBE-NEXT:    popl %ebp
 ; X86-MOVBE-NEXT:    retl $4
 ;
 ; X64-LABEL: bswap_i128:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d869f8ec01a5..455b72d16a07 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,17 +152,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $176, %esp
-; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -172,16 +172,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 48(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %ebx
 ; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %ebx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bsrl %edi, %edi
 ; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    orl $32, %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %edi
-; X86-NEXT:    addl $64, %edi
+; X86-NEXT:    orl $64, %edi
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    bsrl %ebx, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    orl $64, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -488,13 +487,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    sbbl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 56(%ebp), %ecx
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl %ebx, 8(%ecx)
 ; X86-NEXT:    movl %esi, 12(%ecx)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -508,7 +507,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -523,17 +522,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %edx
@@ -543,13 +542,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 7bbddefd8272..adcfee5959af 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,60 +152,60 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 28(%ebp), %ebx
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    orl 36(%ebp), %ecx
+; X86-NEXT:    orl 48(%ebp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl 16(%ebp), %eax
-; X86-NEXT:    orl 24(%ebp), %eax
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    orl 20(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl 36(%ebp), %ecx
+; X86-NEXT:    bsrl 48(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl %ebx, %eax
 ; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    addl $64, %eax
-; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edi
 ; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl 12(%ebp), %edx
+; X86-NEXT:    bsrl 24(%ebp), %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    orl $64, %edx
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -237,30 +237,30 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    movb %cl, %ah
-; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    cmovnel %esi, %ebx
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    cmovnel %esi, %ecx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    orb %ah, %al
-; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %eax
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, %ecx
@@ -289,7 +289,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    xorl %edx, %edx
@@ -299,13 +299,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -334,16 +334,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl 52(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl %eax, %eax
@@ -391,13 +391,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 40(%ebp), %eax
+; X86-NEXT:    andl 52(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 36(%ebp), %eax
+; X86-NEXT:    andl 48(%ebp), %eax
 ; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl 32(%ebp), %edx
-; X86-NEXT:    andl 28(%ebp), %ecx
+; X86-NEXT:    andl 44(%ebp), %edx
+; X86-NEXT:    andl 40(%ebp), %ecx
 ; X86-NEXT:    subl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -437,7 +437,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %eax
 ; X86-NEXT:  .LBB4_7: # %udiv-end
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
@@ -446,23 +446,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 52(%ebp), %edi
 ; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull 28(%ebp), %ebx
+; X86-NEXT:    imull 40(%ebp), %ebx
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edx
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -471,7 +471,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -483,26 +483,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ebx, (%eax)
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
similarity index 99%
rename from llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
rename to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
index 17de405928d3..3810e227bea9 100644
--- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
+++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
-# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+#
+# Check that the computed goto and others are restricted by tail-dup-pred-size and tail-dup-succ-size.
+#
 --- |
   @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
   declare i64 @f0()
diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index 707b05f3478d..bb5640aeb66f 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -481,18 +481,21 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixtfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -501,7 +504,7 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -620,18 +623,21 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunstfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -640,7 +646,7 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -818,18 +824,21 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floattitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -838,7 +847,7 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -1016,18 +1025,21 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floatuntitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1036,7 +1048,7 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 1de2484d47ba..6d4ec063ccd4 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -415,16 +415,20 @@ define dso_local void @TestFPToSIF128_I128() nounwind {
 ; X86-LABEL: TestFPToSIF128_I128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl vf128+4, %ecx
+; X86-NEXT:    movl vf128+8, %edx
+; X86-NEXT:    movl vf128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixtfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -432,7 +436,7 @@ define dso_local void @TestFPToSIF128_I128() nounwind {
 ; X86-NEXT:    movl %edx, vi128+8
 ; X86-NEXT:    movl %ecx, vi128+4
 ; X86-NEXT:    movl %eax, vi128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -466,16 +470,20 @@ define dso_local void @TestFPToUIF128_U128() nounwind {
 ; X86-LABEL: TestFPToUIF128_U128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl vf128+4, %ecx
+; X86-NEXT:    movl vf128+8, %edx
+; X86-NEXT:    movl vf128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunstfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -483,7 +491,7 @@ define dso_local void @TestFPToUIF128_U128() nounwind {
 ; X86-NEXT:    movl %edx, vu128+8
 ; X86-NEXT:    movl %ecx, vu128+4
 ; X86-NEXT:    movl %eax, vu128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -913,16 +921,20 @@ define dso_local void @TestSIToFPI128_F128() nounwind {
 ; X86-LABEL: TestSIToFPI128_F128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vi128, %eax
+; X86-NEXT:    movl vi128+4, %ecx
+; X86-NEXT:    movl vi128+8, %edx
+; X86-NEXT:    movl vi128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vi128+12
-; X86-NEXT:    pushl vi128+8
-; X86-NEXT:    pushl vi128+4
-; X86-NEXT:    pushl vi128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floattitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -930,7 +942,7 @@ define dso_local void @TestSIToFPI128_F128() nounwind {
 ; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -964,16 +976,20 @@ define dso_local void @TestUIToFPU128_F128() #2 {
 ; X86-LABEL: TestUIToFPU128_F128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vu128, %eax
+; X86-NEXT:    movl vu128+4, %ecx
+; X86-NEXT:    movl vu128+8, %edx
+; X86-NEXT:    movl vu128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vu128+12
-; X86-NEXT:    pushl vu128+8
-; X86-NEXT:    pushl vu128+4
-; X86-NEXT:    pushl vu128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floatuntitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -981,7 +997,7 @@ define dso_local void @TestUIToFPU128_F128() #2 {
 ; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1134,33 +1150,30 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind {
 ;
 ; X86-LABEL: TestBits128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    orl (%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestBits128:
@@ -1359,12 +1372,14 @@ define i1 @PR34866(i128 %x) nounwind {
 ;
 ; X86-LABEL: PR34866:
 ; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: PR34866:
@@ -1394,12 +1409,14 @@ define i1 @PR34866_commute(i128 %x) nounwind {
 ;
 ; X86-LABEL: PR34866_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: PR34866_commute:
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index a7eea04181f6..ad2d690fd7ed 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -41,27 +41,40 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: add:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: add:
@@ -81,24 +94,32 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -107,9 +128,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -141,27 +163,40 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: sub:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sub:
@@ -181,24 +216,32 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -207,9 +250,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -241,27 +285,40 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: mul:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: mul:
@@ -281,24 +338,32 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -307,9 +372,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -341,27 +407,40 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: div:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: div:
@@ -381,24 +460,32 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -407,9 +494,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -434,31 +522,48 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ;
 ; X86-LABEL: fma:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
-; X86-NEXT:    addl $60, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%ebp)
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: fma:
@@ -481,28 +586,40 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $96, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 56(%ebp)
-; WIN-X86-NEXT:    pushl 52(%ebp)
-; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ebx
+; WIN-X86-NEXT:    movl 56(%ebp), %edi
+; WIN-X86-NEXT:    movl 60(%ebp), %edx
+; WIN-X86-NEXT:    movl 64(%ebp), %ecx
+; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmal
-; WIN-X86-NEXT:    addl $52, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -511,9 +628,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -538,27 +656,40 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: frem:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: frem:
@@ -578,24 +709,32 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -604,9 +743,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -631,23 +771,28 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: ceil:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll ceilf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: ceil:
@@ -667,17 +812,20 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _ceill
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -713,23 +861,28 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: acos:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll acosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: acos:
@@ -749,17 +902,20 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _acosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -795,23 +951,28 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cos:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll cosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cos:
@@ -831,17 +992,20 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _cosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -877,23 +1041,28 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cosh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll coshf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cosh:
@@ -913,17 +1082,20 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _coshl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -959,23 +1131,28 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll expf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp:
@@ -995,17 +1172,20 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _expl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1041,23 +1221,28 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll exp2f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp2:
@@ -1077,17 +1262,20 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _exp2l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1123,23 +1311,28 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: floor:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll floorf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: floor:
@@ -1159,17 +1352,20 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _floorl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1205,23 +1401,28 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll logf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log:
@@ -1241,17 +1442,20 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _logl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1287,23 +1491,28 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log10:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll log10f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log10:
@@ -1323,17 +1532,20 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _log10l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1369,23 +1581,28 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll log2f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log2:
@@ -1405,17 +1622,20 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _log2l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1451,27 +1671,40 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: maxnum:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaxf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: maxnum:
@@ -1491,24 +1724,32 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmaxl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1517,9 +1758,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1544,27 +1786,40 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: minnum:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fminf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: minnum:
@@ -1584,24 +1839,32 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fminl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1610,9 +1873,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1637,23 +1901,28 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: nearbyint:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll nearbyintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: nearbyint:
@@ -1673,17 +1942,20 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _nearbyintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1719,27 +1991,40 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: pow:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll powf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: pow:
@@ -1759,24 +2044,32 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _powl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1785,9 +2078,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1819,24 +2113,32 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ;
 ; X86-LABEL: powi:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __powitf2
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $64, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: powi:
@@ -1853,21 +2155,26 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___powitf2
-; WIN-X86-NEXT:    addl $24, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1876,9 +2183,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1903,23 +2211,28 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: rint:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll rintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: rint:
@@ -1939,17 +2252,20 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _rintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1985,23 +2301,28 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: round:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: round:
@@ -2021,17 +2342,20 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2067,23 +2391,28 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: roundeven:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundevenf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: roundeven:
@@ -2103,17 +2432,20 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundevenl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2149,23 +2481,28 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: asin:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll asinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: asin:
@@ -2185,17 +2522,20 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _asinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2231,23 +2571,28 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sin:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sin:
@@ -2267,17 +2612,20 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2313,23 +2661,28 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sinh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sinh:
@@ -2349,17 +2702,20 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2395,23 +2751,28 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sqrt:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sqrtf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sqrt:
@@ -2431,17 +2792,20 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sqrtl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2477,23 +2841,28 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: atan:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan:
@@ -2513,17 +2882,20 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2559,27 +2931,40 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: atan2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan2:
@@ -2599,24 +2984,32 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atan2l
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2625,9 +3018,10 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2652,23 +3046,28 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tan:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tan:
@@ -2688,17 +3087,20 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2734,23 +3136,28 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tanh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tanh:
@@ -2770,17 +3177,20 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2816,23 +3226,28 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: trunc:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll truncf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: trunc:
@@ -2852,17 +3267,20 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _truncl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2919,12 +3337,18 @@ define i32 @lrint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: lrint:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _lrintl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %rint = call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -2969,12 +3393,18 @@ define i64 @llrint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: llrint:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _llrintl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -3019,12 +3449,18 @@ define i32 @lround(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: lround:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _lroundl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -3069,12 +3505,18 @@ define i64 @llround(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: llround:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _llroundl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -3176,26 +3618,32 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; WIN-X86-LABEL: cmp:
 ; WIN-X86:       # %bb.0:
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    je LBB37_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB37_3
 ; WIN-X86-NEXT:  LBB37_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB37_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmp.f128(
                                                fp128 %x, fp128 %y,
@@ -3300,26 +3748,32 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; WIN-X86-LABEL: cmps:
 ; WIN-X86:       # %bb.0:
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    je LBB38_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB38_3
 ; WIN-X86-NEXT:  LBB38_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB38_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmps.f128(
                                                fp128 %x, fp128 %y,
@@ -3496,44 +3950,47 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-LABEL: cmp_ueq_q:
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    sete %bl
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    orb %bl, %al
 ; WIN-X86-NEXT:    jne LBB39_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB39_3
 ; WIN-X86-NEXT:  LBB39_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB39_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
 ; WIN-X86-NEXT:    popl %ebx
@@ -3716,32 +4173,34 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-LABEL: cmp_one_q:
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    setne %bl
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
@@ -3749,13 +4208,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    testb %bl, %al
 ; WIN-X86-NEXT:    jne LBB40_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB40_3
 ; WIN-X86-NEXT:  LBB40_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB40_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
 ; WIN-X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index f727a7907862..4b0449fd7502 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -42,22 +42,38 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Add:
@@ -78,22 +94,31 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -101,8 +126,10 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -144,22 +171,38 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Add:
@@ -180,22 +223,31 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -203,8 +255,10 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -241,22 +295,38 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sub:
@@ -277,22 +347,31 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -300,8 +379,10 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -343,22 +424,38 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Sub:
@@ -379,22 +476,31 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -402,8 +508,10 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -440,22 +548,38 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Mul:
@@ -476,22 +600,31 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -499,8 +632,10 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -542,22 +677,38 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Mul:
@@ -578,22 +729,31 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -601,8 +761,10 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -639,22 +801,38 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Div:
@@ -675,22 +853,31 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -698,8 +885,10 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -741,22 +930,38 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Div:
@@ -777,22 +982,31 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -800,8 +1014,10 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -830,22 +1046,38 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rem:
@@ -866,22 +1098,31 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -889,8 +1130,10 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -922,22 +1165,38 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Rem:
@@ -958,22 +1217,31 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -981,8 +1249,10 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1011,18 +1281,24 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sqrt:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sqrtf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sqrt:
@@ -1042,16 +1318,19 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sqrtl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1089,18 +1368,24 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sin:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sin:
@@ -1120,16 +1405,19 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1167,18 +1455,24 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Cos:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll cosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Cos:
@@ -1198,16 +1492,19 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _cosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1245,18 +1542,24 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Ceil:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll ceilf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Ceil:
@@ -1276,16 +1579,19 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _ceill
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1323,18 +1629,24 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Floor:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll floorf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Floor:
@@ -1354,16 +1666,19 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _floorl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1401,18 +1716,24 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Trunc:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll truncf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Trunc:
@@ -1432,16 +1753,19 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _truncl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1479,18 +1803,24 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Nearbyint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll nearbyintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Nearbyint:
@@ -1510,16 +1840,19 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _nearbyintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1557,18 +1890,24 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Rint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll rintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rint:
@@ -1588,16 +1927,19 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _rintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1635,18 +1977,24 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Round:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Round:
@@ -1666,16 +2014,19 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1705,31 +2056,48 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ;
 ; X86-LABEL: Test128FMA:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
-; X86-NEXT:    addl $60, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%ebp)
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128FMA:
@@ -1752,28 +2120,40 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $96, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 56(%ebp)
-; WIN-X86-NEXT:    pushl 52(%ebp)
-; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ebx
+; WIN-X86-NEXT:    movl 56(%ebp), %edi
+; WIN-X86-NEXT:    movl 60(%ebp), %edx
+; WIN-X86-NEXT:    movl 64(%ebp), %ecx
+; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmal
-; WIN-X86-NEXT:    addl $52, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1782,9 +2162,10 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1804,23 +2185,28 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Acos:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll acosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Acos:
@@ -1840,17 +2226,20 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _acosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1879,23 +2268,28 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Asin:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll asinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Asin:
@@ -1915,17 +2309,20 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _asinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1954,23 +2351,28 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Atan:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan:
@@ -1990,17 +2392,20 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2029,27 +2434,40 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ;
 ; X86-LABEL: Test128Atan2:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan2:
@@ -2069,24 +2487,32 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atan2l
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2095,9 +2521,10 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
@@ -2115,23 +2542,28 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Cosh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll coshf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Cosh:
@@ -2151,17 +2583,20 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _coshl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2190,23 +2625,28 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Sinh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Sinh:
@@ -2226,17 +2666,20 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2265,23 +2708,28 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tan:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tan:
@@ -2301,17 +2749,20 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2340,23 +2791,28 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tanh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tanh:
@@ -2376,17 +2832,20 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2425,27 +2884,34 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Modf:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %ecx
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll modff128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
 ; X86-NEXT:    movaps %xmm1, 16(%esi)
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $40, %esp
+; X86-NEXT:    addl $80, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Modf:
@@ -2468,18 +2934,21 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    subl $112, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    pushl %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _modfl
-; WIN-X86-NEXT:    addl $24, %esp
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 0f66d42697d9..953a5e7285fe 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovdqa (%ecx), %xmm0
-; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%edx), %xmm0
+; X86-NEXT:    vpand (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_extractelement:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rsi), %xmm0
-; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
   %i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vmovdqa (%edx), %xmm0
-; X86-NEXT:    vpand (%esi), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%esi), %xmm0
+; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ;
 ; X64-LABEL: freeze_extractelement_escape:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rsi), %xmm0
-; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 12(%ebp), %esi
 ; X86-NEXT:    movl 8(%ebp), %edi
-; X86-NEXT:    vmovaps (%esi), %xmm0
-; X86-NEXT:    vandps (%edi), %xmm0, %xmm0
+; X86-NEXT:    vmovaps (%edi), %xmm0
+; X86-NEXT:    vandps (%esi), %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-NEXT:    movzbl (%esp,%ecx), %ecx
 ; X86-NEXT:    cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $15, %ecx
 ; X64-NEXT:    andl $15, %edx
-; X64-NEXT:    vmovaps (%rsi), %xmm0
-; X64-NEXT:    vandps (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandps (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -24(%rsp,%rdx), %eax
 ; X64-NEXT:    cmpb -24(%rsp,%rcx), %al
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index e8c8ccfa8d37..ec1b8a3c8d6d 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -264,53 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i128:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    movl %esp, %ebp
 ; X86-FAST-NEXT:    pushl %ebx
 ; X86-FAST-NEXT:    pushl %edi
 ; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    andl $-16, %esp
+; X86-FAST-NEXT:    subl $16, %esp
+; X86-FAST-NEXT:    movl 24(%ebp), %edi
+; X86-FAST-NEXT:    movl 28(%ebp), %edx
+; X86-FAST-NEXT:    movl 48(%ebp), %esi
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
 ; X86-FAST-NEXT:    testb $64, %cl
+; X86-FAST-NEXT:    movl 52(%ebp), %eax
 ; X86-FAST-NEXT:    jne .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %ebx, %ebp
 ; X86-FAST-NEXT:    movl %esi, %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl %edi, %eax
-; X86-FAST-NEXT:    movl %edx, %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl %edi, %esi
+; X86-FAST-NEXT:    movl 32(%ebp), %edi
+; X86-FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edx, %eax
+; X86-FAST-NEXT:    movl 36(%ebp), %edx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    je .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %esi, %edx
-; X86-FAST-NEXT:    movl %edi, %esi
-; X86-FAST-NEXT:    movl %ebx, %edi
-; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %esi, %eax
+; X86-FAST-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-FAST-NEXT:    jmp .LBB6_6
 ; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl 44(%ebp), %ebx
+; X86-FAST-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl 40(%ebp), %ebx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    jne .LBB6_4
 ; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    movl %eax, %ebp
+; X86-FAST-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-FAST-NEXT:  .LBB6_6:
-; X86-FAST-NEXT:    movl %ebx, %eax
-; X86-FAST-NEXT:    shldl %cl, %ebp, %eax
-; X86-FAST-NEXT:    movl %edi, %ebp
-; X86-FAST-NEXT:    shldl %cl, %ebx, %ebp
-; X86-FAST-NEXT:    movl %esi, %ebx
-; X86-FAST-NEXT:    shldl %cl, %edi, %ebx
+; X86-FAST-NEXT:    movl %esi, %edi
+; X86-FAST-NEXT:    shldl %cl, %ebx, %edi
+; X86-FAST-NEXT:    movl %eax, %edx
+; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    shldl %cl, %esi, %ebx
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-FAST-NEXT:    movl %eax, %esi
+; X86-FAST-NEXT:    shldl %cl, %edx, %esi
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT:    shldl %cl, %esi, %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT:    movl %edx, 12(%ecx)
-; X86-FAST-NEXT:    movl %ebx, 8(%ecx)
-; X86-FAST-NEXT:    movl %ebp, 4(%ecx)
-; X86-FAST-NEXT:    movl %eax, (%ecx)
-; X86-FAST-NEXT:    movl %ecx, %eax
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-FAST-NEXT:    shldl %cl, %eax, %edx
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %edx, 12(%eax)
+; X86-FAST-NEXT:    movl %esi, 8(%eax)
+; X86-FAST-NEXT:    movl %ebx, 4(%eax)
+; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
 ; X86-FAST-NEXT:    popl %ebx
@@ -320,77 +329,91 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-LABEL: var_shift_i128:
 ; X86-SLOW:       # %bb.0:
 ; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    movl %esp, %ebp
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    pushl %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    testb $64, %al
+; X86-SLOW-NEXT:    andl $-16, %esp
+; X86-SLOW-NEXT:    subl $32, %esp
+; X86-SLOW-NEXT:    movl 24(%ebp), %esi
+; X86-SLOW-NEXT:    movl 28(%ebp), %eax
+; X86-SLOW-NEXT:    movl 48(%ebp), %edx
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    testb $64, %cl
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
 ; X86-SLOW-NEXT:    jne .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %edx, %ebp
-; X86-SLOW-NEXT:    movl %ebx, %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %esi, %edx
+; X86-SLOW-NEXT:    movl 32(%ebp), %esi
 ; X86-SLOW-NEXT:    movl %edi, %ecx
-; X86-SLOW-NEXT:    movl %esi, %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    testb $32, %al
-; X86-SLOW-NEXT:    je .LBB6_5
-; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edi, %ebx
-; X86-SLOW-NEXT:    movl %edx, %edi
-; X86-SLOW-NEXT:    movl %ecx, %edx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl 36(%ebp), %eax
+; X86-SLOW-NEXT:    jmp .LBB6_3
 ; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    testb $32, %al
+; X86-SLOW-NEXT:    movl 40(%ebp), %ecx
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 44(%ebp), %ecx
+; X86-SLOW-NEXT:  .LBB6_3:
+; X86-SLOW-NEXT:    movl 56(%ebp), %ebx
+; X86-SLOW-NEXT:    testb $32, %bl
 ; X86-SLOW-NEXT:    jne .LBB6_4
-; X86-SLOW-NEXT:  .LBB6_5:
-; X86-SLOW-NEXT:    movl %ecx, %ebp
-; X86-SLOW-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %ecx, %edx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-SLOW-NEXT:  .LBB6_6:
 ; X86-SLOW-NEXT:    movl %edx, %esi
-; X86-SLOW-NEXT:    movl %eax, %ecx
-; X86-SLOW-NEXT:    shll %cl, %esi
-; X86-SLOW-NEXT:    shrl %ebp
-; X86-SLOW-NEXT:    movb %al, %ch
-; X86-SLOW-NEXT:    notb %ch
-; X86-SLOW-NEXT:    movb %ch, %cl
-; X86-SLOW-NEXT:    shrl %cl, %ebp
-; X86-SLOW-NEXT:    orl %esi, %ebp
-; X86-SLOW-NEXT:    movl %edi, %esi
-; X86-SLOW-NEXT:    movb %al, %cl
-; X86-SLOW-NEXT:    shll %cl, %esi
-; X86-SLOW-NEXT:    shrl %edx
-; X86-SLOW-NEXT:    movb %ch, %cl
-; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    orl %esi, %edx
-; X86-SLOW-NEXT:    movl %ebx, %esi
-; X86-SLOW-NEXT:    movb %al, %cl
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    movl %ebx, %edi
 ; X86-SLOW-NEXT:    shrl %edi
-; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    notb %bl
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    orl %esi, %edi
-; X86-SLOW-NEXT:    movb %al, %cl
-; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SLOW-NEXT:    movl %esi, %eax
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shrl %edx
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edx
+; X86-SLOW-NEXT:    orl %eax, %edx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    movl %ebx, %eax
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shrl %esi
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SLOW-NEXT:    shrl %cl, %esi
+; X86-SLOW-NEXT:    orl %eax, %esi
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    shll %cl, %eax
 ; X86-SLOW-NEXT:    shrl %ebx
-; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-SLOW-NEXT:    shrl %cl, %ebx
 ; X86-SLOW-NEXT:    orl %eax, %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl 8(%ebp), %eax
 ; X86-SLOW-NEXT:    movl %ebx, 12(%eax)
-; X86-SLOW-NEXT:    movl %edi, 8(%eax)
+; X86-SLOW-NEXT:    movl %esi, 8(%eax)
 ; X86-SLOW-NEXT:    movl %edx, 4(%eax)
-; X86-SLOW-NEXT:    movl %ebp, (%eax)
-; X86-SLOW-NEXT:    addl $4, %esp
+; X86-SLOW-NEXT:    movl %edi, (%eax)
+; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
 ; X86-SLOW-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 4340f8fd484a..544ab7fc7737 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -258,51 +258,53 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i128:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    movl %esp, %ebp
 ; X86-FAST-NEXT:    pushl %ebx
 ; X86-FAST-NEXT:    pushl %edi
 ; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    pushl %eax
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    andl $-16, %esp
+; X86-FAST-NEXT:    subl $16, %esp
+; X86-FAST-NEXT:    movl 24(%ebp), %esi
+; X86-FAST-NEXT:    movl 28(%ebp), %eax
+; X86-FAST-NEXT:    movl 48(%ebp), %edx
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
 ; X86-FAST-NEXT:    testb $64, %cl
+; X86-FAST-NEXT:    movl 52(%ebp), %ebx
 ; X86-FAST-NEXT:    je .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl %edi, %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl %esi, %ebp
-; X86-FAST-NEXT:    movl %ebx, %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %esi, %edx
+; X86-FAST-NEXT:    movl 32(%ebp), %esi
+; X86-FAST-NEXT:    movl %ebx, %edi
+; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    movl 36(%ebp), %eax
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    je .LBB6_4
 ; X86-FAST-NEXT:    jmp .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-FAST-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-FAST-NEXT:    movl 40(%ebp), %edi
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl 44(%ebp), %edi
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    jne .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %edi, %ebx
-; X86-FAST-NEXT:    movl %esi, %edi
-; X86-FAST-NEXT:    movl %edx, %esi
-; X86-FAST-NEXT:    movl %ebp, %edx
-; X86-FAST-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-FAST-NEXT:    movl %esi, %eax
+; X86-FAST-NEXT:    movl %ebx, %esi
+; X86-FAST-NEXT:    movl %edx, %ebx
+; X86-FAST-NEXT:    movl %edi, %edx
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    shrdl %cl, %edx, %ebp
-; X86-FAST-NEXT:    shrdl %cl, %esi, %edx
-; X86-FAST-NEXT:    shrdl %cl, %edi, %esi
+; X86-FAST-NEXT:    shrdl %cl, %edx, %edi
+; X86-FAST-NEXT:    shrdl %cl, %ebx, %edx
+; X86-FAST-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT:    shrdl %cl, %ebx, %edi
-; X86-FAST-NEXT:    movl %edi, 12(%eax)
-; X86-FAST-NEXT:    movl %esi, 8(%eax)
+; X86-FAST-NEXT:    shrdl %cl, %eax, %esi
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %esi, 12(%eax)
+; X86-FAST-NEXT:    movl %ebx, 8(%eax)
 ; X86-FAST-NEXT:    movl %edx, 4(%eax)
-; X86-FAST-NEXT:    movl %ebp, (%eax)
-; X86-FAST-NEXT:    addl $4, %esp
+; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
 ; X86-FAST-NEXT:    popl %ebx
@@ -312,78 +314,88 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-LABEL: var_shift_i128:
 ; X86-SLOW:       # %bb.0:
 ; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    movl %esp, %ebp
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    subl $8, %esp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    testb $64, %cl
+; X86-SLOW-NEXT:    andl $-16, %esp
+; X86-SLOW-NEXT:    subl $16, %esp
+; X86-SLOW-NEXT:    movl 24(%ebp), %edx
+; X86-SLOW-NEXT:    movl 28(%ebp), %esi
+; X86-SLOW-NEXT:    movl 48(%ebp), %ebx
+; X86-SLOW-NEXT:    movl 56(%ebp), %eax
+; X86-SLOW-NEXT:    testb $64, %al
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
 ; X86-SLOW-NEXT:    je .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %ebp, %eax
-; X86-SLOW-NEXT:    movl %ebx, %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl %edi, %edx
+; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, %ebx
+; X86-SLOW-NEXT:    movl 32(%ebp), %edx
+; X86-SLOW-NEXT:    movl %edi, %eax
 ; X86-SLOW-NEXT:    movl %esi, %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    testb $32, %cl
-; X86-SLOW-NEXT:    jne .LBB6_5
-; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %ebx, %esi
-; X86-SLOW-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %ebp, %edi
-; X86-SLOW-NEXT:    movl %edx, %ebp
-; X86-SLOW-NEXT:    movl %eax, %edx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    movl 36(%ebp), %esi
+; X86-SLOW-NEXT:    jmp .LBB6_3
 ; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl 40(%ebp), %eax
+; X86-SLOW-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 44(%ebp), %eax
+; X86-SLOW-NEXT:  .LBB6_3:
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    testb $32, %cl
 ; X86-SLOW-NEXT:    je .LBB6_4
-; X86-SLOW-NEXT:  .LBB6_5:
-; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    movl %edx, %esi
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, %ebx
+; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:  .LBB6_6:
-; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    movl %ecx, %ebx
-; X86-SLOW-NEXT:    notb %bl
-; X86-SLOW-NEXT:    leal (%ebp,%ebp), %eax
-; X86-SLOW-NEXT:    movl %ebx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %eax
-; X86-SLOW-NEXT:    orl %edx, %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl %cl, %eax
+; X86-SLOW-NEXT:    movl %eax, %edx
+; X86-SLOW-NEXT:    movl %ecx, %eax
+; X86-SLOW-NEXT:    notb %al
+; X86-SLOW-NEXT:    movl %ebx, %edi
+; X86-SLOW-NEXT:    addl %ebx, %ebx
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %ebx
+; X86-SLOW-NEXT:    orl %edx, %ebx
+; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %ebp
-; X86-SLOW-NEXT:    leal (%edi,%edi), %edx
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    leal (%ebx,%ebx), %edx
+; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %edx
-; X86-SLOW-NEXT:    orl %ebp, %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    orl %edi, %edx
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %edi
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-SLOW-NEXT:    leal (%edi,%edi), %ebp
-; X86-SLOW-NEXT:    movl %ebx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %ebp
-; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl %cl, %ebx
+; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-SLOW-NEXT:    leal (%edi,%edi), %ebx
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %ebx
+; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    addl %esi, %esi
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    orl %edi, %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl 8(%ebp), %ecx
 ; X86-SLOW-NEXT:    movl %esi, 12(%ecx)
-; X86-SLOW-NEXT:    movl %ebp, 8(%ecx)
+; X86-SLOW-NEXT:    movl %ebx, 8(%ecx)
 ; X86-SLOW-NEXT:    movl %edx, 4(%ecx)
+; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    movl %eax, (%ecx)
 ; X86-SLOW-NEXT:    movl %ecx, %eax
-; X86-SLOW-NEXT:    addl $8, %esp
+; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
 ; X86-SLOW-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index a464d78f9af3..df97f49440f7 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -74,43 +74,57 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SSE2-LABEL: fshl_i128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 48(%ebp), %edi
+; X86-SSE2-NEXT:    movl 52(%ebp), %eax
+; X86-SSE2-NEXT:    movl 24(%ebp), %edx
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $64, %cl
-; X86-SSE2-NEXT:    movl %esi, %eax
-; X86-SSE2-NEXT:    cmovnel %ebx, %eax
-; X86-SSE2-NEXT:    movl %edx, %ebp
-; X86-SSE2-NEXT:    cmovnel %edi, %ebp
-; X86-SSE2-NEXT:    cmovnel {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    cmovnel {{[0-9]+}}(%esp), %ebx
-; X86-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT:    movl %edx, %ecx
+; X86-SSE2-NEXT:    cmovnel %edi, %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%ebp), %esi
+; X86-SSE2-NEXT:    movl %esi, %ebx
+; X86-SSE2-NEXT:    cmovnel %eax, %ebx
+; X86-SSE2-NEXT:    cmovnel 44(%ebp), %eax
+; X86-SSE2-NEXT:    cmovnel 40(%ebp), %edi
+; X86-SSE2-NEXT:    cmovel 36(%ebp), %esi
+; X86-SSE2-NEXT:    cmovel 32(%ebp), %edx
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
-; X86-SSE2-NEXT:    cmovnel %esi, %edx
-; X86-SSE2-NEXT:    cmovnel %ebp, %esi
-; X86-SSE2-NEXT:    cmovnel %eax, %ebp
-; X86-SSE2-NEXT:    cmovel %edi, %ebx
+; X86-SSE2-NEXT:    cmovnel %edx, %esi
+; X86-SSE2-NEXT:    cmovnel %ebx, %edx
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    cmovnel %ecx, %ebx
 ; X86-SSE2-NEXT:    cmovel %eax, %edi
-; X86-SSE2-NEXT:    movl %edi, %eax
-; X86-SSE2-NEXT:    shldl %cl, %ebx, %eax
-; X86-SSE2-NEXT:    movl %ebp, %ebx
-; X86-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-SSE2-NEXT:    movl %esi, %edi
-; X86-SSE2-NEXT:    shldl %cl, %ebp, %edi
+; X86-SSE2-NEXT:    cmovel %ecx, %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
-; X86-SSE2-NEXT:    movl %edi, 8(%ecx)
-; X86-SSE2-NEXT:    movl %ebx, 4(%ecx)
-; X86-SSE2-NEXT:    movl %eax, (%ecx)
-; X86-SSE2-NEXT:    movl %ecx, %eax
+; X86-SSE2-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-SSE2-NEXT:    movl %ebx, %edi
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
+; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    shldl %cl, %eax, %edi
+; X86-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %edx, %edi
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
+; X86-SSE2-NEXT:    shldl %cl, %ebx, %edi
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    shldl %cl, %edx, %esi
+; X86-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-SSE2-NEXT:    movl %edi, 8(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    leal -12(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll
index 2849e448a053..b4546c1e983c 100644
--- a/llvm/test/CodeGen/X86/i128-add.ll
+++ b/llvm/test/CodeGen/X86/i128-add.ll
@@ -5,17 +5,20 @@
 define i128 @add_i128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: add_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    addl 40(%ebp), %esi
+; X86-NEXT:    adcl 44(%ebp), %edi
+; X86-NEXT:    adcl 48(%ebp), %ecx
+; X86-NEXT:    adcl 52(%ebp), %edx
 ; X86-NEXT:    addl $1, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ecx
@@ -24,8 +27,10 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: add_i128:
diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
index 4152dcf07f7e..2174d5056e6c 100644
--- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll
+++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
@@ -55,41 +55,47 @@ define void @store(PrimTy %x, ptr %p) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 16(%esp), %ecx
-; CHECK-X86-NEXT:    movl 20(%esp), %edx
-; CHECK-X86-NEXT:    movl 24(%esp), %esi
-; CHECK-X86-NEXT:    movl 28(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 20(%esp), %ecx
+; CHECK-X86-NEXT:    movl 24(%esp), %edx
+; CHECK-X86-NEXT:    movl 28(%esp), %esi
+; CHECK-X86-NEXT:    movl 32(%esp), %edi
 ; CHECK-X86-NEXT:    movl %esi, 12(%edi)
 ; CHECK-X86-NEXT:    movl %edx, 8(%edi)
 ; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-X86-NEXT:    movl %eax, (%edi)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
 ; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
 ; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
   ret void
 }
 
 ; Illustrate stack alignment
-; FIXME(#77401): alignment on x86-32 is ABI-incorrect.
 define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind {
 ; CHECK-X64-F128-LABEL: store_perturbed:
 ; CHECK-X64-F128:       # %bb.0:
@@ -130,34 +136,41 @@ define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 16(%esp), %eax
-; CHECK-X86-NEXT:    movl 20(%esp), %ecx
-; CHECK-X86-NEXT:    movl 24(%esp), %edx
-; CHECK-X86-NEXT:    movl 28(%esp), %esi
-; CHECK-X86-NEXT:    movl 32(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 32(%esp), %eax
+; CHECK-X86-NEXT:    movl 36(%esp), %ecx
+; CHECK-X86-NEXT:    movl 40(%esp), %edx
+; CHECK-X86-NEXT:    movl 44(%esp), %esi
+; CHECK-X86-NEXT:    movl 48(%esp), %edi
 ; CHECK-X86-NEXT:    movl %esi, 12(%edi)
 ; CHECK-X86-NEXT:    movl %edx, 8(%edi)
 ; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-X86-NEXT:    movl %eax, (%edi)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store_perturbed:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 32(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 28(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 40(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
 ; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
 ; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
   ret void
@@ -271,34 +284,41 @@ define PrimTy @first_arg(PrimTy %x) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 16(%esp), %ecx
-; CHECK-X86-NEXT:    movl 20(%esp), %edx
-; CHECK-X86-NEXT:    movl 24(%esp), %esi
-; CHECK-X86-NEXT:    movl 28(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 32(%esp), %ecx
+; CHECK-X86-NEXT:    movl 36(%esp), %edx
+; CHECK-X86-NEXT:    movl 40(%esp), %esi
+; CHECK-X86-NEXT:    movl 44(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: first_arg:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 28(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -344,34 +364,41 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 48(%esp), %ecx
-; CHECK-X86-NEXT:    movl 52(%esp), %edx
-; CHECK-X86-NEXT:    movl 56(%esp), %esi
-; CHECK-X86-NEXT:    movl 60(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 64(%esp), %ecx
+; CHECK-X86-NEXT:    movl 68(%esp), %edx
+; CHECK-X86-NEXT:    movl 72(%esp), %esi
+; CHECK-X86-NEXT:    movl 76(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: leading_args:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 48(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 52(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 56(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 60(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -417,34 +444,41 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 72(%esp), %ecx
-; CHECK-X86-NEXT:    movl 76(%esp), %edx
-; CHECK-X86-NEXT:    movl 80(%esp), %esi
-; CHECK-X86-NEXT:    movl 84(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 80(%esp), %ecx
+; CHECK-X86-NEXT:    movl 84(%esp), %edx
+; CHECK-X86-NEXT:    movl 88(%esp), %esi
+; CHECK-X86-NEXT:    movl 92(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 72(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 76(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 80(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 84(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 72(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 76(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 80(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 84(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -488,34 +522,41 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 56(%esp), %ecx
-; CHECK-X86-NEXT:    movl 60(%esp), %edx
-; CHECK-X86-NEXT:    movl 64(%esp), %esi
-; CHECK-X86-NEXT:    movl 68(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 64(%esp), %ecx
+; CHECK-X86-NEXT:    movl 68(%esp), %edx
+; CHECK-X86-NEXT:    movl 72(%esp), %esi
+; CHECK-X86-NEXT:    movl 76(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 56(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 60(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 64(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 68(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -571,32 +612,43 @@ define void @call_first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_first_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $56, %esp
+; CHECK-X86-NEXT:    movl 64(%esp), %eax
+; CHECK-X86-NEXT:    movl 68(%esp), %ecx
+; CHECK-X86-NEXT:    movl 72(%esp), %edx
+; CHECK-X86-NEXT:    movl 76(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 28(%esp)
+; CHECK-X86-NEXT:    movl %edx, 24(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 20(%esp)
+; CHECK-X86-NEXT:    movl %eax, 16(%esp)
+; CHECK-X86-NEXT:    leal 32(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
 ; CHECK-X86-NEXT:    calll first_arg@PLT
-; CHECK-X86-NEXT:    addl $56, %esp
+; CHECK-X86-NEXT:    addl $52, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_first_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $64, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 16(%esp)
+; CHECK-MSVC32-NEXT:    leal 32(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
 ; CHECK-MSVC32-NEXT:    calll _first_arg
-; CHECK-MSVC32-NEXT:    addl $20, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @first_arg(PrimTy %x)
@@ -686,48 +738,59 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    movl 96(%esp), %eax
+; CHECK-X86-NEXT:    movl 100(%esp), %ecx
+; CHECK-X86-NEXT:    movl 104(%esp), %edx
+; CHECK-X86-NEXT:    movl 108(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 60(%esp)
+; CHECK-X86-NEXT:    movl %edx, 56(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
+; CHECK-X86-NEXT:    movl %eax, 48(%esp)
+; CHECK-X86-NEXT:    leal 64(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll leading_args@PLT
-; CHECK-X86-NEXT:    addl $88, %esp
+; CHECK-X86-NEXT:    addl $84, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
+; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _leading_args
-; CHECK-MSVC32-NEXT:    addl $52, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
@@ -836,56 +899,67 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_many_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $104, %esp
+; CHECK-X86-NEXT:    movl 112(%esp), %eax
+; CHECK-X86-NEXT:    movl 116(%esp), %ecx
+; CHECK-X86-NEXT:    movl 120(%esp), %edx
+; CHECK-X86-NEXT:    movl 124(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 76(%esp)
+; CHECK-X86-NEXT:    movl %edx, 72(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 68(%esp)
+; CHECK-X86-NEXT:    movl %eax, 64(%esp)
+; CHECK-X86-NEXT:    leal 80(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 60(%esp)
+; CHECK-X86-NEXT:    movl $0, 56(%esp)
+; CHECK-X86-NEXT:    movl $0, 52(%esp)
+; CHECK-X86-NEXT:    movl $0, 48(%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll many_leading_args@PLT
-; CHECK-X86-NEXT:    addl $104, %esp
+; CHECK-X86-NEXT:    addl $100, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $112, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 76(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 72(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 68(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 64(%esp)
+; CHECK-MSVC32-NEXT:    leal 80(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 48(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _many_leading_args
-; CHECK-MSVC32-NEXT:    addl $68, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x)
@@ -975,48 +1049,59 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_trailing_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    movl 96(%esp), %eax
+; CHECK-X86-NEXT:    movl 100(%esp), %ecx
+; CHECK-X86-NEXT:    movl 104(%esp), %edx
+; CHECK-X86-NEXT:    movl 108(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 60(%esp)
+; CHECK-X86-NEXT:    movl %edx, 56(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
+; CHECK-X86-NEXT:    movl %eax, 48(%esp)
+; CHECK-X86-NEXT:    leal 64(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll trailing_arg@PLT
-; CHECK-X86-NEXT:    addl $88, %esp
+; CHECK-X86-NEXT:    addl $84, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
+; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _trailing_arg
-; CHECK-MSVC32-NEXT:    addl $52, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index 717f52f198ee..7d5757392c98 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -8,18 +8,21 @@
 define i128 @test1(i128 %x) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl $30, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    adcl 28(%ebp), %esi
+; X86-NEXT:    adcl 32(%ebp), %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    shrdl $2, %ecx, %edx
 ; X86-NEXT:    movl %ecx, %esi
@@ -29,8 +32,10 @@ define i128 @test1(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test1:
@@ -52,38 +57,44 @@ define i128 @test1(i128 %x) nounwind {
 define i128 @test2(i128 %x) nounwind {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shrl $30, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    shrdl $2, %edx, %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    adcl 28(%ebp), %edx
+; X86-NEXT:    adcl 32(%ebp), %ecx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    shrdl $2, %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    sarl $2, %edx
-; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    sarl $2, %eax
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %esi, %ebx
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test2:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 3f890b7f2443..901183242132 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -8,15 +8,21 @@
 define i128 @test1(i128 %x) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    shrdl $2, %edx, %ecx
 ; X86-NEXT:    shrl $2, %edx
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll
index 55c318e87a5a..bdceeefbcfab 100644
--- a/llvm/test/CodeGen/X86/iabs.ll
+++ b/llvm/test/CodeGen/X86/iabs.ll
@@ -123,31 +123,34 @@ define i64 @test_i64(i64 %a) nounwind {
 define i128 @test_i128(i128 %a) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_i128:
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index c52b3ed6c926..4a6c1d0ae5de 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -10,33 +10,39 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_lt_power_of_2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_1: # %loop
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    addl $1, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl $1, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    shrdl $28, %ebx, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shrdl $28, %ebx, %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %exit
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -73,15 +79,21 @@ exit:
 define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    orl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    shldl $15, %edx, %ecx
 ; X86-NEXT:    sete %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_eq_zero:
@@ -98,15 +110,21 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    orl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    shldl $15, %edx, %ecx
 ; X86-NEXT:    setne %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_ne_zero:
@@ -123,13 +141,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
 ; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl 8(%ebp), %eax
+; X86-NEXT:    orl 12(%ebp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero:
@@ -146,13 +170,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
 ; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl 8(%ebp), %eax
+; X86-NEXT:    orl 12(%ebp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_ne_zero:
@@ -170,13 +200,17 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
 ; X86-NEXT:    shldl $17, %edx, %esi
 ; X86-NEXT:    shldl $17, %ecx, %edx
 ; X86-NEXT:    shldl $17, %eax, %ecx
@@ -194,9 +228,11 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users:
diff --git a/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
new file mode 100644
index 000000000000..e272e7ee3cb0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
+#
+# Check that only the computed gotos are duplicated aggressively.
+#
+--- |
+  @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
+  declare i64 @f0()
+  declare i64 @f1()
+  declare i64 @f2()
+  declare i64 @f3()
+  declare i64 @f4()
+  declare i64 @f5()
+  define void @computed_goto() {
+    start:
+      ret void
+    bb1:
+      ret void
+    bb2:
+      ret void
+    bb3:
+      ret void
+    bb4:
+      ret void
+  }
+  define void @jump_table() { ret void }
+  define void @jump_table_pic() { ret void }
+...
+---
+name:            computed_goto
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: computed_goto
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  bb.0:
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %0:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %0
+    JMP_1 %bb.5
+
+  bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %1:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %1
+    JMP_1 %bb.5
+
+  bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %2:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %2
+    JMP_1 %bb.5
+
+  bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %3:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %3
+    JMP_1 %bb.5
+
+  bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %4:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %4
+
+  bb.5:
+    successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+
+    %5:gr64_nosp = COPY %6
+    JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg
+...
diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index fc1cc1f65627..e10e48f9aea0 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -18,85 +18,80 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    imull %edi, %eax
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 28(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl $4
   %k = mul i128 %t, %u
   ret i128 %k
diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll
index 961205c50d97..724b2dc4c431 100644
--- a/llvm/test/CodeGen/X86/neg-abs.ll
+++ b/llvm/test/CodeGen/X86/neg-abs.ll
@@ -105,31 +105,35 @@ define i128 @neg_abs_i128(i128 %x) nounwind {
 ; X86-LABEL: neg_abs_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    subl %ebx, %ebp
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    subl %edi, %ebx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebp, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -259,37 +263,42 @@ define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind {
 define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: sub_abs_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 40(%ebp), %edx
 ; X86-NEXT:    subl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: sub_abs_i128:
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 35c7c0e09f39..3004b8b72fcc 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -340,84 +340,87 @@ define i64 @cnt64(i64 %x) nounwind readnone {
 define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE-LABEL: cnt128:
 ; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %ebx
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %esi
-; X86-NOSSE-NEXT:    addl %edi, %esi
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %edx, %edi
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %ecx
+; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $4, %edi
 ; X86-NOSSE-NEXT:    addl %ecx, %edi
+; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    movl $0, 12(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 8(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 4(%eax)
+; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
-; X86-NOSSE-NEXT:    popl %ebx
+; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
 ; X64-BASE-LABEL: cnt128:
@@ -462,20 +465,26 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-POPCNT-LABEL: cnt128:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
 ; X86-POPCNT-NEXT:    movl $0, 12(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 8(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 4(%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128:
@@ -522,7 +531,11 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-SSE2-LABEL: cnt128:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -564,11 +577,17 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-NEXT:    movl $0, 12(%eax)
 ; X86-SSE2-NEXT:    movl $0, 8(%eax)
 ; X86-SSE2-NEXT:    movl $0, 4(%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -600,6 +619,8 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-NEXT:    movl $0, 12(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 8(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 4(%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
@@ -928,87 +949,92 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-LABEL: cnt128_optsize:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %ebx
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %edi, %ecx
-; X86-NOSSE-NEXT:    subl %ecx, %ebx
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    subl $16, %esp
+; X86-NOSSE-NEXT:    movl 32(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %eax, %esi
 ; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ecx, %ebp
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %esi, %edi
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    movl $1431655765, %eax # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %eax, %esi
+; X86-NOSSE-NEXT:    subl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl %ecx, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %edx, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %edx, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    andl %edx, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    movl 28(%ebp), %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    subl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    addl %ebp, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %ebx, %ebp
+; X86-NOSSE-NEXT:    addl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %ebx, %edi
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %ebx
 ; X86-NOSSE-NEXT:    subl %ebx, %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %eax
 ; X86-NOSSE-NEXT:    andl %ecx, %eax
 ; X86-NOSSE-NEXT:    addl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    movl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    movl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    andl %eax, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %eax # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ebp, %eax
-; X86-NOSSE-NEXT:    subl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %esi, %ebp
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %eax
-; X86-NOSSE-NEXT:    subl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %ecx, %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    xorl %ecx, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    xorl %edx, %edx
+; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    leal -12(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -1057,13 +1083,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-POPCNT-LABEL: cnt128_optsize:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
@@ -1071,7 +1101,9 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_optsize:
@@ -1118,7 +1150,11 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-SSE2-LABEL: cnt128_optsize:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1161,11 +1197,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128_optsize:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1198,6 +1240,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
@@ -1415,85 +1459,88 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 {
 define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-LABEL: cnt128_pgso:
 ; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %ebx
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %esi
-; X86-NOSSE-NEXT:    addl %edi, %esi
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %edx, %edi
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %ecx
+; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $4, %edi
 ; X86-NOSSE-NEXT:    addl %ecx, %edi
+; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
-; X86-NOSSE-NEXT:    popl %ebx
+; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
 ; X64-BASE-LABEL: cnt128_pgso:
@@ -1538,13 +1585,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-POPCNT-LABEL: cnt128_pgso:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
@@ -1552,7 +1603,9 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_pgso:
@@ -1599,7 +1652,11 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-SSE2-LABEL: cnt128_pgso:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1642,11 +1699,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128_pgso:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1679,6 +1742,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
diff --git a/llvm/test/CodeGen/X86/pr154492.ll b/llvm/test/CodeGen/X86/pr154492.ll
new file mode 100644
index 000000000000..1ba17594976e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr154492.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f  | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+define <16 x i32> @PR154492() {
+; AVX512F-LABEL: PR154492:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: PR154492:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvttps2udq %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> zeroinitializer, <16 x i32> zeroinitializer, i16 255, i32 4)
+  ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll
index f7c7da089c36..829d6dfceba3 100644
--- a/llvm/test/CodeGen/X86/pr46004.ll
+++ b/llvm/test/CodeGen/X86/pr46004.ll
@@ -6,7 +6,17 @@
 define void @fuzz22357(i128 %a0) {
 ; X86-LABEL: fuzz22357:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fuzz22357:
@@ -24,6 +34,15 @@ define void @fuzz22357(i128 %a0) {
 define void @fuzz22723(i128 %a0) {
 ; X86-LABEL: fuzz22723:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fuzz22723:
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
index 50a967e1c2a1..ce9723b3a84b 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
@@ -762,11 +762,15 @@ define i32 @x_to_s32(x86_fp80 %a) nounwind {
 define i32 @t_to_u32(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_u32:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixunstfsi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_u32:
@@ -797,12 +801,18 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_u32:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u32:
@@ -835,12 +845,18 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_u32:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixunstfsi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u32:
@@ -860,11 +876,15 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 define i32 @t_to_s32(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_s32:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixtfsi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_s32:
@@ -895,12 +915,18 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_s32:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixtfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s32:
@@ -933,12 +959,18 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_s32:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixtfsi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s32:
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index f516db8b30ff..3287869f2c60 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -1417,11 +1417,15 @@ define i64 @x_to_s64(x86_fp80 %a) nounwind {
 define i64 @t_to_u64(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_u64:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixunstfdi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_u64:
@@ -1452,12 +1456,18 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_u64:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u64:
@@ -1490,12 +1500,18 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_u64:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixunstfdi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u64:
@@ -1515,11 +1531,15 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 define i64 @t_to_s64(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_s64:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixtfdi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_s64:
@@ -1550,12 +1570,18 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_s64:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixtfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s64:
@@ -1588,12 +1614,18 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_s64:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixtfdi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s64:
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 874913629e9e..8a287229a1cb 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -118,30 +118,33 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: scmp.8.128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    cmpl %ecx, 8(%ebp)
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    cmpl 8(%ebp), %esi
+; X86-NEXT:    sbbl 12(%ebp), %eax
+; X86-NEXT:    sbbl 16(%ebp), %edi
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 4925f8bc6c8b..392bc83d9d5d 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -307,69 +307,70 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    subl $112, %esp
 ; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    shldl $31, %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $31, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    shldl $31, %edi, %esi
+; X86-NEXT:    shldl $31, %ecx, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    sets %al
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %cl
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $1, %esi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    sets %bl
+; X86-NEXT:    xorb %al, %bl
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    testb %bl, %al
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab617..7df490f98492 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -370,67 +370,68 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $96, %esp
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $31, %eax, %edi
-; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    subl $128, %esp
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll $31, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    shldl $31, %edi, %ebx
+; X86-NEXT:    shldl $31, %esi, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    shll $31, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %esi
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $1, %edi
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    sets %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl $0, %ebx
 ; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    sets %dl
-; X86-NEXT:    xorb %al, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -438,41 +439,38 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovel %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmpl $-1, %edi
 ; X86-NEXT:    sbbl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %edi
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    cmovgel %ecx, %ebx
+; X86-NEXT:    cmovgel %ecx, %eax
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovgel %ecx, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    movl $-2147483648, %edi # imm = 0x80000000
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    cmovgel %ecx, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovgel %eax, %esi
+; X86-NEXT:    cmovgel %eax, %edi
 ; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT:    cmovgel %eax, %edx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -805,137 +803,155 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $208, %esp
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    subl $240, %esp
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    movl 20(%ebp), %edi
 ; X86-NEXT:    movl 16(%ebp), %ebx
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NEXT:    shrl $31, %ebx
 ; X86-NEXT:    shldl $31, %eax, %ebx
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal (%ecx,%ecx), %edx
-; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 36(%ebp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%edi,%edi), %eax
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    leal (%ecx,%ecx), %eax
-; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%esi,%esi), %eax
+; X86-NEXT:    shrl $31, %esi
+; X86-NEXT:    shldl $31, %eax, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal (%ecx,%ecx), %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%ecx,%ecx), %eax
 ; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    shldl $31, %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
@@ -949,18 +965,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %bl
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    setne %bl
-; X86-NEXT:    testb %bh, %bl
+; X86-NEXT:    setne %bh
+; X86-NEXT:    testb %bl, %bh
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -1107,36 +1123,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    sets %ah
-; X86-NEXT:    xorb %al, %ah
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1144,38 +1148,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    cmpl $-1, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovgel %eax, %esi
-; X86-NEXT:    cmovgel %eax, %ecx
 ; X86-NEXT:    cmovgel %eax, %edi
+; X86-NEXT:    cmovgel %eax, %ecx
+; X86-NEXT:    cmovgel %eax, %esi
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovgel %edx, %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    cmovgel %eax, %ebx
-; X86-NEXT:    cmovgel %edx, %edi
-; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    shldl $31, %ebx, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index d2b292f1a799..2ac2be5545df 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %ymm5
-; CHECK-AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
 ; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 76cb4e87bae1..dfeef48897e0 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -792,14 +792,24 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
 define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) {
 ; X86-LABEL: combineShiftOfShiftedLogic:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %eax, 20(%ecx)
 ; X86-NEXT:    movl $0, 16(%ecx)
 ; X86-NEXT:    movl $0, 12(%ecx)
 ; X86-NEXT:    movl $0, 8(%ecx)
 ; X86-NEXT:    movl $0, 4(%ecx)
 ; X86-NEXT:    movl $0, (%ecx)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combineShiftOfShiftedLogic:
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 767bd772ab7a..9323cd5b1917 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -212,9 +212,18 @@ entry:
 }
 
 define void @test_lshr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_lshr_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_lshr_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_lshr_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = lshr i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -222,9 +231,18 @@ entry:
 }
 
 define void @test_ashr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_ashr_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_ashr_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_ashr_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = ashr i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -232,9 +250,18 @@ entry:
 }
 
 define void @test_shl_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_shl_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_shl_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_shl_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = shl i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -874,26 +901,31 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) {
 define i128 @lshr_shl_mask(i128 %a0) {
 ; i686-LABEL: lshr_shl_mask:
 ; i686:       # %bb.0:
-; i686-NEXT:    pushl %edi
+; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    .cfi_def_cfa_offset 8
+; i686-NEXT:    .cfi_offset %ebp, -8
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    .cfi_def_cfa_register %ebp
+; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 12
-; i686-NEXT:    .cfi_offset %esi, -12
-; i686-NEXT:    .cfi_offset %edi, -8
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    .cfi_offset %esi, -16
+; i686-NEXT:    .cfi_offset %edi, -12
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 28(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %esi
 ; i686-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
-; i686-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    andl 36(%ebp), %edi
 ; i686-NEXT:    movl %edi, 12(%eax)
 ; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl %edx, 4(%eax)
 ; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 8
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    .cfi_def_cfa_offset 4
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    .cfi_def_cfa %esp, 4
 ; i686-NEXT:    retl $4
 ;
 ; x86_64-LABEL: lshr_shl_mask:
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 86891e964d96..509d4443e930 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -151,31 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %ebx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    cmovll 24(%ebp), %ebx
+; X86-NEXT:    cmovll 28(%ebp), %edi
+; X86-NEXT:    cmovll 32(%ebp), %edx
+; X86-NEXT:    cmovll %esi, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -717,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmovll %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index 8907f6c4cd59..5e9fe27b41d2 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -151,32 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    cmpl %ecx, 24(%ebp)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovll %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovll 24(%ebp), %ecx
+; X86-NEXT:    cmovll 28(%ebp), %edx
+; X86-NEXT:    cmovll 32(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -718,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmovll %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 6a52acfe2fb3..7f17299b39e3 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -107,29 +107,33 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: ucmp.8.128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    cmpl %eax, 24(%ebp)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 5b1e0545502b..82dfeeee1329 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -153,26 +153,28 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %eax
-; X86-NEXT:    movl %esp, %esi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 30a7f80b2315..3da5973f9f90 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -194,32 +194,34 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %eax
-; X86-NEXT:    movl %esp, %esi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:  .LBB4_2:
 ; X86-NEXT:    leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index f589d4a7b04a..7ef859978cdb 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -232,31 +232,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %ebx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    cmovbl 24(%ebp), %ebx
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    cmovbl 32(%ebp), %edx
+; X86-NEXT:    cmovbl %esi, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -282,37 +285,40 @@ define i128 @test_i128_1(i128 %a) nounwind {
 ; X86-LABEL: test_i128_1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    cmpl $1, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    cmovnel %eax, %edi
-; X86-NEXT:    cmovel %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl $1, %ebp
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cmovbl %edx, %ebx
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    cmovel %edi, %ebp
-; X86-NEXT:    cmovel %edx, %ebx
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    cmpl $0, 28(%ebp)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    cmovel %ecx, %esi
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl $1, %ebx
+; X86-NEXT:    cmovbl %eax, %ebx
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    cmovel %esi, %ebx
+; X86-NEXT:    cmovel 28(%ebp), %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1312,29 +1318,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    cmovbl %esi, %ecx
-; X86-NEXT:    cmovbl %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 7a5cdbb9ce75..c927abf3a426 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -147,32 +147,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    cmpl %ecx, 24(%ebp)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovbl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovbl 24(%ebp), %ecx
+; X86-NEXT:    cmovbl 28(%ebp), %edx
+; X86-NEXT:    cmovbl 32(%ebp), %esi
+; X86-NEXT:    cmovbl %edi, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -727,29 +729,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    cmovbl %esi, %ecx
-; X86-NEXT:    cmovbl %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4c3170304b98..89afd1b00444 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $24, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 44
+; X86-NEXT:    subl $28, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 48
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    movb %al, 16(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 087cd30abee9..9bd38db3e9c4 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -104,6 +104,72 @@ entry:
 }
 declare <2 x double> @foo()
 
+define i64 @pr150117(<31 x i8> %a0) nounwind {
+; X86-LABEL: pr150117:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $8, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    shll $8, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shll $24, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movd %esi, %xmm0
+; X86-NEXT:    pinsrw $2, %edx, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    pinsrw $3, %ecx, %xmm0
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movd %xmm0, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: pr150117:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r8d
+; X64-NEXT:    shll $8, %r8d
+; X64-NEXT:    orl %edi, %r8d
+; X64-NEXT:    shll $8, %esi
+; X64-NEXT:    orl %edx, %esi
+; X64-NEXT:    shll $16, %ecx
+; X64-NEXT:    orl %esi, %ecx
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    shll $24, %edx
+; X64-NEXT:    orl %ecx, %edx
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    pinsrw $2, %r8d, %xmm0
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    retq
+  %shuffle = shufflevector <31 x i8> %a0, <31 x i8> zeroinitializer, <32 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bitcast = bitcast <32 x i8> %shuffle to <4 x i64>
+  %elt = extractelement <4 x i64> %bitcast, i64 0
+  ret i64 %elt
+}
+
 ; OSS-Fuzz #15662
 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15662
 define <4 x i32> @ossfuzz15662(ptr %in) {
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135ddcec9..ef20cf2a09bb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1724,6 +1724,269 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
   ret void
 }
 
+define <512 x i8> @PR153457(<512 x i8> %a0, <512 x i8> %a1) nounwind {
+; AVX512F-LABEL: PR153457:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-64, %rsp
+; AVX512F-NEXT:    subq $64, %rsp
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512F-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512F-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512F-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512F-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: PR153457:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $64, %rsp
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm8
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm8
+; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm10
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm5, %xmm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512BW-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpslld $24, %xmm0, %xmm9
+; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX512BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vporq %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm8
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT:    vpsrld $24, %xmm7, %xmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $3, %xmm7, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,1,2,0]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpbroadcastb 16(%rbp), %ymm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 384(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: PR153457:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:    andq $-64, %rsp
+; AVX512DQ-NEXT:    subq $64, %rsp
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512DQ-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512DQ-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: PR153457:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-64, %rsp
+; AVX512VBMI-NEXT:    subq $64, %rsp
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm5, %zmm8
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm4, %zmm5
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0]
+; AVX512VBMI-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm3, %zmm4
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67,0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67]
+; AVX512VBMI-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm2, %zmm3
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,71]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm6, %zmm2
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,0,64,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm0, %zmm6
+; AVX512VBMI-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm3, %zmm0
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [67,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,68,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm4, %zmm3
+; AVX512VBMI-NEXT:    vinserti32x4 $3, %xmm7, %zmm5, %zmm4
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm8, %zmm5
+; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,65,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm1, %zmm8
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, 128(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm8, 64(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm6, (%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm2, 384(%rdi)
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %shuffle1 = shufflevector <512 x i8> %a0, <512 x i8> zeroinitializer, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 
187, i32 188, i32 189, i32 190, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle2 = shufflevector <512 x i8> %shuffle1, <512 x i8> %a1, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 512, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 513, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 514, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 
188, i32 189, i32 190, i32 191, i32 515, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 516, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 517, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 518, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 519, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <512 x i8> %shuffle2
+}
+
 define <64 x i8> @shuffle_v32i16_zextinreg_to_v16i32(<64 x i8> %a)  {
 ; ALL-LABEL: shuffle_v32i16_zextinreg_to_v16i32:
 ; ALL:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
index a15d633d8538..12dccca76eb1 100644
--- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
@@ -92,6 +92,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %esi, -8
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -101,15 +103,15 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    jge .LBB4_2
+; CHECK-NEXT:    jge .LBB4_3
 ; CHECK-NEXT:  # %bb.1: # %bb1
 ; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-NEXT:    retl
-; CHECK-NEXT:  .LBB4_2: # %bb2
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    jmp .LBB4_2
+; CHECK-NEXT:  .LBB4_3: # %bb2
 ; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:  .LBB4_2: # %bb1
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
new file mode 100644
index 000000000000..5ac90a0af2e5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck -check-prefix=CHECK64 %s
+
+define i64 @test_sdiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_sdiv_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __alldiv
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_sdiv_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    cqto
+; CHECK64-NEXT:    idivq %r8
+; CHECK64-NEXT:    retq
+  %ret = sdiv i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_srem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_srem_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __allrem
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_srem_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    cqto
+; CHECK64-NEXT:    idivq %r8
+; CHECK64-NEXT:    movq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = srem i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_udiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_udiv_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __aulldiv
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_udiv_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    xorl %edx, %edx
+; CHECK64-NEXT:    divq %r8
+; CHECK64-NEXT:    retq
+  %ret = udiv i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_urem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_urem_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __aullrem
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_urem_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    xorl %edx, %edx
+; CHECK64-NEXT:    divq %r8
+; CHECK64-NEXT:    movq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = urem i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_mul_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_mul_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:    mull %esi
+; CHECK32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    addl %ecx, %edx
+; CHECK32-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    addl %esi, %edx
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_mul_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    imulq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = mul i64 %a, %b
+  ret i64 %ret
+}
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
index 0f8f505c51a5..5d73b2669ccd 100644
--- a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
@@ -7,6 +7,8 @@
 
 define dso_local void @f() !dbg !10 {
 entry:
+; Include non-key location to check verifier is checking the whole function.
+  %0 = add i32 0, 0, !dbg !14
   ret void, !dbg !13
 }
 
@@ -20,3 +22,4 @@ entry:
 !11 = !DISubroutineType(types: !12)
 !12 = !{null}
 !13 = !DILocation(line: 1, column: 11, scope: !10, atomGroup: 1, atomRank: 1)
+!14 = !DILocation(line: 1, column: 11, scope: !10)
diff --git a/llvm/test/MC/Hexagon/system-inst.s b/llvm/test/MC/Hexagon/system-inst.s
index 7bc153359853..07f7ca0acb2d 100644
--- a/llvm/test/MC/Hexagon/system-inst.s
+++ b/llvm/test/MC/Hexagon/system-inst.s
@@ -89,6 +89,9 @@ crswap(r12,sgp0)
 #CHECK: 652dc000 { crswap(r13,sgp1) }
 crswap(r13,sgp1)
 
+#CHECK: 6d8ec000 { crswap(r15:14,s1:0) }
+crswap(r15:14,sgp1:0)
+
 #CHECK: 660fc00e { r14 = getimask(r15) }
 r14=getimask(r15)
 
diff --git a/llvm/test/MC/Hexagon/two_ext.s b/llvm/test/MC/Hexagon/two_ext.s
index 28b2aa3f1eca..09b51c5f029a 100644
--- a/llvm/test/MC/Hexagon/two_ext.s
+++ b/llvm/test/MC/Hexagon/two_ext.s
@@ -6,7 +6,7 @@
   if (!p1) call foo_b
 }
 # CHECK: 00004000 { immext(#0)
-# CHECK: 5d004100   if (p1) call 0x0
+# CHECK: 5d004100   if (p1) call 0x0 <.text>
 # CHECK: 00004000   immext(#0)
-# CHECK: 5d20c100   if (!p1) call 0x0 }
+# CHECK: 5d20c100   if (!p1) call 0x0 <.text> }
 
diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s
index 24f3e67ebbdd..c646e6f54b26 100644
--- a/llvm/test/MC/RISCV/Relocations/mc-dump.s
+++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s
@@ -2,12 +2,12 @@
 # RUN: llvm-mc -filetype=obj --triple=riscv64 --mattr=+relax %s -debug-only=mc-dump -o /dev/null 2>&1 | FileCheck %s
 
 #      CHECK:Sections:[
-# CHECK-NEXT:MCSection Name:.text
+# CHECK-NEXT:MCSection Name:.text LinkerRelaxable
 # CHECK-NEXT:0 Data Size:0 []
 # CHECK-NEXT:  Symbol @0 .text
 # CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
 # CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00]
-# CHECK-NEXT:  Fixup @0 Value:specifier(19,ext) Kind:4023
+# CHECK-NEXT:  Fixup @0 Value:specifier(19,ext) Kind:4023 LinkerRelaxable
 # CHECK-NEXT:  Symbol @0 $x
 # CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
 # CHECK-NEXT:12 Data Size:4 [13,05,30,00]
diff --git a/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s b/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s
new file mode 100644
index 000000000000..b12585265bf1
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s
@@ -0,0 +1,84 @@
+# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcilb,+experimental-xqcibi \
+# RUN:    %s -filetype=obj -o - -riscv-add-build-attributes \
+# RUN:    | llvm-objdump -dr -M no-aliases - \
+# RUN:    | FileCheck %s
+
+## This tests that we correctly emit relocations for linker relaxation when
+## relaxing `JAL` to `QC.E.JAL`.
+
+## PR150071
+
+.global foo
+
+# CHECK-LABEL: <branch_over_relaxable>:
+branch_over_relaxable:
+  jal x1, foo
+# CHECK: qc.e.jal 0x0 <branch_over_relaxable>
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM
+# CHECK-NEXT: R_RISCV_CUSTOM195 foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+  bne a0, a1, branch_over_relaxable
+# CHECK-NEXT: bne a0, a1, 0x6 <branch_over_relaxable+0x6>
+# CHECK-NEXT: R_RISCV_BRANCH branch_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  qc.e.bnei a0, 0x21, branch_over_relaxable
+# CHECK-NEXT: qc.e.bnei a0, 0x21, 0xa <branch_over_relaxable+0xa>
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM
+# CHECK-NEXT: R_RISCV_CUSTOM193 branch_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <short_jump_over_fixed>:
+short_jump_over_fixed:
+  nop
+# CHECK: c.nop
+  j short_jump_over_fixed
+# CHECK-NEXT: c.j 0x14 <short_jump_over_fixed+0x2>
+# CHECK-NEXT: R_RISCV_RVC_JUMP short_jump_over_fixed
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <short_jump_over_relaxable>:
+short_jump_over_relaxable:
+  call foo
+# CHECK: auipc ra, 0x0
+# CHECK-NEXT: R_RISCV_CALL_PLT foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+# CHECK-NEXT: jalr ra, 0x0(ra) <short_jump_over_relaxable>
+  j short_jump_over_relaxable
+# CHECK-NEXT: c.j 0x20 <short_jump_over_relaxable+0x8>
+# CHECK-NEXT: R_RISCV_RVC_JUMP short_jump_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <mid_jump_over_fixed>:
+mid_jump_over_fixed:
+  nop
+# CHECK: c.nop
+  .space 0x1000
+# CHECK-NEXT: ...
+  j mid_jump_over_fixed
+# CHECK-NEXT: jal zero, 0x1026 <mid_jump_over_fixed+0x1002>
+# CHECK-NEXT: R_RISCV_JAL mid_jump_over_fixed
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <mid_jump_over_relaxable>:
+mid_jump_over_relaxable:
+  call foo
+# CHECK: auipc ra, 0x0
+# CHECK-NEXT: R_RISCV_CALL_PLT foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+# CHECK-NEXT: jalr ra, 0x0(ra) <mid_jump_over_relaxable>
+  .space 0x1000
+# CHECK-NEXT: ...
+  j mid_jump_over_relaxable
+# CHECK-NEXT: jal zero, 0x2034 <mid_jump_over_relaxable+0x1008>
+# CHECK-NEXT: R_RISCV_JAL mid_jump_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
diff --git a/llvm/test/MC/X86/intel-syntax-parentheses.s b/llvm/test/MC/X86/intel-syntax-parentheses.s
new file mode 100644
index 000000000000..ae53f6408907
--- /dev/null
+++ b/llvm/test/MC/X86/intel-syntax-parentheses.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2>&1 | FileCheck %s
+
+.intel_syntax
+
+// CHECK: error: invalid base+index expression
+    lea     rdi, [(label + rsi) + rip]
+// CHECK: leaq    1(%rax,%rdi), %rdi
+    lea     rdi, [(rax + rdi) + 1]
+label:
+    .quad 42
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 7abc32e4f1cd..f53127f01539 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -1065,3 +1065,37 @@ for.body:
   %exitcond.not = icmp eq i32 %inc, %N
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+define i64 @pr150611_add_offset_is_not_loop_invariant(i1 %cond) {
+; CHECK-LABEL: define i64 @pr150611_add_offset_is_not_loop_invariant(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[REMAMT:%.*]] = select i1 [[COND]], i64 2, i64 0
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_OFFSET:%.*]] = zext i1 [[COND]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[INDVARS]], [[ADD_OFFSET]]
+; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[ADD]], [[REMAMT]]
+; CHECK-NEXT:    [[INDVARS_NEXT]] = add nuw i64 [[INDVARS]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_EXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_EXIT]]:
+; CHECK-NEXT:    ret i64 [[REM]]
+;
+entry:
+  %remamt = select i1 %cond, i64 2, i64 0
+  br label %for.body
+
+for.body:
+  %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ]
+  %add.offset = zext i1 %cond to i64
+  %add = add nuw i64 %indvars, %add.offset
+  %rem = urem i64 %add, %remamt
+  %indvars.next = add nuw i64 %indvars, 1
+  %exitcond = icmp eq i64 %indvars.next, 3
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret i64 %rem
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
index e390d4bdca63..303afc207c02 100644
--- a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
@@ -12,6 +12,6 @@ define ptr @undeclared_customalloc(i64 %size, i64 %align) {
   ret ptr %call
 }
 
-declare ptr @customalloc2(i64, i64) allockind("alloc") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
+declare ptr @customalloc2(i64, i64) allockind("alloc,uninitialized") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
 ; CHECK-DAG: declare ptr @customalloc2_zeroed(i64, i64) #[[CA2ATTR:[0-9]+]]
 ; CHECK-DAG: attributes #[[CA2ATTR]] = { allockind("alloc,zeroed") "alloc-family"="customalloc2" }
diff --git a/llvm/test/Transforms/GVN/cond_br.ll b/llvm/test/Transforms/GVN/cond_br.ll
index 19166d17a832..fb84b626c745 100644
--- a/llvm/test/Transforms/GVN/cond_br.ll
+++ b/llvm/test/Transforms/GVN/cond_br.ll
@@ -53,3 +53,22 @@ if.end:                                           ; preds = %if.else, %if.then
 }
 
 declare void @bar(i32)
+
+define void @indirectbr_could_not_split() {
+; CHECK-LABEL: define void @indirectbr_could_not_split() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[IBR:.*]], label %[[EXIT:.*]]
+; CHECK:       [[IBR]]:
+; CHECK-NEXT:    indirectbr ptr null, [label %[[EXIT]], label %exit]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 false, label %ibr, label %exit
+
+ibr:
+  indirectbr ptr null, [label %exit, label %exit]
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
index b5c4f42655bb..71b463b2d2b0 100644
--- a/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
+++ b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
@@ -12,28 +12,28 @@ entry:
 
 declare i32 @baz(...) #0
 
-define i32 @bar() #1 {
+define i32 @features_subset() #1 {
 entry:
   %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: bar
-; CHECK: call i32 @foo()
+; CHECK-LABEL: features_subset
+; CHECK: call i32 (...) @baz()
 }
 
-define i32 @qux() #0 {
+define i32 @features_equal() #0 {
 entry:
   %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: qux
+; CHECK-LABEL: features_equal
 ; CHECK: call i32 (...) @baz()
 }
 
-define i32 @quux() #2 {
+define i32 @features_different() #2 {
 entry:
-  %call = call i32 @bar()
+  %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: quux
-; CHECK: call i32 @bar()
+; CHECK-LABEL: features_different
+; CHECK: call i32 @foo()
 }
 
 
diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index ab4448b460bf..820fff433e9e 100644
--- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -213,7 +213,7 @@ define double @fmul_nnan_ninf_nneg_n0.0_commute(i127 %x) {
 
 define float @fmul_ninf_nnan_mul_zero_nsz(float nofpclass(inf nan) %f) {
 ; CHECK-LABEL: @fmul_ninf_nnan_mul_zero_nsz(
-; CHECK-NEXT:     ret float 0.000000e+00
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %r = fmul nsz float %f, 0.0
   ret float %r
@@ -221,7 +221,7 @@ define float @fmul_ninf_nnan_mul_zero_nsz(float nofpclass(inf nan) %f) {
 
 define float @fmul_ninf_nnan_mul_nzero_nsz(float nofpclass(inf nan) %f) {
 ; CHECK-LABEL: @fmul_ninf_nnan_mul_nzero_nsz(
-; CHECK-NEXT:     ret float 0.000000e+00
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %r = fmul nsz float %f, -0.0
   ret float %r
@@ -1255,3 +1255,20 @@ define i1 @fptrunc_round_unknown_positive(double %unknown) {
   %cmp = fcmp nnan oge float %op, 0.0
   ret i1 %cmp
 }
+
+define half @fabs_select_fabs(half noundef %x) {
+; CHECK-LABEL: @fabs_select_fabs(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS1:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt half [[ABS1]], 0xH0000
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], half [[X]], half 0xH0000
+; CHECK-NEXT:    [[ABS2:%.*]] = call half @llvm.fabs.f16(half [[SEL]])
+; CHECK-NEXT:    ret half [[ABS2]]
+;
+entry:
+  %abs1 = call half @llvm.fabs.f16(half %x)
+  %cmp = fcmp ogt half %abs1, 0xH0000
+  %sel = select i1 %cmp, half %x, half 0xH0000
+  %abs2 = call half @llvm.fabs.f16(half %sel)
+  ret half %abs2
+}
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
new file mode 100644
index 000000000000..dd524ab7d140
--- /dev/null
+++ b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+
+define void @test(ptr %addr) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[ADDR:%.*]]) {
+; CHECK-NEXT:    indirectbr ptr [[ADDR]], [label %[[A:.*]], label %C]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br i1 true, label %[[B:.*]], label %[[C_LOOPEXIT:.*]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    br i1 true, label %[[A]], label %[[C_LOOPEXIT]]
+; CHECK:       [[C_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[C:.*]]
+; CHECK:       [[C]]:
+; CHECK-NEXT:    unreachable
+;
+
+  indirectbr ptr %addr, [label %A, label %C]
+
+A:
+  br i1 true, label %B, label %C
+
+B:
+  br i1 true, label %A, label %C
+
+C:
+  unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 451574a258c2..427a05cc1c84 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index e93ee5563b05..1a8e5940d88e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fminnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index b2e080fef2e5..a2eddad17921 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index 5661406b88a5..1ca5586942d7 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fmaxnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,234 @@ define float @fmaxnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_10(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -10
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 10, [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 10, [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 10, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop ]  ; induction starts at the constant 10, not 0; the assertions above expect the scalar resume value to be rebased as 10 + vector index
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]  ; running maximum, seeded with -1.0e7
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv  ; address of element %iv of %src
+  %l = load float, ptr %gep.src, align 4  ; the assertions above expect the widened <4 x float> load to be tested for NaNs with fcmp uno
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)  ; reduction step; the call carries no fast-math flags and takes the loaded value as first operand
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n  ; leave the loop once %iv.next equals %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next  ; result is the final value of the maxnum reduction
+}
+
+define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_value(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[START:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[START]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[START]], [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[START]], [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop ]  ; induction starts at the runtime argument %start; the assertions above expect the scalar resume value to be rebased as %start + vector index
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]  ; running maximum, seeded with -1.0e7
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv  ; address of element %iv of %src
+  %l = load float, ptr %gep.src, align 4  ; the assertions above expect the widened <4 x float> load to be tested for NaNs with fcmp uno
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)  ; reduction step; the call carries no fast-math flags and takes the loaded value as first operand
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n  ; leave the loop once %iv.next equals %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next  ; result is the final value of the maxnum reduction
+}
+
+define float @fmaxnum_with_additional_add(ptr noalias %src, ptr noalias %src.2, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_with_additional_add(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC_2]], i64 [[IV]]
+; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT:    [[SUM_NEXT]] = add i32 [[SUM]], [[L_SRC_2]]
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    store i32 [[SUM_NEXT_LCSSA]], ptr [[SRC_2]], align 4
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -247,14 +496,19 @@ entry:
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
+  %gep.src.2 = getelementptr inbounds nuw i32, ptr %src.2, i64 %iv
+  %l.src.2 = load i32, ptr %gep.src.2, align 4
+  %sum.next = add i32 %sum, %l.src.2
   %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
   %l = load float, ptr %gep.src, align 4
-  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
   %iv.next = add nuw nsw i64 %iv, 1
   %ec = icmp eq i64 %iv.next, %n
   br i1 %ec, label %exit, label %loop
 
 exit:
+  store i32 %sum.next, ptr %src.2
   ret float %max.next
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
index 148beb64a360..68bc8d0640a3 100644
--- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fminnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,51 @@ define float @fminnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
index 85a90f2e04c5..e7ab02cd98a5 100644
--- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -1001,8 +1001,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmin_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.minnum.v2f32
+; CHECK: <2 x float> @llvm.minnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmin_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
@@ -1021,8 +1023,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmax_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.maxnum.v2f32
+; CHECK: <2 x float> @llvm.maxnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmax_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll
new file mode 100644
index 000000000000..3a1d1324d8e4
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s
+target triple = "aarch64"
+
+define <vscale x 4 x float> @scalable_vec_log(<vscale x 4 x float> %input) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_vec_log(
+; CHECK-SAME: <vscale x 4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <vscale x 4 x float> [ [[INPUT]], [[TMP0]] ], [ [[TMP8:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 4 x float> [[TMP5]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.log.f32(float [[TMP6]])
+; CHECK-NEXT:    [[TMP8]] = insertelement <vscale x 4 x float> [[TMP5]], float [[TMP7]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB3]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP8]]
+;
+  %output = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %input)
+  ret <vscale x 4 x float> %output
+}
+
+define <4 x float> @fixed_vec_log(<4 x float> %input) {
+; CHECK-LABEL: define <4 x float> @fixed_vec_log(
+; CHECK-SAME: <4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[OUTPUT:%.*]] = call <4 x float> @llvm.log.v4f32(<4 x float> [[INPUT]])
+; CHECK-NEXT:    ret <4 x float> [[OUTPUT]]
+;
+  %output = call <4 x float> @llvm.log.v4f32(<4 x float> %input)
+  ret <4 x float> %output
+}
+
+declare <4 x float> @llvm.log.v4f32(<4 x float>) #0
+declare <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 5e3fd156666f..410696260a85 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -16,12 +16,11 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
index 2b5ee59aeb16..96dd691c4816 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
@@ -5,24 +5,21 @@ define i16 @foo(i16 %in1, i16 %in2) {
 ; CHECK-LABEL: define i16 @foo(
 ; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535)
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533)
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[TMP10]], 65535
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[AND1]], 65533
 ; CHECK-NEXT:    [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
 ; CHECK-NEXT:    [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[TMP13]], 65535
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533
 ; CHECK-NEXT:    [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605
 ; CHECK-NEXT:    [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
 ; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
similarity index 89%
rename from llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
index a1f4590a5691..04c69106d97f 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define void @test() {
 ; CHECK-LABEL: define void @test() {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 667fc41c069e..10a17f7e3f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -645,20 +645,21 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo)
 ; CHECK-NEXT:    [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
 ; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
 ; CHECK-NEXT:    [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
 ; CHECK-NEXT:    [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
 ; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0
+; CHECK-NEXT:    [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8
 ; CHECK-NEXT:    [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
 ; CHECK-NEXT:    [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
 ; CHECK-NEXT:    store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O3]], 0
+; CHECK-NEXT:    [[O2:%.*]] = or i8 [[E4]], [[E3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i8 [[O3]], [[O2]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O4]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %l = load <2 x i64>, ptr %values, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 8d7d31ab9844..f397290299a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -7,54 +7,72 @@ define void @h(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONV9:%.*]] = zext i16 [[A]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[E]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[I]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[M]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = or i32 [[CONV9]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[G]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[K]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[O]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[C]], i32 3
+; CHECK-NEXT:    [[CONV310:%.*]] = zext i16 [[B]] to i32
+; CHECK-NEXT:    [[ADD4:%.*]] = or i32 [[CONV310]], [[CONV9]]
+; CHECK-NEXT:    [[SUB:%.*]] = or i32 [[CONV9]], [[CONV310]]
+; CHECK-NEXT:    [[CONV15:%.*]] = sext i16 [[C]] to i32
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 0, 0
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24
 ; CHECK-NEXT:    [[CONV19:%.*]] = sext i16 [[D]] to i32
 ; CHECK-NEXT:    [[SUB20:%.*]] = or i32 [[SHR]], [[CONV19]]
+; CHECK-NEXT:    [[SHR29:%.*]] = ashr i32 0, 0
+; CHECK-NEXT:    [[ADD30:%.*]] = or i32 [[SHR29]], [[CONV15]]
 ; CHECK-NEXT:    [[SUB39:%.*]] = or i32 [[SUB]], [[SUB20]]
 ; CHECK-NEXT:    [[CONV40:%.*]] = trunc i32 [[SUB39]] to i16
 ; CHECK-NEXT:    store i16 [[CONV40]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[SUB44:%.*]] = or i32 [[ADD4]], [[ADD30]]
+; CHECK-NEXT:    [[CONV45:%.*]] = trunc i32 [[SUB44]] to i16
+; CHECK-NEXT:    store i16 [[CONV45]], ptr [[ARRAYIDX18]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr i8, ptr null, i64 18
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD4_1:%.*]] = or i32 [[TMP11]], 0
+; CHECK-NEXT:    [[CONV3_112:%.*]] = zext i16 [[E]] to i32
+; CHECK-NEXT:    [[ADD4_1:%.*]] = or i32 [[CONV3_112]], 0
+; CHECK-NEXT:    [[SUB_1:%.*]] = or i32 0, [[CONV3_112]]
 ; CHECK-NEXT:    [[CONV15_1:%.*]] = sext i16 [[F]] to i32
+; CHECK-NEXT:    [[SHR_1:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_1:%.*]] = getelementptr i8, ptr null, i64 26
+; CHECK-NEXT:    [[CONV19_1:%.*]] = sext i16 [[G]] to i32
+; CHECK-NEXT:    [[SUB20_1:%.*]] = or i32 [[SHR_1]], [[CONV19_1]]
 ; CHECK-NEXT:    [[SHR29_1:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_1:%.*]] = or i32 [[SHR29_1]], [[CONV15_1]]
+; CHECK-NEXT:    [[SUB39_1:%.*]] = or i32 [[SUB_1]], [[SUB20_1]]
+; CHECK-NEXT:    [[CONV40_1:%.*]] = trunc i32 [[SUB39_1]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_1]], ptr [[ARRAYIDX2_1]], align 2
 ; CHECK-NEXT:    [[SUB44_1:%.*]] = or i32 [[ADD4_1]], [[ADD30_1]]
 ; CHECK-NEXT:    [[CONV45_1:%.*]] = trunc i32 [[SUB44_1]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_1]], ptr [[ARRAYIDX18_1]], align 2
 ; CHECK-NEXT:    [[CONV_213:%.*]] = zext i16 [[H]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr null, i64 20
+; CHECK-NEXT:    [[CONV3_214:%.*]] = zext i16 [[I]] to i32
 ; CHECK-NEXT:    [[ADD4_2:%.*]] = or i32 0, [[CONV_213]]
+; CHECK-NEXT:    [[SUB_2:%.*]] = or i32 0, [[CONV3_214]]
 ; CHECK-NEXT:    [[CONV15_2:%.*]] = sext i16 [[J]] to i32
+; CHECK-NEXT:    [[SHR_2:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_2:%.*]] = getelementptr i8, ptr null, i64 28
+; CHECK-NEXT:    [[CONV19_2:%.*]] = sext i16 [[K]] to i32
+; CHECK-NEXT:    [[SUB20_2:%.*]] = or i32 [[SHR_2]], [[CONV19_2]]
 ; CHECK-NEXT:    [[SHR29_2:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_2:%.*]] = or i32 [[SHR29_2]], [[CONV15_2]]
+; CHECK-NEXT:    [[SUB39_2:%.*]] = or i32 [[SUB_2]], [[SUB20_2]]
+; CHECK-NEXT:    [[CONV40_2:%.*]] = trunc i32 [[SUB39_2]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_2]], ptr [[ARRAYIDX2_2]], align 2
 ; CHECK-NEXT:    [[SUB44_2:%.*]] = or i32 [[ADD4_2]], [[ADD30_2]]
 ; CHECK-NEXT:    [[CONV45_2:%.*]] = trunc i32 [[SUB44_2]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_2]], ptr [[ARRAYIDX18_2]], align 2
 ; CHECK-NEXT:    [[CONV_315:%.*]] = zext i16 [[L]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr null, i64 22
+; CHECK-NEXT:    [[CONV3_316:%.*]] = zext i16 [[M]] to i32
 ; CHECK-NEXT:    [[ADD4_3:%.*]] = or i32 0, [[CONV_315]]
+; CHECK-NEXT:    [[SUB_3:%.*]] = or i32 0, [[CONV3_316]]
 ; CHECK-NEXT:    [[CONV15_3:%.*]] = sext i16 [[N]] to i32
+; CHECK-NEXT:    [[SHR_3:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_3:%.*]] = getelementptr i8, ptr null, i64 30
+; CHECK-NEXT:    [[CONV19_3:%.*]] = sext i16 [[O]] to i32
+; CHECK-NEXT:    [[SUB20_3:%.*]] = or i32 [[SHR_3]], [[CONV19_3]]
 ; CHECK-NEXT:    [[SHR29_3:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_3:%.*]] = or i32 [[SHR29_3]], [[CONV15_3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i16> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i16> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i16> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[ARRAYIDX2_1]], align 2
+; CHECK-NEXT:    [[SUB39_3:%.*]] = or i32 [[SUB_3]], [[SUB20_3]]
+; CHECK-NEXT:    [[CONV40_3:%.*]] = trunc i32 [[SUB39_3]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_3]], ptr [[ARRAYIDX2_3]], align 2
 ; CHECK-NEXT:    [[SUB44_3:%.*]] = or i32 [[ADD4_3]], [[ADD30_3]]
 ; CHECK-NEXT:    [[CONV45_3:%.*]] = trunc i32 [[SUB44_3]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_3]], ptr [[ARRAYIDX18_3]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index fcbe2d631ba8..3376fb8910b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
@@ -26,7 +26,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '4'
+; YAML-NEXT:   - Cost:            '6'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -36,7 +36,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - Cost:            '8'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -51,35 +51,39 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD16:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0
+; CHECK-NEXT:    [[TMP17]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
@@ -129,7 +133,7 @@ for.body:
 ; YAML:      Function:        getelementptr_2x32
 ; YAML:     Args:
 ; YAML:        - String:          'SLP vectorized with cost '
-; YAML:        - Cost:            '4'
+; YAML:        - Cost:            '8'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -139,36 +143,42 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP13:%.*]], i32 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP12]], 1
+; CHECK-NEXT:    [[T5:%.*]] = add nsw i32 [[T4]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[T5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP5]]
+; CHECK-NEXT:    [[T7:%.*]] = add nsw i32 [[T4]], 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T7]]
+; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
-; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP10]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0
+; CHECK-NEXT:    [[TMP13]] = add nsw <2 x i32> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 1826e19d8e73..4847149c87a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -14,7 +14,7 @@
 ; YAML-NEXT:  Function:        test_i16_extend
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-20'
+; YAML-NEXT:    - Cost:            '-16'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '5'
 ; YAML-NEXT:  ...
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 9f5744b17cb7..929fb29a4a67 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -600,15 +600,27 @@ bb15:                                             ; preds = %bb15, %bb14
 define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
 ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 10 to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[TMP20]], align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index 9562e6d41f7c..fe3db7d462e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,168 +14,389 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
+; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
+; CHECK-NEXT:    [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
+; CHECK-NEXT:    [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
+; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
+; CHECK-NEXT:    [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
+; CHECK-NEXT:    [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
+; CHECK-NEXT:    [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
+; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
+; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
+; CHECK-NEXT:    [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
+; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
+; CHECK-NEXT:    [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
+; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
+; CHECK-NEXT:    [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
+; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
+; CHECK-NEXT:    [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
+; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
+; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
+; CHECK-NEXT:    [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
+; CHECK-NEXT:    [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
+; CHECK-NEXT:    [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
+; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
+; CHECK-NEXT:    [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
+; CHECK-NEXT:    [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
+; CHECK-NEXT:    [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
+; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
+; CHECK-NEXT:    [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
+; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
+; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
+; CHECK-NEXT:    [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
+; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
+; CHECK-NEXT:    [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
+; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
+; CHECK-NEXT:    [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
+; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
+; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
+; CHECK-NEXT:    [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
+; CHECK-NEXT:    [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
+; CHECK-NEXT:    [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
+; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
+; CHECK-NEXT:    [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
+; CHECK-NEXT:    [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
+; CHECK-NEXT:    [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
+; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
+; CHECK-NEXT:    [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
+; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
+; CHECK-NEXT:    [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
+; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
+; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
+; CHECK-NEXT:    [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
+; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
+; CHECK-NEXT:    [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
+; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
+; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
+; CHECK-NEXT:    [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
+; CHECK-NEXT:    [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
+; CHECK-NEXT:    [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
+; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
+; CHECK-NEXT:    [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
+; CHECK-NEXT:    [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
+; CHECK-NEXT:    [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
+; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
+; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
+; CHECK-NEXT:    [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
+; CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
+; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
+; CHECK-NEXT:    [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
+; CHECK-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
+; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
+; CHECK-NEXT:    [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
+; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
+; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
+; CHECK-NEXT:    [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
+; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
+; CHECK-NEXT:    [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
+; CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
+; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
+; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
+; CHECK-NEXT:    [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
+; CHECK-NEXT:    [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
+; CHECK-NEXT:    [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
+; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
+; CHECK-NEXT:    [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
+; CHECK-NEXT:    [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
+; CHECK-NEXT:    [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
+; CHECK-NEXT:    [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
+; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
+; CHECK-NEXT:    [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
+; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
+; CHECK-NEXT:    [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
+; CHECK-NEXT:    [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
+; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
+; CHECK-NEXT:    [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
+; CHECK-NEXT:    [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
+; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
+; CHECK-NEXT:    [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
+; CHECK-NEXT:    [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
+; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
+; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
+; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
+; CHECK-NEXT:    [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
+; CHECK-NEXT:    [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
+; CHECK-NEXT:    [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
+; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
+; CHECK-NEXT:    [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
+; CHECK-NEXT:    [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
+; CHECK-NEXT:    [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
+; CHECK-NEXT:    [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
+; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
+; CHECK-NEXT:    [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
+; CHECK-NEXT:    [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
+; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
+; CHECK-NEXT:    [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
+; CHECK-NEXT:    [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
+; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
+; CHECK-NEXT:    [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
+; CHECK-NEXT:    [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
+; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
+; CHECK-NEXT:    [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
+; CHECK-NEXT:    [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
+; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
+; CHECK-NEXT:    [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
+; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
+; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP83]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP85:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP86:%.*]] = shufflevector <64 x i16> [[TMP84]], <64 x i16> [[TMP85]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP87:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP88:%.*]] = shufflevector <64 x i16> [[TMP86]], <64 x i16> [[TMP87]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP88]], <64 x i16> [[TMP89]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]]
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]]
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]]
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]]
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]]
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]]
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]]
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]]
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]]
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]]
-; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]]
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]]
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]]
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]]
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]]
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]]
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]])
+; CHECK-NEXT:    [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT:    [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]]
+; CHECK-NEXT:    [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]]
+; CHECK-NEXT:    [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
+; CHECK-NEXT:    [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
+; CHECK-NEXT:    [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]]
+; CHECK-NEXT:    [[MUL_1_7:%.*]] = mul nuw nsw i32 [[CONV_1_7]], [[CONV_1_7]]
+; CHECK-NEXT:    [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
+; CHECK-NEXT:    [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
+; CHECK-NEXT:    [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
+; CHECK-NEXT:    [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]]
+; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]]
+; CHECK-NEXT:    [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
+; CHECK-NEXT:    [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
+; CHECK-NEXT:    [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
+; CHECK-NEXT:    [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]]
+; CHECK-NEXT:    [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]]
+; CHECK-NEXT:    [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
+; CHECK-NEXT:    [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
+; CHECK-NEXT:    [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
+; CHECK-NEXT:    [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]]
+; CHECK-NEXT:    [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]]
+; CHECK-NEXT:    [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
+; CHECK-NEXT:    [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
+; CHECK-NEXT:    [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
+; CHECK-NEXT:    [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]]
+; CHECK-NEXT:    [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]]
+; CHECK-NEXT:    [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
+; CHECK-NEXT:    [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
+; CHECK-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
+; CHECK-NEXT:    [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]]
+; CHECK-NEXT:    [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]]
+; CHECK-NEXT:    [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
+; CHECK-NEXT:    [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
+; CHECK-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
+; CHECK-NEXT:    [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]]
+; CHECK-NEXT:    [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]]
+; CHECK-NEXT:    [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP82]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
@@ -580,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
+; CHECK-NEXT:    [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
+; CHECK-NEXT:    [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
+; CHECK-NEXT:    [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
+; CHECK-NEXT:    [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
+; CHECK-NEXT:    [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
+; CHECK-NEXT:    [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
+; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
+; CHECK-NEXT:    [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
+; CHECK-NEXT:    [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
+; CHECK-NEXT:    [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
+; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
+; CHECK-NEXT:    [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
+; CHECK-NEXT:    [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
+; CHECK-NEXT:    [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
+; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
+; CHECK-NEXT:    [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
+; CHECK-NEXT:    [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
+; CHECK-NEXT:    [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
+; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
+; CHECK-NEXT:    [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
+; CHECK-NEXT:    [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
+; CHECK-NEXT:    [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
+; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
+; CHECK-NEXT:    [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
+; CHECK-NEXT:    [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
+; CHECK-NEXT:    [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
+; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
+; CHECK-NEXT:    [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]]
+; CHECK-NEXT:    [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
similarity index 94%
rename from llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
index 4478eab7b827..15f4cffe7791 100644
--- a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
index afaf6b98e508..094d60b66b39 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
@@ -90,14 +90,14 @@ entry:
 define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i64, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i64, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
@@ -131,14 +131,14 @@ entry:
 define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i32, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i32, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
index aeb82d800a2f..3c2f9e4d0ab5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
@@ -4,17 +4,17 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1
 ; CHECK-NEXT:    br label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN:.*]]:
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr null, align 1
 ; CHECK-NEXT:    br label %[[TRAP:.*]]
 ; CHECK:       [[BB3:.*:]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr null, align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[TRAP]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
index 3cb81b72d26a..14ce08cb7aeb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
@@ -6,19 +6,36 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
 ; CHECK-LABEL: define void @should_vectorize_gep
 ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
+; CHECK-NEXT:    [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
+; CHECK-NEXT:    [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
+; CHECK-NEXT:    [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
+; CHECK-NEXT:    [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
+; CHECK-NEXT:    [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
+; CHECK-NEXT:    [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
+; CHECK-NEXT:    [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
+; CHECK-NEXT:    [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
+; CHECK-NEXT:    [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
+; CHECK-NEXT:    [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
+; CHECK-NEXT:    [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
+; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
+; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
+; CHECK-NEXT:    [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
+; CHECK-NEXT:    [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
+; CHECK-NEXT:    [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
+; CHECK-NEXT:    [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
 ; CHECK-NEXT:    call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index c728572313d7..cbf8bc9dcf8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
 ; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
new file mode 100644
index 000000000000..f90456297d7c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[PHI7]], i32 0
+; CHECK-NEXT:    switch i32 0, label [[BB16:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB14:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB11:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB11]]
+; CHECK:       bb10:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb11:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ [[TMP1]], [[BB1]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb14:
+; CHECK-NEXT:    ret void
+; CHECK:       bb15:
+; CHECK-NEXT:    ret void
+; CHECK:       bb16:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ], [ poison, [[BB25:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb25:
+; CHECK-NEXT:    switch i32 0, label [[BB16]] [
+; CHECK-NEXT:      i32 0, label [[BB14]]
+; CHECK-NEXT:      i32 1, label [[BB15:%.*]]
+; CHECK-NEXT:    ]
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi2 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi3 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi4 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi5 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi6 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi7 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi8 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb11
+  ]
+
+bb9:
+  br label %bb11
+
+bb10:
+  br label %bb1
+
+bb11:
+  %phi12 = phi i32 [ 0, %bb9 ], [ %phi7, %bb1 ]
+  %phi13 = phi i32 [ 0, %bb9 ], [ undef, %bb1 ]
+  ret void
+
+bb14:
+  ret void
+
+bb15:
+  ret void
+
+bb16:
+  %phi17 = phi i32 [ %phi, %bb1 ], [ 0, %bb25 ]
+  %phi18 = phi i32 [ %phi2, %bb1 ], [ 0, %bb25 ]
+  %phi19 = phi i32 [ %phi3, %bb1 ], [ 0, %bb25 ]
+  %phi20 = phi i32 [ %phi4, %bb1 ], [ 0, %bb25 ]
+  %phi21 = phi i32 [ %phi5, %bb1 ], [ 0, %bb25 ]
+  %phi22 = phi i32 [ %phi6, %bb1 ], [ 0, %bb25 ]
+  %phi23 = phi i32 [ %phi7, %bb1 ], [ 0, %bb25 ]
+  %phi24 = phi i32 [ %phi8, %bb1 ], [ 0, %bb25 ]
+  ret void
+
+bb25:
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb15
+  ]
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
similarity index 91%
rename from llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
rename to llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
index 002b9a70255d..278e55c67f23 100644
--- a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %sptr, i64 %0) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
new file mode 100644
index 000000000000..0dac02b0bcc0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], splat (i32 16)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+; CHECK-NEXT:    [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], splat (i32 16)
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16>
+; CHECK-NEXT:    br i1 true, label [[BB3]], label [[BB2]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
+; CHECK-NEXT:    store i32 [[TMP18]], ptr [[B]], align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP19]] to i32
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[C]], align 16
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[B]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br i1 %tobool3.not, label %bb1, label %bb2
+
+bb1:
+  %conv1.i.us = ashr i32 %0, 16
+  %cmp2.i.us = icmp slt i32 %conv1.i.us, %0
+  %sext26.us = zext i1 %cmp2.i.us to i32
+  %conv1.i.us.5 = ashr i32 %0, 16
+  %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0
+  %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32
+  %conv1.i.us.6 = ashr i32 %0, 16
+  %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0
+  %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32
+  %conv1.i.us.7 = ashr i32 %0, 16
+  %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0
+  %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32
+  br label %bb3
+
+bb2:
+  %cmp2.i = icmp sgt i32 %0, 0
+  %1 = zext i1 %cmp2.i to i32
+  %cond.i = select i1 %tobool3.not, i32 %0, i32 %1
+  %sext26 = shl i32 %cond.i, 16
+  %conv13 = ashr i32 %sext26, 16
+  %cmp2.i.5 = icmp sgt i32 %0, 0
+  %2 = zext i1 %cmp2.i.5 to i32
+  %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2
+  %sext26.5 = shl i32 %cond.i.5, 16
+  %conv13.5 = ashr i32 %sext26.5, 16
+  %cmp2.i.6 = icmp sgt i32 %0, 0
+  %3 = zext i1 %cmp2.i.6 to i32
+  %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3
+  %sext26.6 = shl i32 %cond.i.6, 16
+  %conv13.6 = ashr i32 %sext26.6, 16
+  %cmp2.i.7 = icmp sgt i32 %0, 0
+  %4 = zext i1 %cmp2.i.7 to i32
+  %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4
+  %sext26.7 = shl i32 %cond.i.7, 16
+  %conv13.7 = ashr i32 %sext26.7, 16
+  br i1 true, label %bb3, label %bb2
+
+bb3:
+  %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ]
+  %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ]
+  %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ]
+  %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ]
+  store i32 %conv13p, ptr %b, align 16
+  store i32 %conv13.5p, ptr %a, align 8
+  store i32 %conv13.6p, ptr %c, align 16
+  store i32 %conv13.7p, ptr %b, align 8
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index 9b6511d0d828..d880c6b1783c 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) {
 ; AARCH64-LABEL: define <2 x i32> @test(
 ; AARCH64-SAME: i32 [[ARG:%.*]]) {
 ; AARCH64-NEXT:  bb:
-; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ARG]], i32 0
-; AARCH64-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer
-; AARCH64-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP2:%.*]] = or i32 [[ARG]], 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = mul i32 0, 1
 ; AARCH64-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
 ; AARCH64-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1
 ; AARCH64-NEXT:    ret <2 x i32> [[TMP1]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SROA/invariant-group.ll b/llvm/test/Transforms/SROA/invariant-group.ll
index 1be6f6e2fc32..c9c9e031ca95 100644
--- a/llvm/test/Transforms/SROA/invariant-group.ll
+++ b/llvm/test/Transforms/SROA/invariant-group.ll
@@ -11,10 +11,17 @@ declare i32 @somevalue()
 
 define void @f() {
 ; CHECK-LABEL: @f(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[A1_I8_INV:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[A]])
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds [[T]], ptr [[A]], i32 0, i32 1
 ; CHECK-NEXT:    [[SV1:%.*]] = call i32 @somevalue()
 ; CHECK-NEXT:    [[SV2:%.*]] = call i32 @somevalue()
-; CHECK-NEXT:    call void @h(i32 [[SV1]])
-; CHECK-NEXT:    call void @h(i32 [[SV2]])
+; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 [[SV2]], ptr [[A2]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[A2]], align 4
+; CHECK-NEXT:    call void @h(i32 [[V1]])
+; CHECK-NEXT:    call void @h(i32 [[V2]])
 ; CHECK-NEXT:    ret void
 ;
   %a = alloca %t
@@ -44,7 +51,7 @@ define void @g() {
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds [[T]], ptr [[A]], i32 0, i32 1
 ; CHECK-NEXT:    [[SV1:%.*]] = call i32 @somevalue()
 ; CHECK-NEXT:    [[SV2:%.*]] = call i32 @somevalue()
-; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    store i32 [[SV2]], ptr [[A2]], align 4
 ; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[A2]], align 4
@@ -81,6 +88,9 @@ define void @g() {
 
 define void @store_and_launder() {
 ; CHECK-LABEL: @store_and_launder(
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[VALPTR]], align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
 ; CHECK-NEXT:    ret void
 ;
   %valptr = alloca i32, align 4
@@ -91,7 +101,10 @@ define void @store_and_launder() {
 
 define i32 @launder_and_load() {
 ; CHECK-LABEL: @launder_and_load(
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[VALPTR]], align 4
+; CHECK-NEXT:    ret i32 [[V2]]
 ;
   %valptr = alloca i32, align 4
   %barr = call ptr @llvm.launder.invariant.group.p0(ptr %valptr)
@@ -101,6 +114,9 @@ define i32 @launder_and_load() {
 
 define void @launder_and_ptr_arith() {
 ; CHECK-LABEL: @launder_and_ptr_arith(
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i32, ptr [[VALPTR]], i32 0
 ; CHECK-NEXT:    ret void
 ;
   %valptr = alloca i32, align 4
@@ -140,9 +156,13 @@ end:
 
 define void @partial_promotion_of_alloca() {
 ; CHECK-LABEL: @partial_promotion_of_alloca(
-; CHECK-NEXT:    [[STRUCT_PTR_SROA_2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store volatile i32 0, ptr [[STRUCT_PTR_SROA_2]], align 4
-; CHECK-NEXT:    [[STRUCT_PTR_SROA_2_0_STRUCT_PTR_SROA_2_4_LOAD_VAL:%.*]] = load volatile i32, ptr [[STRUCT_PTR_SROA_2]], align 4
+; CHECK-NEXT:    [[STRUCT_PTR:%.*]] = alloca [[T:%.*]], align 4
+; CHECK-NEXT:    [[FIELD_PTR:%.*]] = getelementptr inbounds [[T]], ptr [[STRUCT_PTR]], i32 0, i32 0
+; CHECK-NEXT:    store i32 0, ptr [[FIELD_PTR]], align 4
+; CHECK-NEXT:    [[VOLATILE_FIELD_PTR:%.*]] = getelementptr inbounds [[T]], ptr [[STRUCT_PTR]], i32 0, i32 1
+; CHECK-NEXT:    store volatile i32 0, ptr [[VOLATILE_FIELD_PTR]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[STRUCT_PTR]])
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load volatile i32, ptr [[VOLATILE_FIELD_PTR]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %struct_ptr = alloca %t, align 4
@@ -155,6 +175,61 @@ define void @partial_promotion_of_alloca() {
   ret void
 }
 
+define void @memcpy_after_laundering_alloca(ptr %ptr) {
+; CHECK-LABEL: @memcpy_after_laundering_alloca(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca { i64, i64 }, align 8
+; CHECK-NEXT:    [[LAUNDER:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[ALLOCA]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[LAUNDER]], ptr [[PTR:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca { i64, i64 }, align 8
+  %launder = call ptr @llvm.launder.invariant.group.p0(ptr %alloca)
+  call void @llvm.memcpy.p0.p0.i64(ptr %launder, ptr %ptr, i64 16, i1 false)
+  ret void
+}
+
+define void @memcpy_after_laundering_alloca_slices(ptr %ptr) {
+; CHECK-LABEL: @memcpy_after_laundering_alloca_slices(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca { [16 x i8], i64, [16 x i8] }, align 8
+; CHECK-NEXT:    [[LAUNDER:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[ALLOCA]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[LAUNDER]], i64 16
+; CHECK-NEXT:    store i64 0, ptr [[GEP]], align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[LAUNDER]], ptr [[PTR:%.*]], i64 40, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca { [16 x i8], i64, [16 x i8] }, align 8
+  %launder = call ptr @llvm.launder.invariant.group.p0(ptr %alloca)
+  %gep = getelementptr i8, ptr %launder, i64 16
+  store i64 0, ptr %gep
+  call void @llvm.memcpy.p0.p0.i64(ptr %launder, ptr %ptr, i64 40, i1 false)
+  ret void
+}
+
+define void @test_agg_store() {
+; CHECK-LABEL: @test_agg_store(
+; CHECK-NEXT:    [[STRUCT_PTR:%.*]] = alloca [[T:%.*]], i64 1, align 4
+; CHECK-NEXT:    [[STRUCT_PTR_FRESH:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[STRUCT_PTR]])
+; CHECK-NEXT:    [[STRUCT:%.*]] = call [[T]] @[[MAKE_T:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]()
+; CHECK-NEXT:    store [[T]] [[STRUCT]], ptr [[STRUCT_PTR_FRESH]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[FIRST_PTR:%.*]] = getelementptr [[T]], ptr [[STRUCT_PTR_FRESH]], i32 0, i32 0
+; CHECK-NEXT:    [[FIRST:%.*]] = load i32, ptr [[FIRST_PTR]], align 4
+; CHECK-NEXT:    [[SECOND_PTR:%.*]] = getelementptr [[T]], ptr [[STRUCT_PTR_FRESH]], i32 0, i32 1
+; CHECK-NEXT:    [[SECOND:%.*]] = load i32, ptr [[SECOND_PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %struct_ptr = alloca %t, i64 1, align 4
+  %struct_ptr_fresh = call ptr @llvm.launder.invariant.group.p0(ptr %struct_ptr)
+  %struct = call %t @make_t()
+  store %t %struct, ptr %struct_ptr_fresh, align 4, !invariant.group !0
+  %first_ptr = getelementptr %t, ptr %struct_ptr_fresh, i32 0, i32 0
+  %first = load i32, ptr %first_ptr, align 4
+  %second_ptr = getelementptr %t, ptr %struct_ptr_fresh, i32 0, i32 1
+  %second = load i32, ptr %second_ptr, align 4
+  ret void
+}
+
+declare %t @make_t()
+
 declare void @use(ptr)
 
 !0 = !{}
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
index e6e5f5196d3d..5c035d29a7ea 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) {
 ; Scalarizing the load for multiple constant indices may not be profitable.
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; because the vector large vector requires 2 vector registers.
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
-; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <8 x i32>, ptr %x, align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
index 5f3229398792..ad5f5a7107c2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
@@ -13,8 +13,7 @@ define <2 x float> @maxnum(float %x, float %y) {
 ; AVX2-LABEL: define <2 x float> @maxnum(
 ; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 ; AVX2-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison)
-; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> poison, float [[V_SCALAR]], i64 0
 ; AVX2-NEXT:    ret <2 x float> [[V]]
 ;
   %x.insert = insertelement <2 x float> poison, float %x, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/binop-scalarize.ll b/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
index 52a706a0b59a..bc07f8b08649 100644
--- a/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
@@ -20,3 +20,22 @@ define <4 x i8> @udiv_ub(i8 %x, i8 %y) {
   %v = udiv <4 x i8> %x.insert, %y.insert
   ret <4 x i8> %v
 }
+
+
+; Unfoldable constant expression may cause infinite loop between 
+; scalarizing insertelement and folding binop(insert(x,a,idx),insert(y,b,idx))
+@val = external hidden global ptr, align 8
+
+define <2 x i64> @pr153012(i64 %idx) #0 {
+; CHECK-LABEL: define <2 x i64> @pr153012(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = insertelement <2 x i64> <i64 5, i64 ptrtoint (ptr @val to i64)>, i64 [[IDX]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = or disjoint <2 x i64> splat (i64 2), [[A]]
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+entry:
+  %a = insertelement <2 x i64> <i64 5, i64 ptrtoint (ptr @val to i64)>, i64 %idx, i32 0
+  %b = or disjoint <2 x i64> splat (i64 2), %a
+  ret <2 x i64> %b
+}
diff --git a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
index 9e43a28bf1e5..abd98a4dc64b 100644
--- a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
@@ -5,8 +5,7 @@ define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -19,8 +18,7 @@ define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -33,8 +31,7 @@ define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -46,8 +43,7 @@ define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -59,8 +55,7 @@ define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -72,8 +67,7 @@ define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -100,8 +94,7 @@ define <4 x float> @fabs_fixed(float %x) {
 ; CHECK-LABEL: define <4 x float> @fabs_fixed(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
@@ -113,8 +106,7 @@ define <vscale x 4 x float> @fabs_scalable(float %x) {
 ; CHECK-LABEL: define <vscale x 4 x float> @fabs_scalable(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
@@ -126,8 +118,7 @@ define <4 x float> @fma_fixed(float %x, float %y, float %z) {
 ; CHECK-LABEL: define <4 x float> @fma_fixed(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
@@ -141,8 +132,7 @@ define <vscale x 4 x float> @fma_scalable(float %x, float %y, float %z) {
 ; CHECK-LABEL: define <vscale x 4 x float> @fma_scalable(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
@@ -156,8 +146,7 @@ define <4 x float> @scalar_argument(float %x) {
 ; CHECK-LABEL: define <4 x float> @scalar_argument(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.powi.f32.i32(float [[X]], i32 42)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 42)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
diff --git a/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test b/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test
new file mode 100644
index 000000000000..12f14b5d58e1
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test
@@ -0,0 +1,134 @@
+## Test that bogus associative section symbols in executables are ignored.
+##
+## The executable contains two (bogus) associative section symbols, both for
+## (parts of) the .rdata section; one pointing at the .debug_info section
+## (which will be stripped out) and one pointing at a nonexistent section.
+##
+## Check that stripping does succeed, and that it doesn't end up removing
+## the .rdata section.
+
+# RUN: yaml2obj %s -o %t.in.exe
+
+# RUN: llvm-strip --strip-debug %t.in.exe -o %t.out.exe
+# RUN: llvm-readobj --sections %t.out.exe | FileCheck %s
+
+# CHECK: Name: .rdata
+
+--- !COFF
+OptionalHeader:
+  AddressOfEntryPoint: 4096
+  ImageBase:       5368709120
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 4
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 5
+  MinorSubsystemVersion: 2
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [  ]
+  SizeOfStackReserve: 2097152
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     48
+    SectionData:     E806000000E802000000C3C3C30F1F00FFFFFFFFFFFFFFFF0000000000000000FFFFFFFFFFFFFFFF0000000000000000
+    SizeOfRawData:   512
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     4
+    SectionData:     '00000000'
+    SizeOfRawData:   512
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  16384
+    VirtualSize:     4
+    SectionData:     '00000000'
+    SizeOfRawData:   512
+symbols:
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          11
+      NumberOfRelocations: 2
+      NumberOfLinenumbers: 0
+      CheckSum:        1703692295
+      Number:          1
+  - Name:            '.text$func1'
+    Value:           11
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        40735498
+      Number:          3
+      Selection:       IMAGE_COMDAT_SELECT_ANY
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+      Selection:       IMAGE_COMDAT_SELECT_ASSOCIATIVE
+  - Name:            '.text$func2'
+    Value:           12
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        40735498
+      Number:          4
+      Selection:       IMAGE_COMDAT_SELECT_ANY
+  - Name:            .rdata
+    Value:           1
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          4
+      Selection:       IMAGE_COMDAT_SELECT_ASSOCIATIVE
+  - Name:            .debug_info
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+...
diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
new file mode 100644
index 000000000000..6a4927e4af2a
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
@@ -0,0 +1,47 @@
+/// Checks that various hexagon scenarios are handled correctly:
+///  - branch targets
+///  - endloops
+///  - inline-relocs
+///  - multi-insn bundles
+
+{
+  r6 = sub(r1, r0)
+  r7 = and(r4, #0x0)
+  if (p1) jump:t target1
+  if (p2) jump:nt target2
+}
+
+{
+  r8 = r7
+  r9 = add(r8, #0)
+  r10 = memw(r9)
+} :endloop0
+
+{ jump ##sym }
+
+target1:
+  nop
+
+target2:
+  nop
+
+// RUN: llvm-mc %s --triple=hexagon -filetype=obj | llvm-objdump -d -r - | FileCheck %s
+
+//      CHECK: 00000000 <.text>:
+// CHECK-NEXT:        0:       12 51 00 5c     5c005112 {      if (p1) jump:t 0x24 <target1>
+// CHECK-NEXT:        4:       14 42 00 5c     5c004214        if (p2) jump:nt 0x28 <target2>
+// CHECK-NEXT:        8:       06 41 20 f3     f3204106        r6 = sub(r1,r0)
+// CHECK-NEXT:        c:       07 c0 04 76     7604c007        r7 = and(r4,#0x0) }
+// CHECK-NEXT:       10:       08 80 67 70     70678008 {      r8 = r7
+// CHECK-NEXT:       14:       09 40 08 b0     b0084009        r9 = add(r8,#0x0)
+// CHECK-NEXT:       18:       0a c0 89 91     9189c00a        r10 = memw(r9+#0x0) }  :endloop0
+// CHECK-NEXT:       1c:       00 40 00 00     00004000 {      immext(#0x0)
+// CHECK-NEXT:                         0000001c:  R_HEX_B32_PCREL_X    sym
+// CHECK-NEXT:       20:       00 c0 00 58     5800c000        jump 0x1c <.text+0x1c> }
+// CHECK-NEXT:                         00000020:  R_HEX_B22_PCREL_X    sym+0x4
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000024 <target1>:
+// CHECK-NEXT:       24:       00 c0 00 7f     7f00c000 {      nop }
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000028 <target2>:
+// CHECK-NEXT:       28:       00 c0 00 7f     7f00c000 {      nop }
diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp
index 607184e3b724..86727931067a 100644
--- a/llvm/tools/llvm-mc/Disassembler.cpp
+++ b/llvm/tools/llvm-mc/Disassembler.cpp
@@ -45,7 +45,11 @@ static bool PrintInsts(const MCDisassembler &DisAsm, const ByteArrayTy &Bytes,
     MCInst Inst;
 
     MCDisassembler::DecodeStatus S;
-    S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
+    if (STI.getTargetTriple().getArch() == Triple::hexagon)
+      S = DisAsm.getInstructionBundle(Inst, Size, Data.slice(Index), Index,
+                                      nulls());
+    else
+      S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
     switch (S) {
     case MCDisassembler::Fail:
       SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]),
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index c5967cd090ee..74eb9033c8e2 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -693,6 +693,30 @@ class PrettyPrinter {
     } else
       OS << "\t<unknown>";
   }
+
+  virtual void emitPostInstructionInfo(formatted_raw_ostream &FOS,
+                                       const MCAsmInfo &MAI,
+                                       const MCSubtargetInfo &STI,
+                                       StringRef Comments,
+                                       LiveVariablePrinter &LVP) {
+    do {
+      if (!Comments.empty()) {
+        // Emit a line of comments.
+        StringRef Comment;
+        std::tie(Comment, Comments) = Comments.split('\n');
+        // MAI.getCommentColumn() assumes that instructions are printed at the
+        // position of 8, while getInstStartColumn() returns the actual
+        // position.
+        unsigned CommentColumn =
+            MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
+        FOS.PadToColumn(CommentColumn);
+        FOS << MAI.getCommentString() << ' ' << Comment;
+      }
+      LVP.printAfterInst(FOS);
+      FOS << "\n";
+    } while (!Comments.empty());
+    FOS.flush();
+  }
 };
 PrettyPrinter PrettyPrinterInst;
 
@@ -714,6 +738,35 @@ class HexagonPrettyPrinter : public PrettyPrinter {
       }
     }
   }
+
+  std::string getInstructionSeparator() const {
+    SmallString<40> Separator;
+    raw_svector_ostream OS(Separator);
+    if (ShouldClosePacket) {
+      OS << " }";
+      if (IsLoop0 || IsLoop1)
+        OS << "  ";
+      if (IsLoop0)
+        OS << (IsLoop1 ? ":endloop01" : ":endloop0");
+      else if (IsLoop1)
+        OS << ":endloop1";
+    }
+    OS << '\n';
+    return OS.str().str();
+  }
+
+  void emitPostInstructionInfo(formatted_raw_ostream &FOS, const MCAsmInfo &MAI,
+                               const MCSubtargetInfo &STI, StringRef Comments,
+                               LiveVariablePrinter &LVP) override {
+    // Hexagon does not write anything to the comment stream, so we can just
+    // print the separator.
+    LVP.printAfterInst(FOS);
+    FOS << getInstructionSeparator();
+    FOS.flush();
+    if (ShouldClosePacket)
+      reset();
+  }
+
   void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
                  object::SectionedAddress Address, formatted_raw_ostream &OS,
                  StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
@@ -724,60 +777,64 @@ class HexagonPrettyPrinter : public PrettyPrinter {
     if (!MI) {
       printLead(Bytes, Address.Address, OS);
       OS << " <unknown>";
+      reset();
       return;
     }
-    std::string Buffer;
+
+    StringRef Preamble = IsStartOfBundle ? " { " : "   ";
+
+    if (SP && (PrintSource || PrintLines))
+      SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
+    printLead(Bytes, Address.Address, OS);
+    OS << Preamble;
+    std::string Buf;
     {
-      raw_string_ostream TempStream(Buffer);
+      raw_string_ostream TempStream(Buf);
       IP.printInst(MI, Address.Address, "", STI, TempStream);
     }
-    StringRef Contents(Buffer);
-    // Split off bundle attributes
-    auto PacketBundle = Contents.rsplit('\n');
-    // Split off first instruction from the rest
-    auto HeadTail = PacketBundle.first.split('\n');
-    auto Preamble = " { ";
-    auto Separator = "";
-
-    // Hexagon's packets require relocations to be inline rather than
-    // clustered at the end of the packet.
-    std::vector<RelocationRef>::const_iterator RelCur = Rels->begin();
-    std::vector<RelocationRef>::const_iterator RelEnd = Rels->end();
-    auto PrintReloc = [&]() -> void {
-      while ((RelCur != RelEnd) && (RelCur->getOffset() <= Address.Address)) {
-        if (RelCur->getOffset() == Address.Address) {
-          printRelocation(OS, ObjectFilename, *RelCur, Address.Address, false);
-          return;
-        }
-        ++RelCur;
-      }
-    };
+    StringRef Contents(Buf);
+
+    auto Duplex = Contents.split('\v');
+    bool HasDuplex = !Duplex.second.empty();
+    if (HasDuplex) {
+      OS << Duplex.first;
+      OS << "; ";
+      OS << Duplex.second;
+    } else {
+      OS << Duplex.first;
+    }
 
-    while (!HeadTail.first.empty()) {
-      OS << Separator;
-      Separator = "\n";
-      if (SP && (PrintSource || PrintLines))
-        SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
-      printLead(Bytes, Address.Address, OS);
-      OS << Preamble;
-      Preamble = "   ";
-      StringRef Inst;
-      auto Duplex = HeadTail.first.split('\v');
-      if (!Duplex.second.empty()) {
-        OS << Duplex.first;
-        OS << "; ";
-        Inst = Duplex.second;
-      }
+    uint32_t Instruction = support::endian::read32le(Bytes.data());
+
+    uint32_t ParseMask = 0x0000c000;
+    uint32_t PacketEndMask = 0x0000c000;
+    uint32_t LoopEndMask = 0x00008000;
+    uint32_t ParseBits = Instruction & ParseMask;
+
+    if (ParseBits == LoopEndMask) {
+      if (IsStartOfBundle)
+        IsLoop0 = true;
       else
-        Inst = HeadTail.first;
-      OS << Inst;
-      HeadTail = HeadTail.second.split('\n');
-      if (HeadTail.first.empty())
-        OS << " } " << PacketBundle.second;
-      PrintReloc();
-      Bytes = Bytes.slice(4);
-      Address.Address += 4;
+        IsLoop1 = true;
     }
+
+    IsStartOfBundle = false;
+
+    if (ParseBits == PacketEndMask || HasDuplex)
+      ShouldClosePacket = true;
+  }
+
+private:
+  bool IsStartOfBundle = true;
+  bool IsLoop0 = false;
+  bool IsLoop1 = false;
+  bool ShouldClosePacket = false;
+
+  void reset() {
+    IsStartOfBundle = true;
+    IsLoop0 = false;
+    IsLoop1 = false;
+    ShouldClosePacket = false;
   }
 };
 HexagonPrettyPrinter HexagonPrettyPrinterInst;
@@ -1610,29 +1667,6 @@ static StringRef getSegmentName(const MachOObjectFile *MachO,
   return "";
 }
 
-static void emitPostInstructionInfo(formatted_raw_ostream &FOS,
-                                    const MCAsmInfo &MAI,
-                                    const MCSubtargetInfo &STI,
-                                    StringRef Comments,
-                                    LiveVariablePrinter &LVP) {
-  do {
-    if (!Comments.empty()) {
-      // Emit a line of comments.
-      StringRef Comment;
-      std::tie(Comment, Comments) = Comments.split('\n');
-      // MAI.getCommentColumn() assumes that instructions are printed at the
-      // position of 8, while getInstStartColumn() returns the actual position.
-      unsigned CommentColumn =
-          MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
-      FOS.PadToColumn(CommentColumn);
-      FOS << MAI.getCommentString() << ' ' << Comment;
-    }
-    LVP.printAfterInst(FOS);
-    FOS << '\n';
-  } while (!Comments.empty());
-  FOS.flush();
-}
-
 static void createFakeELFSections(ObjectFile &Obj) {
   assert(Obj.isELF());
   if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(&Obj))
@@ -2526,15 +2560,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
         }
 
         assert(DT->Context->getAsmInfo());
-        emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
-                                *DT->SubtargetInfo, CommentStream.str(), LVP);
+        DT->Printer->emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
+                                             *DT->SubtargetInfo,
+                                             CommentStream.str(), LVP);
         Comments.clear();
 
         if (BTF)
           printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP);
 
-        // Hexagon handles relocs in pretty printer
-        if (InlineRelocs && Obj.getArch() != Triple::hexagon) {
+        if (InlineRelocs) {
           while (findRel()) {
             // When --adjust-vma is used, update the address printed.
             printRelocation(FOS, Obj.getFileName(), *RelCur,
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 7a48105a1dc9..bf396499e35c 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/FloatingPointPredicateUtils.h"
 #include "llvm/AsmParser/Parser.h"
@@ -2208,6 +2209,41 @@ TEST_F(ComputeKnownFPClassTest, Constants) {
   }
 }
 
+TEST_F(ComputeKnownFPClassTest, fcmpImpliesClass_fabs_zero) {
+  parseAssembly("define float @test(float %x) {\n"
+                "  %A = call float @llvm.fabs.f32(float %x)\n"
+                "  ret float %A\n"
+                "}\n");
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OEQ, *F, A, fcZero)),
+            fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UEQ, *F, A, fcZero)),
+            fcZero | fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UNE, *F, A, fcZero)),
+            ~fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ONE, *F, A, fcZero)),
+            ~fcNan & ~fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ORD, *F, A, fcZero)),
+            ~fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UNO, *F, A, fcZero)),
+            fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OGT, *F, A, fcZero)),
+            fcSubnormal | fcNormal | fcInf);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UGT, *F, A, fcZero)),
+            fcSubnormal | fcNormal | fcInf | fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OGE, *F, A, fcZero)),
+            ~fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UGE, *F, A, fcZero)),
+            fcAllFlags);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OLT, *F, A, fcZero)),
+            fcNone);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ULT, *F, A, fcZero)),
+            fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OLE, *F, A, fcZero)),
+            fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ULE, *F, A, fcZero)),
+            fcZero | fcNan);
+}
+
 TEST_F(ValueTrackingTest, isNonZeroRecurrence) {
   parseAssembly(R"(
     define i1 @test(i8 %n, i8 %r) {
diff --git a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
index 8f7beea152ab..30b8bb61184b 100644
--- a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
@@ -87,6 +87,7 @@ shared_library("libclang") {
     "Index_Internal.h",
     "Indexing.cpp",
     "Rewrite.cpp",
+    "Obsolete.cpp",
   ]
   if (host_os == "mac") {
     ldflags = [
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index 2b1a9076afe4..ece4106de4ac 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
 llvm_version_major = 21
-llvm_version_minor = 0
+llvm_version_minor = 1
 llvm_version_patch = 0
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index b5aa8edc03dc..520ff22dc6fb 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Daniel Dunbar"
 __email__ = "daniel@minormatter.com"
-__versioninfo__ = (21, 0, 0)
+__versioninfo__ = (21, 1, 0)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
 __all__ = []
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
index d3369abae70b..03eee0028b3c 100644
--- a/llvm/utils/mlgo-utils/mlgo/__init__.py
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -4,7 +4,7 @@
 
 from datetime import timezone, datetime
 
-__versioninfo__ = (20, 0, 0)
+__versioninfo__ = (21, 1, 0)
 __version__ = (
     ".".join(str(v) for v in __versioninfo__)
     + "dev"
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 3042fc2d77dd..54645d0c6369 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -169,9 +169,9 @@ set common_cmake_flags=^
 
 if "%force-msvc%" == "" (
   where /q clang-cl
-  if errorlevel 0 (
+  if %errorlevel% EQU 0 (
     where /q lld-link
-    if errorlevel 0 (
+    if %errorlevel% EQU 0 (
       set common_compiler_flags=%common_compiler_flags% -fuse-ld=lld
       
       set common_cmake_flags=%common_cmake_flags%^
diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh
index 66bef82586a3..0ac392cbed7b 100755
--- a/llvm/utils/release/export.sh
+++ b/llvm/utils/release/export.sh
@@ -123,7 +123,7 @@ export_sources() {
                 tar -C test-suite-$release$rc.src --strip-components=1 -xzf -
         fi
         echo "Creating tarball for test-suite ..."
-        tar --sort=name --owner=0 --group=0 \
+        XZ_OPT="-T0" tar --sort=name --owner=0 --group=0 \
             --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \
             -cJf test-suite-$release$rc.src.tar.xz test-suite-$release$rc.src
     fi
diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py
index 90c222d1175c..2b4e57d6348e 100755
--- a/llvm/utils/release/github-upload-release.py
+++ b/llvm/utils/release/github-upload-release.py
@@ -45,19 +45,39 @@ def create_release(repo, release, tag=None, name=None, message=None):
         # Note that these lines are not length limited because if we do so, GitHub
         # assumes that should be how it is laid out on the page. We want GitHub to
         # do the reflowing for us instead.
+        #
+        # Once all the automatic binary builds have completed, the HTML comments
+        # with UPPERCASE markers in them will be removed to reveal the download
+        # links later. Other lines are surrounded in <!-- --> for release uploaders
+        # to manually uncomment when they upload that package.
         message = dedent(
             """\
-LLVM {release} Release
+## LLVM {release} Release
 
-## Package Types
+<!-- AUTOMATIC_DOWNLOAD_LINKS_BEGIN
+* [Linux x86_64](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-X64.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-X64.tar.xz.jsonl))
+* [Linux Arm64](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-ARM64.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-ARM64.tar.xz.jsonl))
+AUTOMATIC_DOWNLOAD_LINKS_END -->
+<!-- * [Linux Armv7-a](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-armv7a-linux-gnueabihf.tar.gz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-armv7a-linux-gnueabihf.tar.gz.sig)) -->
 
-Each platform has one binary release package. The file name starts with either `LLVM-` or `clang+llvm-` and ends with the platform's name. For example, `LLVM-{release}-Linux-ARM64.tar.xz` contains LLVM binaries for Arm64 Linux.
+<!-- AUTOMATIC_DOWNLOAD_LINKS_BEGIN
+* [macOS Apple Silicon](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-ARM64.tar.xz) (ARM64) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-ARM64.tar.xz.jsonl))
+* [macOS Intel](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-X64.tar.xz) (x86-64) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-X64.tar.xz.jsonl))
+AUTOMATIC_DOWNLOAD_LINKS_END -->
+
+<!-- * Windows x64 (64-bit): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win64.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win64.exe.sig)), [archive](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-x86_64-pc-windows-msvc.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-x86_64-pc-windows-msvc.tar.xz.sig)) -->
+<!-- * Windows x86 (32-bit): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win32.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win32.exe.sig)) -->
+<!-- * Windows on Arm (ARM64): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-woa64.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-woa64.exe.sig)), [archive](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-aarch64-pc-windows-msvc.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-aarch64-pc-windows-msvc.tar.xz.sig)) -->
 
-Except for Windows. Where `LLVM-*.exe` is an installer intended for using LLVM as a toolchain and `clang+llvm-` contains the contents of the installer, plus libraries and tools not normally used in a toolchain. You most likely want the `LLVM-` installer, unless you are developing software which itself uses LLVM, in which case choose `clang+llvm-`.
+Download links will appear here once builds have completed. <!-- AUTOMATIC_DOWNLOAD_LINKS_PLACEHOLDER -->
+
+For any other variants of platform and architecture, check the full list of release packages at the bottom of this release page. If you do not find a release package for your platform, you may be able to find a community built package on the LLVM Discourse forum thread for this release. Remember that these are built by volunteers and may not always be available. If you rely on a platform or configuration that is not one of the defaults, we suggest you use the binaries that your platform provides, or build your own release packages.
+
+## Package Types
 
-If you do not find a release package for your platform, you may be able to find a community built package on the LLVM Discourse forum thread for this release. Remember that these are built by volunteers and may not always be available.
+Each platform has one binary release package. The file name starts with either `LLVM-` or `clang+llvm-` and ends with the platform's name. For example, `LLVM-{release}-Linux-ARM64.tar.xz` contains LLVM binaries for Arm64 Linux.
 
-If you rely on a platform or configuration that is not one of the defaults, we suggest you use the binaries that your platform provides, or build your own release packages.
+Except for Windows. Where `LLVM-*.exe` is an installer intended for using LLVM as a toolchain and the archive `clang+llvm-` contains the contents of the installer, plus libraries and tools not normally used in a toolchain. You most likely want the `LLVM-` installer, unless you are developing software which itself uses LLVM, in which case choose `clang+llvm-`.
 
 In addition, source archives are available:
 * `<sub-project>-{release}.src.tar.xz` are archives of the sources of specific sub-projects of `llvm-project` (except for `test-suite` which is an archive of the [LLVM Test Suite](https://github.com/llvm/llvm-test-suite)).
@@ -95,9 +115,35 @@ def upload_files(repo, release, files):
         print("Done")
 
 
+def uncomment_download_links(repo, release):
+    release = repo.get_release("llvmorg-{}".format(release))
+
+    new_message = []
+    to_remove = [
+        "AUTOMATIC_DOWNLOAD_LINKS_BEGIN",
+        "AUTOMATIC_DOWNLOAD_LINKS_END",
+        "AUTOMATIC_DOWNLOAD_LINKS_PLACEHOLDER",
+    ]
+    for line in release.body.splitlines():
+        for comment in to_remove:
+            if comment in line:
+                break
+        else:
+            new_message.append(line)
+
+    release.update_release(
+        name=release.title,
+        message="\n".join(new_message),
+        draft=release.draft,
+        prerelease=release.prerelease,
+    )
+
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
-    "command", type=str, choices=["create", "upload", "check-permissions"]
+    "command",
+    type=str,
+    choices=["create", "upload", "check-permissions", "uncomment_download_links"],
 )
 
 # All args
@@ -137,3 +183,5 @@ def upload_files(repo, release, files):
     create_release(llvm_repo, args.release)
 if args.command == "upload":
     upload_files(llvm_repo, args.release, args.files)
+if args.command == "uncomment_download_links":
+    uncomment_download_links(llvm_repo, args.release)
diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake
index ff4269ed7acd..14eefb50ca71 100644
--- a/mlir/cmake/modules/AddMLIR.cmake
+++ b/mlir/cmake/modules/AddMLIR.cmake
@@ -388,6 +388,9 @@ function(add_mlir_library name)
 
   if(TARGET ${name})
     target_link_libraries(${name} INTERFACE ${LLVM_COMMON_LIBS})
+    if(ARG_INSTALL_WITH_TOOLCHAIN)
+      set_target_properties(${name} PROPERTIES MLIR_INSTALL_WITH_TOOLCHAIN TRUE)
+    endif()
     if(NOT ARG_DISABLE_INSTALL)
       add_mlir_library_install(${name})
     endif()
@@ -617,28 +620,29 @@ endfunction(add_mlir_aggregate)
 # This is usually done as part of add_mlir_library but is broken out for cases
 # where non-standard library builds can be installed.
 function(add_mlir_library_install name)
-  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-  get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries)
-  install(TARGETS ${name}
-    COMPONENT ${name}
-    ${export_to_mlirtargets}
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
-    # Note that CMake will create a directory like:
-    #   objects-${CMAKE_BUILD_TYPE}/obj.LibName
-    # and put object files there.
-    OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-  )
+  get_target_property(_install_with_toolchain ${name} MLIR_INSTALL_WITH_TOOLCHAIN)
+  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR _install_with_toolchain)
+    get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries)
+    install(TARGETS ${name}
+      COMPONENT ${name}
+      ${export_to_mlirtargets}
+      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      # Note that CMake will create a directory like:
+      #   objects-${CMAKE_BUILD_TYPE}/obj.LibName
+      # and put object files there.
+      OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+    )
 
-  if (NOT LLVM_ENABLE_IDE)
-    add_llvm_install_targets(install-${name}
-                            DEPENDS ${name}
-                            COMPONENT ${name})
-  endif()
-  set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name})
+    if (NOT LLVM_ENABLE_IDE)
+      add_llvm_install_targets(install-${name}
+                              DEPENDS ${name}
+                              COMPONENT ${name})
+    endif()
+    set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name})
+    set_property(GLOBAL APPEND PROPERTY MLIR_EXPORTS ${name})
   endif()
-  set_property(GLOBAL APPEND PROPERTY MLIR_EXPORTS ${name})
 endfunction()
 
 # Declare an mlir library which is part of the public C-API.
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index 49862927caff..e364570c8b53 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -354,29 +354,7 @@ class DataFlowSolver {
 
   /// Erase any analysis state associated with the given lattice anchor.
   template <typename AnchorT>
-  void eraseState(AnchorT anchor) {
-    LatticeAnchor latticeAnchor(anchor);
-
-    // Update equivalentAnchorMap.
-    for (auto &&[TypeId, eqClass] : equivalentAnchorMap) {
-      if (!eqClass.contains(latticeAnchor)) {
-        continue;
-      }
-      llvm::EquivalenceClasses<LatticeAnchor>::member_iterator leaderIt =
-          eqClass.findLeader(latticeAnchor);
-
-      // Update analysis states with new leader if needed.
-      if (*leaderIt == latticeAnchor && ++leaderIt != eqClass.member_end()) {
-        analysisStates[*leaderIt][TypeId] =
-            std::move(analysisStates[latticeAnchor][TypeId]);
-      }
-
-      eqClass.erase(latticeAnchor);
-    }
-
-    // Update analysis states.
-    analysisStates.erase(latticeAnchor);
-  }
+  void eraseState(AnchorT anchor);
 
   /// Erase all analysis states.
   void eraseAllStates() {
@@ -560,6 +538,36 @@ class AnalysisState {
   friend class DataFlowSolver;
 };
 
+//===----------------------------------------------------------------------===//
+// DataFlowSolver definition
+//===----------------------------------------------------------------------===//
+// This method is defined outside `DataFlowSolver` and after `AnalysisState`
+// to prevent issues around `AnalysisState` being used before it is defined.
+template <typename AnchorT>
+void DataFlowSolver::eraseState(AnchorT anchor) {
+  LatticeAnchor latticeAnchor(anchor);
+
+  // Update equivalentAnchorMap.
+  for (auto &&[TypeId, eqClass] : equivalentAnchorMap) {
+    if (!eqClass.contains(latticeAnchor)) {
+      continue;
+    }
+    llvm::EquivalenceClasses<LatticeAnchor>::member_iterator leaderIt =
+        eqClass.findLeader(latticeAnchor);
+
+    // Update analysis states with new leader if needed.
+    if (*leaderIt == latticeAnchor && ++leaderIt != eqClass.member_end()) {
+      analysisStates[*leaderIt][TypeId] =
+          std::move(analysisStates[latticeAnchor][TypeId]);
+    }
+
+    eqClass.erase(latticeAnchor);
+  }
+
+  // Update analysis states.
+  analysisStates.erase(latticeAnchor);
+}
+
 //===----------------------------------------------------------------------===//
 // DataFlowAnalysis
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h
index d082d2d9f758..18349d071bb2 100644
--- a/mlir/include/mlir/Analysis/SliceAnalysis.h
+++ b/mlir/include/mlir/Analysis/SliceAnalysis.h
@@ -65,8 +65,9 @@ using ForwardSliceOptions = SliceOptions;
 ///
 /// The implementation traverses the use chains in postorder traversal for
 /// efficiency reasons: if an operation is already in `forwardSlice`, no
-/// need to traverse its uses again. Since use-def chains form a DAG, this
-/// terminates.
+/// need to traverse its uses again. In the presence of use-def cycles in a
+/// graph region, the traversal stops at the first operation that was already
+/// visited (which is not added to the slice anymore).
 ///
 /// Upon return to the root call, `forwardSlice` is filled with a
 /// postorder list of uses (i.e. a reverse topological order). To get a proper
@@ -114,8 +115,9 @@ void getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
 ///
 /// The implementation traverses the def chains in postorder traversal for
 /// efficiency reasons: if an operation is already in `backwardSlice`, no
-/// need to traverse its definitions again. Since useuse-def chains form a DAG,
-/// this terminates.
+/// need to traverse its definitions again. In the presence of use-def cycles
+/// in a graph region, the traversal stops at the first operation that was
+/// already visited (which is not added to the slice anymore).
 ///
 /// Upon return to the root call, `backwardSlice` is filled with a
 /// postorder list of defs. This happens to be a topological order, from the
diff --git a/mlir/include/mlir/AsmParser/AsmParser.h b/mlir/include/mlir/AsmParser/AsmParser.h
index 33daf7ca26f4..f39b3bd853a2 100644
--- a/mlir/include/mlir/AsmParser/AsmParser.h
+++ b/mlir/include/mlir/AsmParser/AsmParser.h
@@ -53,7 +53,8 @@ parseAsmSourceFile(const llvm::SourceMgr &sourceMgr, Block *block,
 /// null terminated.
 Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context,
                          Type type = {}, size_t *numRead = nullptr,
-                         bool isKnownNullTerminated = false);
+                         bool isKnownNullTerminated = false,
+                         llvm::StringMap<Attribute> *attributesCache = nullptr);
 
 /// This parses a single MLIR type to an MLIR context if it was valid. If not,
 /// an error diagnostic is emitted to the context.
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
index 3f1041cb2510..243dbf081b99 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
@@ -62,9 +62,11 @@ void populateAllReduceEndomorphismSimplificationPatterns(
   auto isEndomorphismOp = [reduction](Operation *op,
                                       std::optional<Operation *> referenceOp) {
     auto allReduceOp = llvm::dyn_cast<AllReduceOp>(op);
+    if (!allReduceOp)
+      return false;
     auto inType = cast<ShapedType>(allReduceOp.getInput().getType());
     auto outType = cast<ShapedType>(allReduceOp.getResult().getType());
-    if (!allReduceOp || inType.getElementType() != outType.getElementType() ||
+    if (inType.getElementType() != outType.getElementType() ||
         allReduceOp.getReduction() != reduction) {
       return false;
     }
@@ -87,9 +89,7 @@ void populateAllReduceEndomorphismSimplificationPatterns(
     return refAllReduceOp->getAttrs() == allReduceOp->getAttrs() &&
            inType.getElementType() == refType.getElementType();
   };
-  auto isAlgebraicOp = [](Operation *op) {
-    return static_cast<bool>(llvm::dyn_cast<AlgebraicOp>(op));
-  };
+  auto isAlgebraicOp = [](Operation *op) { return isa<AlgebraicOp>(op); };
 
   using ConcreteEndomorphismSimplification = EndomorphismSimplification<
       std::decay_t<decltype(getEndomorphismOpOperand)>,
diff --git a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
index 6e72f7c23bdc..d66d757cb7a8 100644
--- a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
@@ -151,7 +151,7 @@ class OwningMemRef {
       AllocFunType allocFun = &::malloc,
       std::function<void(StridedMemRefType<T, Rank>)> freeFun =
           [](StridedMemRefType<T, Rank> descriptor) {
-            ::free(descriptor.data);
+            ::free(descriptor.basePtr);
           })
       : freeFunc(freeFun) {
     if (shapeAlloc.empty())
diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h
index 1c2c04e718bf..11af3b7d4d7b 100644
--- a/mlir/include/mlir/IR/Operation.h
+++ b/mlir/include/mlir/IR/Operation.h
@@ -1102,6 +1102,49 @@ inline raw_ostream &operator<<(raw_ostream &os, const Operation &op) {
   return os;
 }
 
+/// A wrapper class that allows for printing an operation with a set of flags,
+/// useful to act as a "stream modifier" to customize printing an operation
+/// with a stream using the operator<< overload, e.g.:
+///   llvm::dbgs() << OpWithFlags(op, OpPrintingFlags().skipRegions());
+class OpWithFlags {
+public:
+  OpWithFlags(Operation *op, OpPrintingFlags flags = {})
+      : op(op), theFlags(flags) {}
+  OpPrintingFlags &flags() { return theFlags; }
+  const OpPrintingFlags &flags() const { return theFlags; }
+
+private:
+  Operation *op;
+  OpPrintingFlags theFlags;
+  friend raw_ostream &operator<<(raw_ostream &os, const OpWithFlags &op);
+};
+
+inline raw_ostream &operator<<(raw_ostream &os,
+                               const OpWithFlags &opWithFlags) {
+  opWithFlags.op->print(os, opWithFlags.flags());
+  return os;
+}
+
+/// A wrapper class that allows for printing an operation with a custom
+/// AsmState, useful to act as a "stream modifier" to customize printing an
+/// operation with a stream using the operator<< overload, e.g.:
+///   llvm::dbgs() << OpWithState(op, asmState);
+class OpWithState {
+public:
+  OpWithState(Operation *op, AsmState &state) : op(op), theState(state) {}
+
+private:
+  Operation *op;
+  AsmState &theState;
+  friend raw_ostream &operator<<(raw_ostream &os, const OpWithState &op);
+};
+
+/// Prints the wrapped operation to `os` using the wrapped AsmState.
+inline raw_ostream &operator<<(raw_ostream &os, const OpWithState &wrapper) {
+  wrapper.op->print(os, wrapper.theState);
+  return os;
+}
+
 } // namespace mlir
 
 namespace llvm {
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index afeb784b85a1..9565eaf4da76 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -311,14 +311,14 @@ struct OpOrInterfaceRewritePatternBase : public RewritePattern {
 /// opposed to a raw Operation.
 template <typename SourceOp>
 struct OpRewritePattern
-    : public detail::OpOrInterfaceRewritePatternBase<SourceOp> {
+    : public mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp> {
 
   /// Patterns must specify the root operation name they match against, and can
   /// also specify the benefit of the pattern matching and a list of generated
   /// ops.
   OpRewritePattern(MLIRContext *context, PatternBenefit benefit = 1,
                    ArrayRef<StringRef> generatedNames = {})
-      : detail::OpOrInterfaceRewritePatternBase<SourceOp>(
+      : mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp>(
             SourceOp::getOperationName(), benefit, context, generatedNames) {}
 };
 
@@ -327,10 +327,10 @@ struct OpRewritePattern
 /// of a raw Operation.
 template <typename SourceOp>
 struct OpInterfaceRewritePattern
-    : public detail::OpOrInterfaceRewritePatternBase<SourceOp> {
+    : public mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp> {
 
   OpInterfaceRewritePattern(MLIRContext *context, PatternBenefit benefit = 1)
-      : detail::OpOrInterfaceRewritePatternBase<SourceOp>(
+      : mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp>(
             Pattern::MatchInterfaceOpTypeTag(), SourceOp::getInterfaceID(),
             benefit, context) {}
 };
diff --git a/mlir/include/mlir/Pass/PassOptions.h b/mlir/include/mlir/Pass/PassOptions.h
index e1f16c6158ad..0c71f78b52d3 100644
--- a/mlir/include/mlir/Pass/PassOptions.h
+++ b/mlir/include/mlir/Pass/PassOptions.h
@@ -377,7 +377,7 @@ class PassOptions : protected llvm::cl::SubCommand {
 ///   ListOption<int> someListFlag{*this, "flag-name", llvm::cl::desc("...")};
 /// };
 template <typename T>
-class PassPipelineOptions : public detail::PassOptions {
+class PassPipelineOptions : public virtual detail::PassOptions {
 public:
   /// Factory that parses the provided options and returns a unique_ptr to the
   /// struct.
diff --git a/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
index 6d62ae3dd43d..7d5c1d5cebb2 100644
--- a/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
+++ b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
@@ -27,6 +27,8 @@ namespace mlir {
 /// This is intended as the simplest and most lightweight pattern rewriter in
 /// cases when a simple walk gets the job done.
 ///
+/// The driver will skip unreachable blocks.
+///
 /// Note: Does not apply patterns to the given operation itself.
 void walkAndApplyPatterns(Operation *op,
                           const FrozenRewritePatternSet &patterns,
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index 6a12fe3acc2c..e1d7498f7be3 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -295,9 +295,45 @@ RunLivenessAnalysis::RunLivenessAnalysis(Operation *op) {
 
   loadBaselineAnalyses(solver);
   solver.load<LivenessAnalysis>(symbolTable);
-  LDBG("Initializing and running solver");
+  LLVM_DEBUG({ llvm::dbgs() << "Initializing and running solver\n"; });
   (void)solver.initializeAndRun(op);
-  LDBG("Dumping liveness state for op");
+  LLVM_DEBUG({
+    llvm::dbgs() << "RunLivenessAnalysis initialized for op: " << op->getName()
+                 << " check on unreachable code now:"
+                 << "\n";
+  });
+  // The framework doesn't visit operations in dead blocks, so we need to
+  // explicitly mark them as dead.
+  op->walk([&](Operation *op) {
+    if (op->getNumResults() == 0)
+      return;
+    for (auto result : llvm::enumerate(op->getResults())) {
+      if (getLiveness(result.value()))
+        continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "Result: " << result.index() << " of "
+                     << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                     << " has no liveness info (unreachable), mark dead"
+                     << "\n";
+      });
+      solver.getOrCreateState<Liveness>(result.value());
+    }
+    for (auto &region : op->getRegions()) {
+      for (auto &block : region) {
+        for (auto blockArg : llvm::enumerate(block.getArguments())) {
+          if (getLiveness(blockArg.value()))
+            continue;
+          LLVM_DEBUG({
+            llvm::dbgs() << "Block argument: " << blockArg.index() << " of "
+                         << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                         << " has no liveness info, mark dead"
+                         << "\n";
+          });
+          solver.getOrCreateState<Liveness>(blockArg.value());
+        }
+      }
+    }
+  });
 }
 
 const Liveness *RunLivenessAnalysis::getLiveness(Value val) {
diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
index e625f626d12f..5e342fd87773 100644
--- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
@@ -19,12 +19,15 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
 #include <cassert>
 #include <optional>
 
 using namespace mlir;
 using namespace mlir::dataflow;
 
+#define DEBUG_TYPE "dataflow"
+
 //===----------------------------------------------------------------------===//
 // AbstractSparseLattice
 //===----------------------------------------------------------------------===//
@@ -64,22 +67,56 @@ AbstractSparseForwardDataFlowAnalysis::initialize(Operation *top) {
 
 LogicalResult
 AbstractSparseForwardDataFlowAnalysis::initializeRecursively(Operation *op) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Initializing recursively for operation: " << op->getName()
+                 << "\n";
+  });
+
   // Initialize the analysis by visiting every owner of an SSA value (all
   // operations and blocks).
-  if (failed(visitOperation(op)))
+  if (failed(visitOperation(op))) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Failed to visit operation: " << op->getName() << "\n";
+    });
     return failure();
+  }
 
   for (Region &region : op->getRegions()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing region with " << region.getBlocks().size()
+                   << " blocks"
+                   << "\n";
+    });
     for (Block &block : region) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Processing block with " << block.getNumArguments()
+                     << " arguments"
+                     << "\n";
+      });
       getOrCreate<Executable>(getProgramPointBefore(&block))
           ->blockContentSubscribe(this);
       visitBlock(&block);
-      for (Operation &op : block)
-        if (failed(initializeRecursively(&op)))
+      for (Operation &op : block) {
+        LLVM_DEBUG({
+          llvm::dbgs() << "Recursively initializing nested operation: "
+                       << op.getName() << "\n";
+        });
+        if (failed(initializeRecursively(&op))) {
+          LLVM_DEBUG({
+            llvm::dbgs() << "Failed to initialize nested operation: "
+                         << op.getName() << "\n";
+          });
           return failure();
+        }
+      }
     }
   }
 
+  LLVM_DEBUG({
+    llvm::dbgs()
+        << "Successfully completed recursive initialization for operation: "
+        << op->getName() << "\n";
+  });
   return success();
 }
 
@@ -409,11 +446,29 @@ static MutableArrayRef<OpOperand> operandsToOpOperands(OperandRange &operands) {
 
 LogicalResult
 AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Visiting operation: " << op->getName() << " with "
+                 << op->getNumOperands() << " operands and "
+                 << op->getNumResults() << " results"
+                 << "\n";
+  });
+
   // If we're in a dead block, bail out.
   if (op->getBlock() != nullptr &&
-      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))->isLive())
+      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))
+           ->isLive()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Operation is in dead block, bailing out"
+                   << "\n";
+    });
     return success();
+  }
 
+  LLVM_DEBUG({
+    llvm::dbgs() << "Creating lattice elements for " << op->getNumOperands()
+                 << " operands and " << op->getNumResults() << " results"
+                 << "\n";
+  });
   SmallVector<AbstractSparseLattice *> operandLattices =
       getLatticeElements(op->getOperands());
   SmallVector<const AbstractSparseLattice *> resultLattices =
@@ -422,11 +477,21 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // Block arguments of region branch operations flow back into the operands
   // of the parent op
   if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing RegionBranchOpInterface operation"
+                   << "\n";
+    });
     visitRegionSuccessors(branch, operandLattices);
     return success();
   }
 
   if (auto branch = dyn_cast<BranchOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing BranchOpInterface operation with "
+                   << op->getNumSuccessors() << " successors"
+                   << "\n";
+    });
+
     // Block arguments of successor blocks flow back into our operands.
 
     // We remember all operands not forwarded to any block in a BitVector.
@@ -463,6 +528,10 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // For function calls, connect the arguments of the entry blocks to the
   // operands of the call op that are forwarded to these arguments.
   if (auto call = dyn_cast<CallOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing CallOpInterface operation"
+                   << "\n";
+    });
     Operation *callableOp = call.resolveCallableInTable(&symbolTable);
     if (auto callable = dyn_cast_or_null<CallableOpInterface>(callableOp)) {
       // Not all operands of a call op forward to arguments. Such operands are
@@ -513,6 +582,10 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // of this op itself and the operands of the terminators of the regions of
   // this op.
   if (auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing RegionBranchTerminatorOpInterface operation"
+                   << "\n";
+    });
     if (auto branch = dyn_cast<RegionBranchOpInterface>(op->getParentOp())) {
       visitRegionSuccessorsFromTerminator(terminator, branch);
       return success();
@@ -520,12 +593,25 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   }
 
   if (op->hasTrait<OpTrait::ReturnLike>()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing ReturnLike operation"
+                   << "\n";
+    });
     // Going backwards, the operands of the return are derived from the
     // results of all CallOps calling this CallableOp.
-    if (auto callable = dyn_cast<CallableOpInterface>(op->getParentOp()))
+    if (auto callable = dyn_cast<CallableOpInterface>(op->getParentOp())) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Callable parent found, visiting callable operation"
+                     << "\n";
+      });
       return visitCallableOperation(op, callable, operandLattices);
+    }
   }
 
+  LLVM_DEBUG({
+    llvm::dbgs() << "Using default visitOperationImpl for operation: "
+                 << op->getName() << "\n";
+  });
   return visitOperationImpl(op, operandLattices, resultLattices);
 }
 
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index 36a9812bd797..991c71e3f689 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -26,7 +26,8 @@
 using namespace mlir;
 
 static void
-getForwardSliceImpl(Operation *op, SetVector<Operation *> *forwardSlice,
+getForwardSliceImpl(Operation *op, DenseSet<Operation *> &visited,
+                    SetVector<Operation *> *forwardSlice,
                     const SliceOptions::TransitiveFilter &filter = nullptr) {
   if (!op)
     return;
@@ -40,20 +41,41 @@ getForwardSliceImpl(Operation *op, SetVector<Operation *> *forwardSlice,
   for (Region &region : op->getRegions())
     for (Block &block : region)
       for (Operation &blockOp : block)
-        if (forwardSlice->count(&blockOp) == 0)
-          getForwardSliceImpl(&blockOp, forwardSlice, filter);
-  for (Value result : op->getResults()) {
-    for (Operation *userOp : result.getUsers())
-      if (forwardSlice->count(userOp) == 0)
-        getForwardSliceImpl(userOp, forwardSlice, filter);
-  }
+        if (forwardSlice->count(&blockOp) == 0) {
+          // We don't have to check if the 'blockOp' is already visited because
+          // there cannot be a traversal path from this nested op to the parent
+          // and thus a cycle cannot be closed here. We still have to mark it
+          // as visited to stop before visiting this operation again if it is
+          // part of a cycle.
+          visited.insert(&blockOp);
+          getForwardSliceImpl(&blockOp, visited, forwardSlice, filter);
+          visited.erase(&blockOp);
+        }
+
+  for (Value result : op->getResults())
+    for (Operation *userOp : result.getUsers()) {
+      // A cycle can only occur within a basic block (not across regions or
+      // basic blocks) because the parent region must be a graph region, graph
+      // regions are restricted to always have 0 or 1 blocks, and there cannot
+      // be a def-use edge from a nested operation to an operation in an
+      // ancestor region. Therefore, we don't have to but may use the same
+      // 'visited' set across regions/blocks as long as we remove operations
+      // from the set again when the DFS traverses back from the leaf to the
+      // root.
+      if (forwardSlice->count(userOp) == 0 && visited.insert(userOp).second)
+        getForwardSliceImpl(userOp, visited, forwardSlice, filter);
+
+      visited.erase(userOp);
+    }
 
   forwardSlice->insert(op);
 }
 
 void mlir::getForwardSlice(Operation *op, SetVector<Operation *> *forwardSlice,
                            const ForwardSliceOptions &options) {
-  getForwardSliceImpl(op, forwardSlice, options.filter);
+  DenseSet<Operation *> visited;
+  visited.insert(op);
+  getForwardSliceImpl(op, visited, forwardSlice, options.filter);
   if (!options.inclusive) {
     // Don't insert the top level operation, we just queried on it and don't
     // want it in the results.
@@ -69,8 +91,12 @@ void mlir::getForwardSlice(Operation *op, SetVector<Operation *> *forwardSlice,
 
 void mlir::getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
                            const SliceOptions &options) {
-  for (Operation *user : root.getUsers())
-    getForwardSliceImpl(user, forwardSlice, options.filter);
+  DenseSet<Operation *> visited;
+  for (Operation *user : root.getUsers()) {
+    visited.insert(user);
+    getForwardSliceImpl(user, visited, forwardSlice, options.filter);
+    visited.erase(user);
+  }
 
   // Reverse to get back the actual topological order.
   // std::reverse does not work out of the box on SetVector and I want an
@@ -80,6 +106,7 @@ void mlir::getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
 }
 
 static LogicalResult getBackwardSliceImpl(Operation *op,
+                                          DenseSet<Operation *> &visited,
                                           SetVector<Operation *> *backwardSlice,
                                           const BackwardSliceOptions &options) {
   if (!op || op->hasTrait<OpTrait::IsIsolatedFromAbove>())
@@ -93,8 +120,12 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
 
   auto processValue = [&](Value value) {
     if (auto *definingOp = value.getDefiningOp()) {
-      if (backwardSlice->count(definingOp) == 0)
-        return getBackwardSliceImpl(definingOp, backwardSlice, options);
+      if (backwardSlice->count(definingOp) == 0 &&
+          visited.insert(definingOp).second)
+        return getBackwardSliceImpl(definingOp, visited, backwardSlice,
+                                    options);
+
+      visited.erase(definingOp);
     } else if (auto blockArg = dyn_cast<BlockArgument>(value)) {
       if (options.omitBlockArguments)
         return success();
@@ -107,7 +138,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
       if (parentOp && backwardSlice->count(parentOp) == 0) {
         if (parentOp->getNumRegions() == 1 &&
             llvm::hasSingleElement(parentOp->getRegion(0).getBlocks())) {
-          return getBackwardSliceImpl(parentOp, backwardSlice, options);
+          return getBackwardSliceImpl(parentOp, visited, backwardSlice,
+                                      options);
         }
       }
     } else {
@@ -145,7 +177,10 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
 LogicalResult mlir::getBackwardSlice(Operation *op,
                                      SetVector<Operation *> *backwardSlice,
                                      const BackwardSliceOptions &options) {
-  LogicalResult result = getBackwardSliceImpl(op, backwardSlice, options);
+  DenseSet<Operation *> visited;
+  visited.insert(op);
+  LogicalResult result =
+      getBackwardSliceImpl(op, visited, backwardSlice, options);
 
   if (!options.inclusive) {
     // Don't insert the top level operation, we just queried on it and don't
diff --git a/mlir/lib/AsmParser/DialectSymbolParser.cpp b/mlir/lib/AsmParser/DialectSymbolParser.cpp
index 9f4a87a6a02d..416d8eb5f40e 100644
--- a/mlir/lib/AsmParser/DialectSymbolParser.cpp
+++ b/mlir/lib/AsmParser/DialectSymbolParser.cpp
@@ -89,6 +89,7 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
     nestedPunctuation.pop_back();
     return success();
   };
+  const char *curBufferEnd = state.lex.getBufferEnd();
   do {
     // Handle code completions, which may appear in the middle of the symbol
     // body.
@@ -98,6 +99,12 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
       break;
     }
 
+    if (curBufferEnd == curPtr) {
+      if (!nestedPunctuation.empty())
+        return emitPunctError();
+      return emitError("unexpected nul or EOF in pretty dialect name");
+    }
+
     char c = *curPtr++;
     switch (c) {
     case '\0':
@@ -238,6 +245,15 @@ static Symbol parseExtendedSymbol(Parser &p, AsmParserState *asmState,
       return nullptr;
   }
 
+  if constexpr (std::is_same_v<Symbol, Attribute>) {
+    auto &cache = p.getState().symbols.attributesCache;
+    auto cacheIt = cache.find(symbolData);
+    // Only reuse the cached attribute when no trailing `: type` follows.
+    if (cacheIt != cache.end() && !p.getToken().is(Token::colon))
+      return cacheIt->second;
+
+    return cache[symbolData] = createSymbol(dialectName, symbolData, loc);
+  }
   return createSymbol(dialectName, symbolData, loc);
 }
 
@@ -330,6 +346,7 @@ Type Parser::parseExtendedType() {
 template <typename T, typename ParserFn>
 static T parseSymbol(StringRef inputStr, MLIRContext *context,
                      size_t *numReadOut, bool isKnownNullTerminated,
+                     llvm::StringMap<Attribute> *attributesCache,
                      ParserFn &&parserFn) {
   // Set the buffer name to the string being parsed, so that it appears in error
   // diagnostics.
@@ -341,6 +358,9 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
   SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(memBuffer), SMLoc());
   SymbolState aliasState;
+  if (attributesCache)
+    aliasState.attributesCache = *attributesCache;
+
   ParserConfig config(context);
   ParserState state(sourceMgr, config, aliasState, /*asmState=*/nullptr,
                     /*codeCompleteContext=*/nullptr);
@@ -351,6 +371,11 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
   if (!symbol)
     return T();
 
+  if constexpr (std::is_same_v<T, Attribute>) {
+    if (attributesCache)
+      *attributesCache = state.symbols.attributesCache;
+  }
+
   // Provide the number of bytes that were read.
   Token endTok = parser.getToken();
   size_t numRead =
@@ -367,13 +392,15 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
 
 Attribute mlir::parseAttribute(StringRef attrStr, MLIRContext *context,
                                Type type, size_t *numRead,
-                               bool isKnownNullTerminated) {
+                               bool isKnownNullTerminated,
+                               llvm::StringMap<Attribute> *attributesCache) {
   return parseSymbol<Attribute>(
-      attrStr, context, numRead, isKnownNullTerminated,
+      attrStr, context, numRead, isKnownNullTerminated, attributesCache,
       [type](Parser &parser) { return parser.parseAttribute(type); });
 }
 Type mlir::parseType(StringRef typeStr, MLIRContext *context, size_t *numRead,
                      bool isKnownNullTerminated) {
   return parseSymbol<Type>(typeStr, context, numRead, isKnownNullTerminated,
+                           /*attributesCache=*/nullptr,
                            [](Parser &parser) { return parser.parseType(); });
 }
diff --git a/mlir/lib/AsmParser/Lexer.cpp b/mlir/lib/AsmParser/Lexer.cpp
index 751bd63e537f..8f53529823e2 100644
--- a/mlir/lib/AsmParser/Lexer.cpp
+++ b/mlir/lib/AsmParser/Lexer.cpp
@@ -37,6 +37,18 @@ Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
              AsmParserCodeCompleteContext *codeCompleteContext)
     : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
   auto bufferID = sourceMgr.getMainFileID();
+
+  // Check to see if the main buffer contains the last buffer, and if so the
+  // last buffer should be used as main file for parsing.
+  if (sourceMgr.getNumBuffers() > 1) {
+    unsigned lastFileID = sourceMgr.getNumBuffers();
+    const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
+    const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
+    if (main->getBufferStart() <= last->getBufferStart() &&
+        main->getBufferEnd() >= last->getBufferEnd()) {
+      bufferID = lastFileID;
+    }
+  }
   curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
   curPtr = curBuffer.begin();
 
@@ -71,6 +83,7 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }
 
 Token Lexer::lexToken() {
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     const char *tokStart = curPtr;
 
@@ -78,6 +91,9 @@ Token Lexer::lexToken() {
     if (tokStart == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);
 
+    if (tokStart == curBufferEnd)
+      return formToken(Token::eof, tokStart);
+
     // Lex the next token.
     switch (*curPtr++) {
     default:
@@ -102,7 +118,7 @@ Token Lexer::lexToken() {
     case 0:
       // This may either be a nul character in the source file or may be the EOF
       // marker that llvm::MemoryBuffer guarantees will be there.
-      if (curPtr - 1 == curBuffer.end())
+      if (curPtr - 1 == curBufferEnd)
         return formToken(Token::eof, tokStart);
       continue;
 
@@ -259,7 +275,11 @@ void Lexer::skipComment() {
   assert(*curPtr == '/');
   ++curPtr;
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
+    if (curPtr == curBufferEnd)
+      return;
+
     switch (*curPtr++) {
     case '\n':
     case '\r':
@@ -267,7 +287,7 @@ void Lexer::skipComment() {
       return;
     case 0:
       // If this is the end of the buffer, end the comment.
-      if (curPtr - 1 == curBuffer.end()) {
+      if (curPtr - 1 == curBufferEnd) {
         --curPtr;
         return;
       }
@@ -405,6 +425,7 @@ Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
 Token Lexer::lexString(const char *tokStart) {
   assert(curPtr[-1] == '"');
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     // Check to see if there is a code completion location within the string. In
     // these cases we generate a completion location and place the currently
@@ -419,7 +440,7 @@ Token Lexer::lexString(const char *tokStart) {
     case 0:
       // If this is a random nul character in the middle of a string, just
       // include it.  If it is the end of file, then it is an error.
-      if (curPtr - 1 != curBuffer.end())
+      if (curPtr - 1 != curBufferEnd)
         continue;
       [[fallthrough]];
     case '\n':
diff --git a/mlir/lib/AsmParser/Lexer.h b/mlir/lib/AsmParser/Lexer.h
index 4085a9b73854..670444eb1f5b 100644
--- a/mlir/lib/AsmParser/Lexer.h
+++ b/mlir/lib/AsmParser/Lexer.h
@@ -40,6 +40,9 @@ class Lexer {
   /// Returns the start of the buffer.
   const char *getBufferBegin() { return curBuffer.data(); }
 
+  /// Returns the end of the buffer.
+  const char *getBufferEnd() { return curBuffer.end(); }
+
   /// Return the code completion location of the lexer, or nullptr if there is
   /// none.
   const char *getCodeCompleteLoc() const { return codeCompleteLoc; }
diff --git a/mlir/lib/AsmParser/ParserState.h b/mlir/lib/AsmParser/ParserState.h
index 159058a18fa4..aa53032107cb 100644
--- a/mlir/lib/AsmParser/ParserState.h
+++ b/mlir/lib/AsmParser/ParserState.h
@@ -40,6 +40,9 @@ struct SymbolState {
 
   /// A map from unique integer identifier to DistinctAttr.
   DenseMap<uint64_t, DistinctAttr> distinctAttributes;
+
+  /// A map from unique string identifier to Attribute.
+  llvm::StringMap<Attribute> attributesCache;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index 44458d010c6c..0f9744343377 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -895,6 +895,10 @@ class AttrTypeReader {
   SmallVector<AttrEntry> attributes;
   SmallVector<TypeEntry> types;
 
+  /// The map of cached attributes, used to avoid re-parsing the same
+  /// attribute multiple times.
+  llvm::StringMap<Attribute> attributesCache;
+
   /// A location used for error emission.
   Location fileLoc;
 
@@ -1235,7 +1239,7 @@ LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
         ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
   else
     result = ::parseAttribute(asmStr, context, Type(), &numRead,
-                              /*isKnownNullTerminated=*/true);
+                              /*isKnownNullTerminated=*/true, &attributesCache);
   if (!result)
     return failure();
 
diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
index 0df91a243d07..523dc463a0da 100644
--- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
+++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
@@ -582,6 +582,7 @@ LogicalResult WhileLowering::matchAndRewrite(WhileOp whileOp,
   // block. This should be reconsidered if we allow break/continue in SCF.
   rewriter.setInsertionPointToEnd(before);
   auto condOp = cast<ConditionOp>(before->getTerminator());
+  SmallVector<Value> args = llvm::to_vector(condOp.getArgs());
   rewriter.replaceOpWithNewOp<cf::CondBranchOp>(condOp, condOp.getCondition(),
                                                 after, condOp.getArgs(),
                                                 continuation, ValueRange());
@@ -593,7 +594,7 @@ LogicalResult WhileLowering::matchAndRewrite(WhileOp whileOp,
 
   // Replace the op with values "yielded" from the "before" region, which are
   // visible by dominance.
-  rewriter.replaceOp(whileOp, condOp.getArgs());
+  rewriter.replaceOp(whileOp, args);
 
   return success();
 }
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 00c31a1500e1..dbea050b554e 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -4273,14 +4273,15 @@ LogicalResult scf::IndexSwitchOp::verify() {
              << "see yield operation here";
     }
     for (auto [idx, result, operand] :
-         llvm::zip(llvm::seq<unsigned>(0, getNumResults()), getResultTypes(),
-                   yield.getOperandTypes())) {
-      if (result == operand)
+         llvm::enumerate(getResultTypes(), yield.getOperands())) {
+      if (!operand)
+        return yield.emitOpError() << "operand " << idx << " is null\n";
+      if (result == operand.getType())
         continue;
       return (emitOpError("expected result #")
               << idx << " of each region to be " << result)
                  .attachNote(yield.getLoc())
-             << name << " returns " << operand << " here";
+             << name << " returns " << operand.getType() << " here";
     }
     return success();
   };
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 1cded38c4419..36059e553d30 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -272,7 +272,7 @@ std::optional<int64_t> constantTripCount(OpFoldResult lb, OpFoldResult ub,
   if (!ubConstant)
     return std::nullopt;
   std::optional<int64_t> stepConstant = getConstantIntValue(step);
-  if (!stepConstant)
+  if (!stepConstant || *stepConstant == 0)
     return std::nullopt;
 
   return llvm::divideCeilSigned(*ubConstant - *lbConstant, *stepConstant);
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 8a08a157b25d..7d615bfc1298 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4237,28 +4237,35 @@ class StridedSliceBroadcast final
     auto dstVecType = llvm::cast<VectorType>(op.getType());
     unsigned dstRank = dstVecType.getRank();
     unsigned rankDiff = dstRank - srcRank;
-    // Check if the most inner dimensions of the source of the broadcast are the
-    // same as the destination of the extract. If this is the case we can just
-    // use a broadcast as the original dimensions are untouched.
-    bool lowerDimMatch = true;
+    // Source dimensions can be broadcasted (1 -> n with n > 1) or sliced
+    // (n -> m with n > m). If they are originally both broadcasted *and*
+    // sliced, this can be simplified to just broadcasting.
+    bool needsSlice = false;
     for (unsigned i = 0; i < srcRank; i++) {
-      if (srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) {
-        lowerDimMatch = false;
+      if (srcVecType.getDimSize(i) != 1 &&
+          srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) {
+        needsSlice = true;
         break;
       }
     }
     Value source = broadcast.getSource();
-    // If the inner dimensions don't match, it means we need to extract from the
-    // source of the orignal broadcast and then broadcast the extracted value.
-    // We also need to handle degenerated cases where the source is effectively
-    // just a single scalar.
-    bool isScalarSrc = (srcRank == 0 || srcVecType.getNumElements() == 1);
-    if (!lowerDimMatch && !isScalarSrc) {
+    if (needsSlice) {
+      SmallVector<int64_t> offsets =
+          getI64SubArray(op.getOffsets(), /*dropFront=*/rankDiff);
+      SmallVector<int64_t> sizes =
+          getI64SubArray(op.getSizes(), /*dropFront=*/rankDiff);
+      for (unsigned i = 0; i < srcRank; i++) {
+        if (srcVecType.getDimSize(i) == 1) {
+          // In case this dimension was broadcasted *and* sliced, the offset
+          // and size need to be updated now that there is no broadcast before
+          // the slice.
+          offsets[i] = 0;
+          sizes[i] = 1;
+        }
+      }
       source = rewriter.create<ExtractStridedSliceOp>(
-          op->getLoc(), source,
-          getI64SubArray(op.getOffsets(), /* dropFront=*/rankDiff),
-          getI64SubArray(op.getSizes(), /* dropFront=*/rankDiff),
-          getI64SubArray(op.getStrides(), /* dropFront=*/rankDiff));
+          op->getLoc(), source, offsets, sizes,
+          getI64SubArray(op.getStrides(), /*dropFront=*/rankDiff));
     }
     rewriter.replaceOpWithNewOp<BroadcastOp>(op, op.getType(), source);
     return success();
diff --git a/mlir/lib/IR/AttributeDetail.h b/mlir/lib/IR/AttributeDetail.h
index 26d40ac3a38f..cb9d21bf3e61 100644
--- a/mlir/lib/IR/AttributeDetail.h
+++ b/mlir/lib/IR/AttributeDetail.h
@@ -19,11 +19,9 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Support/StorageUniquer.h"
-#include "mlir/Support/ThreadLocalCache.h"
 #include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/Allocator.h"
+#include <mutex>
 
 namespace mlir {
 namespace detail {
@@ -396,27 +394,30 @@ class DistinctAttributeUniquer {
                                               Attribute referencedAttr);
 };
 
-/// An allocator for distinct attribute storage instances. It uses thread local
-/// bump pointer allocators stored in a thread local cache to ensure the storage
-/// is freed after the destruction of the distinct attribute allocator.
-class DistinctAttributeAllocator {
+/// An allocator for distinct attribute storage instances. Uses a synchronized
+/// BumpPtrAllocator to ensure thread-safety. The allocated storage is deleted
+/// when the DistinctAttributeAllocator is destroyed.
+class DistinctAttributeAllocator final {
 public:
   DistinctAttributeAllocator() = default;
-
   DistinctAttributeAllocator(DistinctAttributeAllocator &&) = delete;
   DistinctAttributeAllocator(const DistinctAttributeAllocator &) = delete;
   DistinctAttributeAllocator &
   operator=(const DistinctAttributeAllocator &) = delete;
 
-  /// Allocates a distinct attribute storage using a thread local bump pointer
-  /// allocator to enable synchronization free parallel allocations.
   DistinctAttrStorage *allocate(Attribute referencedAttr) {
-    return new (allocatorCache.get().Allocate<DistinctAttrStorage>())
+    std::scoped_lock<std::mutex> guard(allocatorMutex);
+    return new (allocator.Allocate<DistinctAttrStorage>())
         DistinctAttrStorage(referencedAttr);
-  }
+  };
 
 private:
-  ThreadLocalCache<llvm::BumpPtrAllocator> allocatorCache;
+  /// Used to allocate distinct attribute storage instances. The managed memory
+  /// is freed automatically when the allocator instance is destroyed.
+  llvm::BumpPtrAllocator allocator;
+
+  /// Used to lock access to the allocator.
+  std::mutex allocatorMutex;
 };
 } // namespace detail
 } // namespace mlir
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 716d9c85a377..2d5381d43f86 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/RWMutex.h"
 #include "llvm/Support/ThreadPool.h"
@@ -883,6 +884,8 @@ int OperationName::UnregisteredOpModel::getOpPropertyByteSize() {
 void OperationName::UnregisteredOpModel::initProperties(
     OperationName opName, OpaqueProperties storage, OpaqueProperties init) {
   new (storage.as<Attribute *>()) Attribute();
+  if (init)
+    *storage.as<Attribute *>() = *init.as<Attribute *>();
 }
 void OperationName::UnregisteredOpModel::deleteProperties(
     OpaqueProperties prop) {
diff --git a/mlir/lib/Pass/PassCrashRecovery.cpp b/mlir/lib/Pass/PassCrashRecovery.cpp
index b048ff946239..c6fb1d737d50 100644
--- a/mlir/lib/Pass/PassCrashRecovery.cpp
+++ b/mlir/lib/Pass/PassCrashRecovery.cpp
@@ -414,14 +414,19 @@ struct FileReproducerStream : public mlir::ReproducerStream {
 
 LogicalResult PassManager::runWithCrashRecovery(Operation *op,
                                                 AnalysisManager am) {
+  const bool threadingEnabled = getContext()->isMultithreadingEnabled();
   crashReproGenerator->initialize(getPasses(), op, verifyPasses);
 
   // Safely invoke the passes within a recovery context.
   LogicalResult passManagerResult = failure();
   llvm::CrashRecoveryContext recoveryContext;
-  recoveryContext.RunSafelyOnThread(
-      [&] { passManagerResult = runPasses(op, am); });
+  const auto runPassesFn = [&] { passManagerResult = runPasses(op, am); };
+  if (threadingEnabled)
+    recoveryContext.RunSafelyOnThread(runPassesFn);
+  else
+    recoveryContext.RunSafely(runPassesFn);
   crashReproGenerator->finalize(op, passManagerResult);
+
   return passManagerResult;
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3185f28fe668..f8ea6ee07447 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3395,6 +3395,7 @@ static llvm::omp::Directive convertCancellationConstructType(
   case omp::ClauseCancellationConstructType::Taskgroup:
     return llvm::omp::Directive::OMPD_taskgroup;
   }
+  llvm_unreachable("Unhandled cancellation construct type");
 }
 
 static LogicalResult
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 31e0caa76811..9f2a5c761b5c 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -508,8 +508,7 @@ performActions(raw_ostream &os,
            << "bytecode version while not emitting bytecode";
   AsmState asmState(op.get(), OpPrintingFlags(), /*locationMap=*/nullptr,
                     &fallbackResourceMap);
-  op.get()->print(os, asmState);
-  os << '\n';
+  os << OpWithState(op.get(), asmState) << '\n';
   return success();
 }
 
diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 608bdcb94817..1d7e2135e23e 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -258,16 +258,22 @@ static SmallVector<OpOperand *> operandsToOpOperands(OperandRange operands) {
 static void processSimpleOp(Operation *op, RunLivenessAnalysis &la,
                             DenseSet<Value> &nonLiveSet,
                             RDVFinalCleanupList &cl) {
-  LDBG("Processing simple op: " << *op);
   if (!isMemoryEffectFree(op) || hasLive(op->getResults(), nonLiveSet, la)) {
-    LDBG("Simple op is not memory effect free or has live results, skipping: "
-         << *op);
+    LLVM_DEBUG({
+      llvm::dbgs()
+          << "Simple op is not memory effect free or has live results, "
+             "preserving it: "
+          << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+    });
     return;
   }
 
-  LDBG("Simple op has all dead results and is memory effect free, scheduling "
-       "for removal: "
-       << *op);
+  LLVM_DEBUG({
+    llvm::dbgs() << "Simple op has all dead results and is memory effect free, "
+                    "scheduling "
+                    "for removal: "
+                 << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+  });
   cl.operations.push_back(op);
   collectNonLiveValues(nonLiveSet, op->getResults(),
                        BitVector(op->getNumResults(), true));
@@ -345,8 +351,6 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module,
   // since it forwards only to non-live value(s) (%1#1).
   Operation *lastReturnOp = funcOp.back().getTerminator();
   size_t numReturns = lastReturnOp->getNumOperands();
-  if (numReturns == 0)
-    return;
   BitVector nonLiveRets(numReturns, true);
   for (SymbolTable::SymbolUse use : uses) {
     Operation *callOp = use.getUser();
@@ -368,6 +372,8 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module,
   cl.functions.push_back({funcOp, nonLiveArgs, nonLiveRets});
 
   // Do (5) and (6).
+  if (numReturns == 0)
+    return;
   for (SymbolTable::SymbolUse use : uses) {
     Operation *callOp = use.getUser();
     assert(isa<CallOpInterface>(callOp) && "expected a call-like user");
@@ -727,19 +733,53 @@ static void processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la,
 /// Removes dead values collected in RDVFinalCleanupList.
 /// To be run once when all dead values have been collected.
 static void cleanUpDeadVals(RDVFinalCleanupList &list) {
+  LLVM_DEBUG({ llvm::dbgs() << "Starting cleanup of dead values...\n"; });
+
   // 1. Operations
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.operations.size() << " operations"
+                 << "\n";
+  });
   for (auto &op : list.operations) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing operation: "
+                   << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+    });
     op->dropAllUses();
     op->erase();
   }
 
   // 2. Values
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.values.size() << " values"
+                 << "\n";
+  });
   for (auto &v : list.values) {
+    LLVM_DEBUG(
+        { llvm::dbgs() << "Dropping all uses of value: " << v << "\n"; });
     v.dropAllUses();
   }
 
   // 3. Functions
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.functions.size() << " functions"
+                 << "\n";
+  });
   for (auto &f : list.functions) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Cleaning up function: "
+                   << f.funcOp.getOperation()->getName() << "\n";
+    });
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Erasing " << f.nonLiveArgs.count()
+                   << " non-live arguments"
+                   << "\n";
+    });
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Erasing " << f.nonLiveRets.count()
+                   << " non-live return values"
+                   << "\n";
+    });
     // Some functions may not allow erasing arguments or results. These calls
     // return failure in such cases without modifying the function, so it's okay
     // to proceed.
@@ -748,44 +788,99 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
   }
 
   // 4. Operands
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.operands.size() << " operand lists"
+                 << "\n";
+  });
   for (OperationToCleanup &o : list.operands) {
-    if (o.op->getNumOperands() > 0)
+    if (o.op->getNumOperands() > 0) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Erasing " << o.nonLive.count()
+                     << " non-live operands from operation: "
+                     << OpWithFlags(o.op, OpPrintingFlags().skipRegions())
+                     << "\n";
+      });
       o.op->eraseOperands(o.nonLive);
+    }
   }
 
   // 5. Results
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.results.size() << " result lists"
+                 << "\n";
+  });
   for (auto &r : list.results) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << r.nonLive.count()
+                   << " non-live results from operation: "
+                   << OpWithFlags(r.op, OpPrintingFlags().skipRegions())
+                   << "\n";
+    });
     dropUsesAndEraseResults(r.op, r.nonLive);
   }
 
   // 6. Blocks
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.blocks.size()
+                 << " block argument lists"
+                 << "\n";
+  });
   for (auto &b : list.blocks) {
     // blocks that are accessed via multiple codepaths processed once
     if (b.b->getNumArguments() != b.nonLiveArgs.size())
       continue;
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << b.nonLiveArgs.count()
+                   << " non-live arguments from block: " << b.b << "\n";
+    });
     // it iterates backwards because erase invalidates all successor indexes
     for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
       if (!b.nonLiveArgs[i])
         continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "  Erasing block argument " << i << ": "
+                     << b.b->getArgument(i) << "\n";
+      });
       b.b->getArgument(i).dropAllUses();
       b.b->eraseArgument(i);
     }
   }
 
   // 7. Successor Operands
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.successorOperands.size()
+                 << " successor operand lists"
+                 << "\n";
+  });
   for (auto &op : list.successorOperands) {
     SuccessorOperands successorOperands =
         op.branch.getSuccessorOperands(op.successorIndex);
     // blocks that are accessed via multiple codepaths processed once
     if (successorOperands.size() != op.nonLiveOperands.size())
       continue;
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << op.nonLiveOperands.count()
+                   << " non-live successor operands from successor "
+                   << op.successorIndex << " of branch: "
+                   << OpWithFlags(op.branch, OpPrintingFlags().skipRegions())
+                   << "\n";
+    });
     // it iterates backwards because erase invalidates all successor indexes
     for (int i = successorOperands.size() - 1; i >= 0; --i) {
       if (!op.nonLiveOperands[i])
         continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "  Erasing successor operand " << i << ": "
+                     << successorOperands[i] << "\n";
+      });
       successorOperands.erase(i);
     }
   }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "Finished cleanup of dead values"
+                 << "\n";
+  });
 }
 
 struct RemoveDeadValues : public impl::RemoveDeadValuesBase<RemoveDeadValues> {
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index b82d85041394..0a2a0cc1d5c7 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -22,6 +22,7 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
@@ -870,7 +871,18 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
 
     ctx->executeAction<GreedyPatternRewriteIteration>(
         [&] {
-          continueRewrites = processWorklist();
+          continueRewrites = false;
+
+          // Erase unreachable blocks
+          // Operations like:
+          //   %add = arith.addi %add, %add : i64
+          // are legal in unreachable code. Unfortunately many patterns would be
+          // unsafe to apply on such IR and can lead to crashes or infinite
+          // loops.
+          continueRewrites |=
+              succeeded(eraseUnreachableBlocks(rewriter, region));
+
+          continueRewrites |= processWorklist();
 
           // After applying patterns, make sure that the CFG of each of the
           // regions is kept up to date.
diff --git a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
index ee5c642c943c..baa76b9aab4e 100644
--- a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
@@ -13,11 +13,13 @@
 #include "mlir/Transforms/WalkPatternRewriteDriver.h"
 
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Rewrite/PatternApplicator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -25,6 +27,26 @@
 
 namespace mlir {
 
+// Collects every block of `region` that is reachable from its entry block
+// into `reachableBlocks`. Assumes `region` is non-empty (calls front()).
+static void findReachableBlocks(Region &region,
+                                DenseSet<Block *> &reachableBlocks) {
+  Block *entryBlock = &region.front();
+  reachableBlocks.insert(entryBlock);
+  // Walk the CFG from the entry block, following terminator successors.
+  SmallVector<Block *> worklist({entryBlock});
+  while (!worklist.empty()) {
+    Block *block = worklist.pop_back_val();
+    Operation *terminator = &block->back();
+    for (Block *successor : terminator->getSuccessors()) {
+      if (reachableBlocks.contains(successor))
+        continue;
+      worklist.push_back(successor);
+      reachableBlocks.insert(successor);
+    }
+  }
+}
+
 namespace {
 struct WalkAndApplyPatternsAction final
     : tracing::ActionImpl<WalkAndApplyPatternsAction> {
@@ -88,20 +110,112 @@ void walkAndApplyPatterns(Operation *op,
   PatternApplicator applicator(patterns);
   applicator.applyDefaultCostModel();
 
+  // Iterator over all reachable operations in the region.
+  // Also keep track if we visited the nested regions of the current op
+  // already to drive the post-order traversal.
+  struct RegionReachableOpIterator {
+    RegionReachableOpIterator(Region *region) : region(region) {
+      regionIt = region->begin();
+      if (regionIt != region->end())
+        blockIt = regionIt->begin();
+      if (!llvm::hasSingleElement(*region))
+        findReachableBlocks(*region, reachableBlocks);
+    }
+    // Advance the iterator to the next reachable operation.
+    void advance() {
+      assert(regionIt != region->end());
+      hasVisitedRegions = false;
+      if (blockIt == regionIt->end()) {
+        ++regionIt;
+        while (regionIt != region->end() &&
+               !reachableBlocks.contains(&*regionIt))
+          ++regionIt;
+        if (regionIt != region->end())
+          blockIt = regionIt->begin();
+        return;
+      }
+      ++blockIt;
+      if (blockIt != regionIt->end()) {
+        LLVM_DEBUG({
+          llvm::dbgs() << "Incrementing block iterator, next op: "
+                       << OpWithFlags(&*blockIt,
+                                      OpPrintingFlags().skipRegions())
+                       << "\n";
+        });
+      }
+    }
+    // The region we're iterating over.
+    Region *region;
+    // The Block currently being iterated over.
+    Region::iterator regionIt;
+    // The Operation currently being iterated over.
+    Block::iterator blockIt;
+    // The set of blocks that are reachable in the current region.
+    DenseSet<Block *> reachableBlocks;
+    // Whether we've visited the nested regions of the current op already.
+    bool hasVisitedRegions = false;
+  };
+
+  // Worklist of regions to visit to drive the post-order traversal.
+  SmallVector<RegionReachableOpIterator> worklist;
+
+  LLVM_DEBUG(
+      { llvm::dbgs() << "Starting walk-based pattern rewrite driver\n"; });
   ctx->executeAction<WalkAndApplyPatternsAction>(
       [&] {
+        // Perform a post-order traversal of the regions, visiting each
+        // reachable operation.
         for (Region &region : op->getRegions()) {
-          region.walk([&](Operation *visitedOp) {
-            LLVM_DEBUG(llvm::dbgs() << "Visiting op: "; visitedOp->print(
-                llvm::dbgs(), OpPrintingFlags().skipRegions());
-                       llvm::dbgs() << "\n";);
+          assert(worklist.empty());
+          if (region.empty())
+            continue;
+
+          // Prime the worklist with the entry block of this region.
+          worklist.push_back({&region});
+          while (!worklist.empty()) {
+            RegionReachableOpIterator &it = worklist.back();
+            if (it.regionIt == it.region->end()) {
+              // We're done with this region.
+              worklist.pop_back();
+              continue;
+            }
+            if (it.blockIt == it.regionIt->end()) {
+              // We're done with this block.
+              it.advance();
+              continue;
+            }
+            Operation *op = &*it.blockIt;
+            // If we haven't visited the nested regions of this op yet,
+            // enqueue them.
+            if (!it.hasVisitedRegions) {
+              it.hasVisitedRegions = true;
+              for (Region &nestedRegion : llvm::reverse(op->getRegions())) {
+                if (nestedRegion.empty())
+                  continue;
+                worklist.push_back({&nestedRegion});
+              }
+            }
+            // If we're not at the back of the worklist, we've enqueued some
+            // nested region for processing. We'll come back to this op later
+            // (post-order)
+            if (&it != &worklist.back())
+              continue;
+
+            // Preemptively increment the iterator, in case the current op
+            // would be erased.
+            it.advance();
+
+            LLVM_DEBUG({
+              llvm::dbgs() << "Visiting op: "
+                           << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                           << "\n";
+            });
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
-            erasedListener.visitedOp = visitedOp;
+            erasedListener.visitedOp = op;
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
-            if (succeeded(applicator.matchAndRewrite(visitedOp, rewriter))) {
-              LLVM_DEBUG(llvm::dbgs() << "\tOp matched and rewritten\n";);
-            }
-          });
+            if (succeeded(applicator.matchAndRewrite(op, rewriter)))
+              LLVM_DEBUG({ llvm::dbgs() << "\tOp matched and rewritten\n"; });
+          }
         }
       },
       {op});
diff --git a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
index a89a0f4084e9..3748be74eb0f 100644
--- a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
+++ b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
@@ -283,3 +283,23 @@ func.func @test_10_negative() -> (i32) {
   %0:2 = func.call @private_1() : () -> (i32, i32)
   return %0#0 : i32
 }
+
+// -----
+
+// Test that we correctly set a liveness value for operations in a dead block.
+// These won't be visited by the dataflow framework so the analysis needs to
+// explicitly manage them.
+// CHECK-LABEL: test_tag: dead_block_cmpi:
+// CHECK-NEXT: operand #0: not live
+// CHECK-NEXT: operand #1: not live
+// CHECK-NEXT: result #0: not live
+func.func @dead_block() {
+  %false = arith.constant false
+  %zero = arith.constant 0 : i64
+  cf.cond_br %false, ^bb1, ^bb4
+  ^bb1:
+    %3 = arith.cmpi eq, %zero, %zero  {tag = "dead_block_cmpi"} : i64
+    cf.br ^bb1
+  ^bb4:
+    return
+}
diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir
index 0848a924b9d9..c53667a98cfb 100644
--- a/mlir/test/Dialect/Affine/slicing-utils.mlir
+++ b/mlir/test/Dialect/Affine/slicing-utils.mlir
@@ -292,3 +292,26 @@ func.func @slicing_test_multiple_return(%arg0: index) -> (index, index) {
   %0:2 = "slicing-test-op"(%arg0, %arg0): (index, index) -> (index, index)
   return %0#0, %0#1 : index, index
 }
+
+// -----
+
+// FWD-LABEL: graph_region_with_cycle
+// BWD-LABEL: graph_region_with_cycle
+// FWDBWD-LABEL: graph_region_with_cycle
+func.func @graph_region_with_cycle() {
+  test.isolated_graph_region {
+    // FWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 forward static slice:
+    // FWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 
+    // FWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 forward static slice:
+    // FWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 
+    
+    // BWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 backward static slice:
+    // BWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 
+    // BWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 backward static slice:
+    // BWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 
+    %0 = "slicing-test-op"(%1) : (i1) -> i1
+    %1 = "slicing-test-op"(%0) : (i1) -> i1
+  }
+
+  return
+}
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 3d5a46d13e59..cf570beb14b9 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -3363,3 +3363,18 @@ func.func @bf16_fma(%arg0: vector<32x32x32xbf16>, %arg1: vector<32x32x32xbf16>,
     }
   }
 #-}
+
+// CHECK-LABEL: func @unreachable()
+// CHECK-NEXT: return
+// CHECK-NOT: arith
+func.func @unreachable() {
+  return
+^unreachable:
+  %c1_i64 = arith.constant 1 : i64
+  // This self referencing operation is legal in an unreachable block.
+  // Many patterns are unsafe with respect to this kind of situation,
+  // check that we don't infinite loop here.
+  %add = arith.addi %add, %c1_i64 : i64
+  cf.br ^unreachable
+}
+
diff --git a/mlir/test/Dialect/Mesh/simplifications.mlir b/mlir/test/Dialect/Mesh/simplifications.mlir
index 2540fbf9510c..e955f4c13425 100644
--- a/mlir/test/Dialect/Mesh/simplifications.mlir
+++ b/mlir/test/Dialect/Mesh/simplifications.mlir
@@ -165,3 +165,15 @@ func.func @all_reduce_arith_minsi_endomorphism(
   // CHECK: return %[[ALL_REDUCE_RES]]
   return %2 : tensor<5xi32>
 }
+
+// Ensure that this case, which has no endomorphism op, does not crash.
+// CHECK-LABEL: func.func @no_endomorphism_op
+func.func @no_endomorphism_op(%arg0: tensor<2xi64>) -> i64 {
+  %c0 = arith.constant 0 : index
+  %c1_i64 = arith.constant 1 : i64
+  // CHECK: tensor.extract
+  %extracted = tensor.extract %arg0[%c0] : tensor<2xi64>
+  // CHECK: arith.maxsi
+  %0 = arith.maxsi %extracted, %c1_i64 : i64
+  return %0 : i64
+}
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 8ba8013d008a..b15fabdd29c6 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1925,3 +1925,16 @@ func.func @index_switch_fold_no_res() {
 
 // CHECK-LABEL: func.func @index_switch_fold_no_res()
 //  CHECK-NEXT: "test.op"() : () -> ()
+
+// -----
+
+// CHECK-LABEL: func @scf_for_all_step_size_0()
+//       CHECK:   scf.forall (%{{.*}}) = (0) to (1) step (0)
+func.func @scf_for_all_step_size_0()  {
+  %x = arith.constant 0 : index
+  scf.forall (%i, %j) = (0, 4) to (1, 5) step (%x, 8) {
+    vector.print %x : index
+    scf.forall.in_parallel {}
+  }
+  return
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 12187dd18012..ea2343efd246 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -1379,6 +1379,21 @@ func.func @extract_strided_broadcast4(%arg0: f32) -> vector<1x4xf32> {
 
 // -----
 
+// Check the case where the same dimension is both broadcast and sliced.
+// CHECK-LABEL: func @extract_strided_broadcast5
+//  CHECK-SAME: (%[[ARG:.+]]: vector<2x1xf32>)
+//       CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : vector<2x1xf32> to vector<2x4xf32>
+//       CHECK: return %[[V]]
+func.func @extract_strided_broadcast5(%arg0: vector<2x1xf32>) -> vector<2x4xf32> {
+ %0 = vector.broadcast %arg0 : vector<2x1xf32> to vector<2x8xf32>
+ %1 = vector.extract_strided_slice %0
+      {offsets = [0, 4], sizes = [2, 4], strides = [1, 1]}
+      : vector<2x8xf32> to vector<2x4xf32>
+  return %1 : vector<2x4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: consecutive_shape_cast
 //       CHECK:   %[[C:.*]] = vector.shape_cast %{{.*}} : vector<16xf16> to vector<4x4xf16>
 //  CHECK-NEXT:   return %[[C]] : vector<4x4xf16>
diff --git a/mlir/test/IR/recursive-distinct-attr.mlir b/mlir/test/IR/recursive-distinct-attr.mlir
new file mode 100644
index 000000000000..5afb5c59e0fc
--- /dev/null
+++ b/mlir/test/IR/recursive-distinct-attr.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt -emit-bytecode %s | mlir-opt --mlir-print-debuginfo | FileCheck %s
+
+// Verify that the distinct attribute which is used transitively
+// through two aliases does not end up duplicated when round-tripped
+// through bytecode.
+
+// CHECK: distinct[0]
+// CHECK-NOT: distinct[1]
+#attr_ugly = #test<attr_ugly begin distinct[0]<> end>
+#attr_ugly1 = #test<attr_ugly begin #attr_ugly end>
+
+module attributes {test.alias = #attr_ugly, test.alias1 = #attr_ugly1} {
+}
\ No newline at end of file
diff --git a/mlir/test/IR/test-clone.mlir b/mlir/test/IR/test-clone.mlir
index 0c07593aef32..f723efc1a2c5 100644
--- a/mlir/test/IR/test-clone.mlir
+++ b/mlir/test/IR/test-clone.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" --split-input-file | FileCheck %s
 
 module {
   func.func @fixpoint(%arg1 : i32) -> i32 {
@@ -18,7 +18,8 @@ module {
 // CHECK-NEXT: notifyOperationInserted: test.yield
 // CHECK-NEXT: notifyOperationInserted: func.return
 
-// CHECK:   func @fixpoint(%[[arg0:.+]]: i32) -> i32 {
+// CHECK-LABEL: func @fixpoint
+// CHECK-SAME:       (%[[arg0:.+]]: i32) -> i32 {
 // CHECK-NEXT:     %[[i0:.+]] = "test.use"(%[[arg0]]) ({
 // CHECK-NEXT:       %[[r2:.+]] = "test.use2"(%[[arg0]]) ({
 // CHECK-NEXT:         "test.yield2"(%[[arg0]]) : (i32) -> ()
@@ -33,3 +34,33 @@ module {
 // CHECK-NEXT:     }) : (i32) -> i32
 // CHECK-NEXT:     return %[[i1]] : i32
 // CHECK-NEXT:   }
+
+// -----
+
+func.func @clone_unregistered_with_attrs() {
+  "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = 3.14 : f32}> : () -> ()
+  "unregistered.bar"() : () -> ()
+  "unregistered.empty_dict"() <{}> : () -> ()
+  "unregistered.complex"() <{
+    array = [1, 2, 3],
+    dict = {key1 = 42 : i32, key2 = "value"},
+    nested = {inner = {deep = 100 : i64}}
+  }> : () -> ()
+  return
+}
+
+// CHECK: notifyOperationInserted: unregistered.foo
+// CHECK-NEXT: notifyOperationInserted: unregistered.bar
+// CHECK-NEXT: notifyOperationInserted: unregistered.empty_dict
+// CHECK-NEXT: notifyOperationInserted: unregistered.complex
+// CHECK-NEXT: notifyOperationInserted: func.return
+
+// CHECK-LABEL:  func @clone_unregistered_with_attrs() {
+// CHECK-NEXT:     "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = [[PI:.+]] : f32}> : () -> ()
+// CHECK-NEXT:     "unregistered.bar"() : () -> ()
+// CHECK-NEXT:     "unregistered.empty_dict"() <{}> : () -> ()
+// CHECK-NEXT:     "unregistered.complex"() <{array = [1, 2, 3], dict = {key1 = 42 : i32, key2 = "value"}, nested = {inner = {deep = 100 : i64}}}> : () -> ()
+// CHECK-NEXT:     "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = [[PI]] : f32}> : () -> ()
+// CHECK-NEXT:     "unregistered.bar"() : () -> ()
+// CHECK-NEXT:     "unregistered.empty_dict"() <{}> : () -> ()
+// CHECK-NEXT:     "unregistered.complex"() <{array = [1, 2, 3], dict = {key1 = 42 : i32, key2 = "value"}, nested = {inner = {deep = 100 : i64}}}> : () -> ()
diff --git a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
index 02f7e60671c9..c3063416b036 100644
--- a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
+++ b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
@@ -40,12 +40,12 @@ func.func @move_before(%cond : i1) {
 }
 
 // Check that the driver handles rewriter.moveAfter. In this case, we expect
-// the moved op to be visited only once since walk uses `make_early_inc_range`.
+// the moved op to be visited twice.
 // CHECK-LABEL: func.func @move_after(
 // CHECK: scf.if
 // CHECK: }
 // CHECK: "test.move_after_parent_op"
-// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> ()
+// CHECK: "test.any_attr_of_i32_str"() <{attr = 2 : i32}> : () -> ()
 // CHECK: return
 func.func @move_after(%cond : i1) {
   scf.if %cond {
@@ -119,3 +119,23 @@ func.func @erase_nested_block() -> i32 {
   }): () -> (i32)
   return %a : i32
 }
+
+
+// CHECK-LABEL: func.func @unreachable_replace_with_new_op
+// CHECK: "test.new_op"
+// CHECK: "test.replace_with_new_op"
+// CHECK-SAME: unreachable
+// CHECK: "test.new_op"
+func.func @unreachable_replace_with_new_op() {
+  "test.br"()[^bb1] : () -> ()
+^bb1:
+  %a = "test.replace_with_new_op"() : () -> (i32)
+  "test.br"()[^end] : () -> () // Test jumping over the unreachable block is visited as well.
+^unreachable:
+  %b = "test.replace_with_new_op"() {test.unreachable} : () -> (i32)
+  return
+^end:
+  %c = "test.replace_with_new_op"() : () -> (i32)
+  return
+}
+
diff --git a/mlir/test/Pass/pipeline-options-parsing.mlir b/mlir/test/Pass/pipeline-options-parsing.mlir
index 9385d353faf9..03ac38ea1611 100644
--- a/mlir/test/Pass/pipeline-options-parsing.mlir
+++ b/mlir/test/Pass/pipeline-options-parsing.mlir
@@ -13,6 +13,7 @@
 // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(builtin.module(func.func(test-options-pass{list=3}), func.func(test-options-pass{enum=one list=1,2,3,4 string=foo"bar"baz})))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_6 %s
 // RUN: mlir-opt %s -verify-each=false '-test-options-super-pass-pipeline=super-list={{enum=zero list=1 string=foo},{enum=one list=2 string="bar"},{enum=two list=3 string={baz}}}' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
 // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(func.func(test-options-super-pass{list={{enum=zero list={1} string=foo },{enum=one list={2} string=bar },{enum=two list={3} string=baz }}}))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
+// RUN: mlir-opt %s -verify-each=false -test-options-super-set-ab-pipeline='foo=true bar=false' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_11 %s
 
 
 // This test checks that lists-of-nested-options like 'option1={...},{....}' can be parsed
@@ -106,3 +107,12 @@
 // CHECK_10-NEXT:     test-options-pass{enum=zero  string= string-list={,}}
 // CHECK_10-NEXT:   )
 // CHECK_10-NEXT: )
+
+// CHECK_11:      builtin.module(
+// CHECK_11-NEXT:   func.func(
+// CHECK_11-NEXT:     test-options-pass-a
+// CHECK_11-NEXT:   )
+// CHECK_11-NEXT:   func.func(
+// CHECK_11-NEXT:     test-options-pass-b
+// CHECK_11-NEXT:   )
+// CHECK_11-NEXT: )
diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir
index 3af95db3c0e2..0f8d757086e8 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -548,3 +548,47 @@ func.func @test_atomic_yield(%I: memref<10xf32>, %idx : index) {
   func.return
 }
 
+// -----
+
+// CHECK-LABEL: module @return_void_with_unused_argument
+module @return_void_with_unused_argument {
+  // CHECK-LABEL: func.func private @fn_return_void_with_unused_argument
+  // CHECK-SAME: (%[[ARG0_FN:.*]]: i32)
+  func.func private @fn_return_void_with_unused_argument(%arg0: i32, %arg1: memref<4xi32>) -> () {
+    %sum = arith.addi %arg0, %arg0 : i32
+    %c0 = arith.constant 0 : index
+    %buf = memref.alloc() : memref<1xi32>
+    memref.store %sum, %buf[%c0] : memref<1xi32>
+    return
+  }
+  // CHECK-LABEL: func.func @main
+  // CHECK-SAME: (%[[ARG0_MAIN:.*]]: i32)
+  // CHECK: call @fn_return_void_with_unused_argument(%[[ARG0_MAIN]]) : (i32) -> ()
+  func.func @main(%arg0: i32) -> memref<4xi32> {
+    %unused = memref.alloc() : memref<4xi32>
+    call @fn_return_void_with_unused_argument(%arg0, %unused) : (i32, memref<4xi32>) -> ()
+    return %unused : memref<4xi32>
+  }
+}
+
+// -----
+
+// CHECK-LABEL: module @dynamically_unreachable
+module @dynamically_unreachable {
+  func.func @dynamically_unreachable() {
+    // This value is used by an operation in a dynamically unreachable block.
+    %zero = arith.constant 0 : i64
+
+    // Dataflow analysis knows from the constant condition that
+    // ^bb1 is unreachable
+    %false = arith.constant false
+    cf.cond_br %false, ^bb1, ^bb4
+  ^bb1:
+    // This unreachable operation should be removed.
+    // CHECK-NOT: arith.cmpi
+    %3 = arith.cmpi eq, %zero, %zero : i64
+    cf.br ^bb1
+  ^bb4:
+    return
+  }
+}
diff --git a/mlir/test/Transforms/test-canonicalize.mlir b/mlir/test/Transforms/test-canonicalize.mlir
index 0fc822b0a23a..8cad6b98441d 100644
--- a/mlir/test/Transforms/test-canonicalize.mlir
+++ b/mlir/test/Transforms/test-canonicalize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s  --check-prefixes=CHECK,RS
 // RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize{region-simplify=disabled}))' | FileCheck %s --check-prefixes=CHECK,NO-RS
 
 // CHECK-LABEL: func @remove_op_with_inner_ops_pattern
@@ -80,12 +80,10 @@ func.func @test_dialect_canonicalizer() -> (i32) {
 
 // Check that the option to control region simplification actually works
 // CHECK-LABEL: test_region_simplify
-func.func @test_region_simplify() {
-  // CHECK-NEXT:   return
-  // NO-RS-NEXT: ^bb1
-  // NO-RS-NEXT:   return
-  // CHECK-NEXT: }
-  return
-^bb1:
-  return
+func.func @test_region_simplify(%input1 : i32, %cond : i1) -> i32 {
+  // RS-NEXT: "test.br"(%arg0)[^bb1] : (i32) -> ()
+  // NO-RS-NEXT: "test.br"(%arg0, %arg0)[^bb1] : (i32, i32) -> ()
+   "test.br"(%input1, %input1)[^bb1] : (i32, i32) -> ()
+^bb1(%used_arg : i32, %unused_arg : i32):
+  return %used_arg : i32
 }
diff --git a/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
index 43005e22584c..8e2f03b644e4 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
@@ -33,7 +33,6 @@ struct TestLivenessAnalysisPass
 
   void runOnOperation() override {
     auto &livenessAnalysis = getAnalysis<RunLivenessAnalysis>();
-
     Operation *op = getOperation();
 
     raw_ostream &os = llvm::outs();
diff --git a/mlir/test/lib/Pass/TestPassManager.cpp b/mlir/test/lib/Pass/TestPassManager.cpp
index 7afe2109f04d..2b5f75ef53f1 100644
--- a/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/mlir/test/lib/Pass/TestPassManager.cpp
@@ -133,6 +133,51 @@ struct TestOptionsSuperPass
       llvm::cl::desc("Example list of PassPipelineOptions option")};
 };
 
+struct TestOptionsPassA
+    : public PassWrapper<TestOptionsPassA, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassA)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> foo{*this, "foo", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassA() = default;
+  TestOptionsPassA(const TestOptionsPassA &) : PassWrapper() {}
+  TestOptionsPassA(const Options &options) { this->options.foo = options.foo; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-a"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset A";
+  }
+
+  Options options;
+};
+
+struct TestOptionsPassB
+    : public PassWrapper<TestOptionsPassB, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassB)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> bar{*this, "bar", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassB() = default;
+  TestOptionsPassB(const TestOptionsPassB &) : PassWrapper() {}
+  TestOptionsPassB(const Options &options) { this->options.bar = options.bar; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-b"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset B";
+  }
+
+  Options options;
+};
+
+struct TestPipelineOptionsSuperSetAB : TestOptionsPassA::Options,
+                                       TestOptionsPassB::Options {};
+
 /// A test pass that always aborts to enable testing the crash recovery
 /// mechanism of the pass manager.
 struct TestCrashRecoveryPass
@@ -270,6 +315,9 @@ void registerPassManagerTestPass() {
   PassRegistration<TestOptionsPass>();
   PassRegistration<TestOptionsSuperPass>();
 
+  PassRegistration<TestOptionsPassA>();
+  PassRegistration<TestOptionsPassB>();
+
   PassRegistration<TestModulePass>();
 
   PassRegistration<TestFunctionPass>();
@@ -306,5 +354,16 @@ void registerPassManagerTestPass() {
           [](OpPassManager &pm, const TestOptionsSuperPass::Options &options) {
             pm.addPass(std::make_unique<TestOptionsSuperPass>(options));
           });
+
+  PassPipelineRegistration<TestPipelineOptionsSuperSetAB>
+      registerPipelineOptionsSuperSetABPipeline(
+          "test-options-super-set-ab-pipeline",
+          "Parses options of PassPipelineOptions using pass pipeline "
+          "registration",
+          [](OpPassManager &pm, const TestPipelineOptionsSuperSetAB &options) {
+            // Pass superset AB options to subset options A and B
+            pm.addPass(std::make_unique<TestOptionsPassA>(options));
+            pm.addPass(std::make_unique<TestOptionsPassB>(options));
+          });
 }
 } // namespace mlir
diff --git a/mlir/test/mlir-tblgen/enums-python-bindings.td b/mlir/test/mlir-tblgen/enums-python-bindings.td
index 1c5567f54a5f..cd23b6a2effb 100644
--- a/mlir/test/mlir-tblgen/enums-python-bindings.td
+++ b/mlir/test/mlir-tblgen/enums-python-bindings.td
@@ -62,12 +62,15 @@ def MyEnum64 : I64EnumAttr<"MyEnum64", "An example 64-bit enum", [One64, Two64]>
 // CHECK: def _myenum64(x, context):
 // CHECK:     return _ods_ir.IntegerAttr.get(_ods_ir.IntegerType.get_signless(64, context=context), int(x))
 
+def User : I32BitEnumAttrCaseBit<"User", 0, "user">;
+def Group : I32BitEnumAttrCaseBit<"Group", 1, "group">;
+def Other : I32BitEnumAttrCaseBit<"Other", 2, "other">;
+
 def TestBitEnum
-    : I32BitEnumAttr<"TestBitEnum", "", [
-        I32BitEnumAttrCaseBit<"User", 0, "user">,
-        I32BitEnumAttrCaseBit<"Group", 1, "group">,
-        I32BitEnumAttrCaseBit<"Other", 2, "other">,
-      ]> {
+    : I32BitEnumAttr<
+          "TestBitEnum", "",
+          [User, Group, Other,
+           I32BitEnumAttrCaseGroup<"Any", [User, Group, Other], "any">]> {
   let genSpecializedAttr = 0;
   let separator = " | ";
 }
@@ -79,9 +82,10 @@ def TestBitEnum_Attr : EnumAttr<Test_Dialect, TestBitEnum, "testbitenum">;
 // CHECK:     User = 1
 // CHECK:     Group = 2
 // CHECK:     Other = 4
+// CHECK:     Any = 7
 
 // CHECK:     def __iter__(self):
-// CHECK:         return iter([case for case in type(self) if (self & case) is case])
+// CHECK:         return iter([case for case in type(self) if (self & case) is case and self is not case])
 // CHECK:     def __len__(self):
 // CHECK:         return bin(self).count("1")
 
@@ -94,6 +98,8 @@ def TestBitEnum_Attr : EnumAttr<Test_Dialect, TestBitEnum, "testbitenum">;
 // CHECK:             return "group"
 // CHECK:         if self is TestBitEnum.Other:
 // CHECK:             return "other"
+// CHECK:         if self is TestBitEnum.Any:
+// CHECK:             return "any"
 // CHECK:         raise ValueError("Unknown TestBitEnum enum entry.")
 
 // CHECK: @register_attribute_builder("TestBitEnum")
diff --git a/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
index 8e2d6114e48e..acc9b61d7121 100644
--- a/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
+++ b/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
@@ -64,7 +64,7 @@ static void emitEnumClass(EnumInfo enumInfo, raw_ostream &os) {
   if (enumInfo.isBitEnum()) {
     os << formatv("    def __iter__(self):\n"
                   "        return iter([case for case in type(self) if "
-                  "(self & case) is case])\n");
+                  "(self & case) is case and self is not case])\n");
     os << formatv("    def __len__(self):\n"
                   "        return bin(self).count(\"1\")\n");
     os << "\n";
diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp
index 887db227cfc4..312b10f28143 100644
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@@ -205,7 +205,13 @@ TEST(NativeMemRefJit, SKIP_WITHOUT_JIT(BasicMemref)) {
   };
   int64_t shape[] = {k, m};
   int64_t shapeAlloc[] = {k + 1, m + 1};
-  OwningMemRef<float, 2> a(shape, shapeAlloc, init);
+  // Use a large alignment to stress the case where the memref data/basePtr are
+  // disjoint.
+  int alignment = 8192;
+  OwningMemRef<float, 2> a(shape, shapeAlloc, init, alignment);
+  ASSERT_EQ(
+      (void *)(((uintptr_t)a->basePtr + alignment - 1) & ~(alignment - 1)),
+      a->data);
   ASSERT_EQ(a->sizes[0], k);
   ASSERT_EQ(a->sizes[1], m);
   ASSERT_EQ(a->strides[0], m + 1);
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index d22afb3003e7..a46e64718dab 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_unittest(MLIRIRTests
   AttrTypeReplacerTest.cpp
   Diagnostic.cpp
   DialectTest.cpp
+  DistinctAttributeAllocatorTest.cpp
   InterfaceTest.cpp
   IRMapping.cpp
   InterfaceAttachmentTest.cpp
diff --git a/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp
new file mode 100644
index 000000000000..99067d09f7be
--- /dev/null
+++ b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp
@@ -0,0 +1,45 @@
+//=== DistinctAttributeAllocatorTest.cpp - DistinctAttr storage alloc test ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "llvm/Support/CrashRecoveryContext.h"
+#include <thread>
+
+using namespace mlir;
+
+//
+// Test that a DistinctAttr that is created on a separate thread does
+// not have its storage deleted when the thread joins.
+//
+TEST(DistinctAttributeAllocatorTest, TestAttributeWellFormedAfterThreadJoin) {
+  MLIRContext ctx;
+  OpBuilder builder(&ctx);
+  DistinctAttr attr;
+
+  std::thread t([&ctx, &attr]() {
+    attr = DistinctAttr::create(UnitAttr::get(&ctx));
+    ASSERT_TRUE(attr);
+  });
+  t.join();
+
+  // If the attribute storage got deleted after the thread joins (which we don't
+  // want) then trying to access it triggers an assert in Debug mode, and a
+  // crash otherwise. Run this in a CrashRecoveryContext to avoid bringing down
+  // the whole test suite if this test fails. Additionally, MSAN and/or TSAN
+  // should raise failures here if the attribute storage was deleted.
+  llvm::CrashRecoveryContext crc;
+  EXPECT_TRUE(crc.RunSafely([attr]() { (void)attr.getAbstractAttribute(); }));
+  EXPECT_TRUE(
+      crc.RunSafely([attr]() { (void)*cast<Attribute>(attr).getImpl(); }));
+
+  ASSERT_TRUE(attr);
+}
diff --git a/qualcomm-software/README.md b/qualcomm-software/README.md
new file mode 100644
index 000000000000..eb1b6130e25b
--- /dev/null
+++ b/qualcomm-software/README.md
@@ -0,0 +1,15 @@
+## CPULLVM Toolchain for Embedded
+
+### Welcome to the CPULLVM Toolchain project!
+This repository provides the source code and build scripts for CPULLVM, a customized fork
+of LLVM designed to build toolchains targeting Arm and AArch64 bare-metal and Linux environments.
+
+## Project Goal
+Our mission is to deliver a robust LLVM-based platform with all the essential libraries and tools
+required for building C and C++ toolchains optimized for embedded development.
+
+## Quick Links
+### [Building CPULLVM Toolchain for Embedded](https://github.com/qualcomm/cpullvm-toolchain/blob/release/qualcomm-software/21.x/qualcomm-software/embedded/README.md)
+
+### Getting release binaries - TBD
+
diff --git a/qualcomm-software/embedded/README.md b/qualcomm-software/embedded/README.md
new file mode 100644
index 000000000000..87217ba3964b
--- /dev/null
+++ b/qualcomm-software/embedded/README.md
@@ -0,0 +1,94 @@
+
+# CPULLVM Toolchain for Embedded
+
+This repository contains build scripts and auxiliary material for building Linux and bare-metal LLVM-based toolchains, including:
+
+- clang + llvm
+- eld linker
+- lld
+- libc++abi
+- libc++
+- compiler-rt for Linux Arm/AArch64
+- compiler-rt for bare-metal Arm/AArch64
+- musl-embedded
+
+## Targets Built
+- Arm
+- AArch64
+
+## Enabled Projects
+- llvm
+- clang
+- polly
+- lld
+- mlir
+- eld
+
+## Goal
+The CPULLVM Compiler generates code for Arm and AArch64 targets only. It does **not** generate code for other targets supported by the upstream LLVM compiler.
+
+## Components
+The CPULLVM toolchain for Embedded relies on the following upstream components:
+
+- [CPULLVM](https://github.com/qualcomm/cpullvm-toolchain)
+- [musl-embedded](https://github.com/qualcomm/musl-embedded)
+- [eld linker](https://github.com/qualcomm/eld)
+
+## Host Platforms
+CPULLVM Toolchain for Embedded is built and tested on:
+- Linux Ubuntu 22.04 LTS
+
+## Getting started
+
+## Prerequisites for building toolchain 
+
+   ### CPULLVM Build Environment Setup 
+   This guide lists the required tools and sets up Clang 19 as the host compiler for building CPULLVM.
+
+   ### Install CMake and Ninja
+   These are essential build tools for LLVM.
+      
+    sudo apt install cmake ninja-build
+
+   ### Download LLVM 19 installer script
+   Fetch the official LLVM installation script and make it executable.
+      
+    wget https://apt.llvm.org/llvm.sh
+    chmod +x llvm.sh
+
+   ### Install Clang 19 and verify
+   Run the script to install Clang 19, then check the version.
+      
+    sudo ./llvm.sh 19
+    clang --version
+
+   ### Set Clang 19 as default compiler
+   Use update-alternatives to make Clang 19 the system default.
+   
+    sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-19 100
+    sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-19 100
+
+   ### Install libc++ and libc++abi
+   These provide C++ standard library support for Clang.
+      
+    sudo apt-get install libc++-19-dev libc++abi-19-dev
+
+   ### Install cross-compilers for Arm/AArch64
+   Required for building Arm and AArch64 targets.
+      
+    sudo apt-get install gcc-arm-linux-gnueabi
+    sudo apt-get install gcc-aarch64-linux-gnu
+
+## Steps to build the CPULLVM compiler toolchain
+
+   ### Clone the cpullvm-toolchain repository  
+   
+    git clone https://github.com/qualcomm/cpullvm-toolchain
+      
+   ### Navigate to the scripts directory
+   
+    cd cpullvm-toolchain/qualcomm-software/embedded/scripts
+      
+   ### Run the script 
+   
+    ./build.sh
diff --git a/qualcomm-software/embedded/patches/eld/0001-Explicitly-set-aarch64-unknown-none-elf-triple-for-a.patch b/qualcomm-software/embedded/patches/eld/0001-Explicitly-set-aarch64-unknown-none-elf-triple-for-a.patch
new file mode 100644
index 000000000000..8b3ff1e835ef
--- /dev/null
+++ b/qualcomm-software/embedded/patches/eld/0001-Explicitly-set-aarch64-unknown-none-elf-triple-for-a.patch
@@ -0,0 +1,43 @@
+From 915493d714518dc799a5f5b25822ade9fb3ce4d8 Mon Sep 17 00:00:00 2001
+From: nshanmug <nshanmug@quicinc.com>
+Date: Thu, 18 Sep 2025 10:34:41 -0700
+Subject: [PATCH] [PATCH] Explicitly set aarch64-unknown-none-elf triple for
+ aarch64 tests
+
+ELD takes LLVM_DEFAULT_TARGET_TRIPLE but tests are expecting none-linux output.
+Configuring LLVM during the build to target aarch64-unknown-linux-gnueabi by
+default causes several test failures.
+
+Those tests pass when LLVM is configured for aarch64-linux-gnueabi, which
+should behave identically to aarch64-unknown-linux-gnueabi. ELD is relying on
+buggy behavior of LLVM's triple passing.
+
+We'll open a bug with upstream ELD.
+
+Change-Id: I8f8ce66fa847c15150466cc65da18b36ef85f671
+---
+ test/lit.cfg | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/test/lit.cfg b/test/lit.cfg
+index 10d2e5c..0a4459a 100644
+--- a/test/lit.cfg
++++ b/test/lit.cfg
+@@ -355,10 +355,11 @@ if config.test_target == 'AArch64':
+         'aarch64-linux' in config.available_features):
+         config.available_features.add('aarch64')
+     xlen = 8
+-    link = 'ld.eld'
+     config.emulation = '-m aarch64elf'
+-    clang ='clang -target aarch64'
+-    clangxx ='clang++ -target aarch64'
++    link = 'ld.eld -m aarch64elf -mtriple aarch64-unknown-none-elf'
++    clang = 'clang -target aarch64-unknown-none-elf'
++    clangxx = 'clang++ -target aarch64-unknown-none-elf'
++
+     clangas = 'clang'
+     linkopts = '--thread-count 4  --threads'
+     if hasattr(config,'eld_option') and config.eld_option != 'default':
+-- 
+2.17.1
+
diff --git a/qualcomm-software/embedded/patches/llvm-project/0001-ARM-compiler-rt-Add-a-few-more-variants-for-builtins.patch b/qualcomm-software/embedded/patches/llvm-project/0001-ARM-compiler-rt-Add-a-few-more-variants-for-builtins.patch
new file mode 100644
index 000000000000..a476afd07e05
--- /dev/null
+++ b/qualcomm-software/embedded/patches/llvm-project/0001-ARM-compiler-rt-Add-a-few-more-variants-for-builtins.patch
@@ -0,0 +1,50 @@
+From 0b68523d72053247799419f0f60be7b1835f814e Mon Sep 17 00:00:00 2001
+From: nshanmug <nshanmug@quicinc.com>
+Date: Wed, 17 Sep 2025 13:32:14 -0700
+Subject: [PATCH] [ARM][compiler-rt] Add a few more variants for builtins
+
+We add the following variants: pacret, pacret-bti and pacret-b-key-bti. These
+are needed by the corresponding musl variants.
+
+Change-Id: I003c363b8bacd76f74704c7d1a7295441acb59ad
+---
+ compiler-rt/lib/builtins/CMakeLists.txt | 23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
+index 3ab92403d416..7f2f3c4959fe 100644
+--- a/compiler-rt/lib/builtins/CMakeLists.txt
++++ b/compiler-rt/lib/builtins/CMakeLists.txt
+@@ -964,6 +964,29 @@ else ()
+                               C_STANDARD 11
+                               CXX_STANDARD 17
+                               PARENT_TARGET builtins)
++      if(arch MATCHES "aarch64")
++        add_compiler_rt_runtime(clang_rt.builtins-pacret-b-key-bti
++                                STATIC
++                                ARCHS ${arch}
++                                SOURCES ${${arch}_SOURCES}
++                                DEFS "${BUILTIN_DEFS}"
++                                CFLAGS ${BUILTIN_CFLAGS_${arch}} -march=armv8.5-a -mbranch-protection=pac-ret+leaf+b-key+bti
++                                PARENT_TARGET builtins)
++        add_compiler_rt_runtime(clang_rt.builtins-pacret
++                                STATIC
++                                ARCHS ${arch}
++                                SOURCES ${${arch}_SOURCES}
++                                DEFS "${BUILTIN_DEFS}"
++                                CFLAGS ${BUILTIN_CFLAGS_${arch}} -march=armv8.5-a -mbranch-protection=pac-ret+leaf
++                                PARENT_TARGET builtins)
++        add_compiler_rt_runtime(clang_rt.builtins-pacret-bti
++                                STATIC
++                                ARCHS ${arch}
++                                SOURCES ${${arch}_SOURCES}
++                                DEFS "${BUILTIN_DEFS}"
++                                CFLAGS ${BUILTIN_CFLAGS_${arch}} -march=armv8.5-a -mbranch-protection=pac-ret+leaf+bti
++                                PARENT_TARGET builtins)
++      endif ()
+       cmake_pop_check_state()
+     endif ()
+   endforeach ()
+-- 
+2.17.1
+
diff --git a/qualcomm-software/embedded/patches/llvm-project/0002-Allow-baremetal-builds-for-AArch64-and-ARM32-v7.patch b/qualcomm-software/embedded/patches/llvm-project/0002-Allow-baremetal-builds-for-AArch64-and-ARM32-v7.patch
new file mode 100644
index 000000000000..73d115e457e5
--- /dev/null
+++ b/qualcomm-software/embedded/patches/llvm-project/0002-Allow-baremetal-builds-for-AArch64-and-ARM32-v7.patch
@@ -0,0 +1,33 @@
+From c6a5e4ed61dda760b270daf68dfea880a7fdc1c1 Mon Sep 17 00:00:00 2001
+From: Zhaoshi Zheng <zhaoshiz@quicinc.com>
+Date: Tue, 30 Sep 2025 14:20:09 -0700
+Subject: [PATCH] Allow baremetal builds for AArch64 and ARM32(v7)
+
+---
+ compiler-rt/cmake/base-config-ix.cmake | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/compiler-rt/cmake/base-config-ix.cmake b/compiler-rt/cmake/base-config-ix.cmake
+index d92bc0e71fa1..444704218393 100644
+--- a/compiler-rt/cmake/base-config-ix.cmake
++++ b/compiler-rt/cmake/base-config-ix.cmake
+@@ -272,6 +272,16 @@ macro(test_targets)
+     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "nvptx")
+       test_target_arch(nvptx64 "" "--nvptx64-nvidia-cuda" "-nogpulib" "-flto"
+                        "-fconvergent-functions" "-c")
++    # Rule for aarch64 baremetal library
++    # Match the entire triple
++    elseif ("${COMPILER_RT_DEFAULT_TARGET_TRIPLE}" MATCHES "aarch64-unknown-none-elf")
++      set (CAN_TARGET_aarch64 1)
++      list(APPEND COMPILER_RT_SUPPORTED_ARCH aarch64)
++    # Rule for armv7 baremetal library
++    # Match the entire triple
++    elseif ("${COMPILER_RT_DEFAULT_TARGET_TRIPLE}" MATCHES "arm-unknown-none-eabi")
++      set (CAN_TARGET_armv7 1)
++      list(APPEND COMPILER_RT_SUPPORTED_ARCH armv7)
+     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "arm")
+       if(WIN32)
+         test_target_arch(arm "" "" "")
+-- 
+2.34.1
+
diff --git a/qualcomm-software/embedded/patchsets.yml b/qualcomm-software/embedded/patchsets.yml
new file mode 100644
index 000000000000..86d38a6e1cd6
--- /dev/null
+++ b/qualcomm-software/embedded/patchsets.yml
@@ -0,0 +1,28 @@
+{
+  "version": 1,
+  "defaults": {
+    "method": "am",
+    "three_way": true,
+    "restore_on_fail": true,
+    "ignore_whitespace": true
+  },
+  "patchsets": [
+    {
+      "name": "eld",
+      "repo": "../../llvm/tools/eld",
+      "patches": "patches/eld",
+      "reset_to": ""
+    },
+    {
+      "name": "llvm-project",
+      "repo": "../..",
+      "patches": "patches/llvm-project"
+    },
+    {
+      "name": "musl-embedded",
+      "repo": "../../../musl-embedded",
+      "patches": "patches/musl-embedded"
+    }
+  ]
+}
+
diff --git a/qualcomm-software/embedded/scripts/build.sh b/qualcomm-software/embedded/scripts/build.sh
new file mode 100755
index 000000000000..fd290986407e
--- /dev/null
+++ b/qualcomm-software/embedded/scripts/build.sh
@@ -0,0 +1,369 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# A bash script to build cpullvm toolchain
+# The script automates cloning, patching, building LLVM, ELD linker and musl-embedded,
+# and packaging artifacts for ARM and AArch64 targets.
+
+# Note: Ensure that `ELD_BRANCH` and `MUSL_EMBEDDED_BRANCH` match the current repository branch
+# to maintain consistency across all dependencies.
+
+set -euo pipefail
+
+readonly ELD_REPO_URL="https://github.com/qualcomm/eld.git"
+readonly ELD_BRANCH="release/21.x"
+
+readonly MUSL_EMBEDDED_REPO_URL="https://github.com/qualcomm/musl-embedded.git"
+readonly MUSL_EMBEDDED_BRANCH="main"
+
+SCRIPT_DIR=$(
+  cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd
+)
+REPO_ROOT="$( git -C "${SCRIPT_DIR}" rev-parse --show-toplevel )"
+WORKSPACE="${REPO_ROOT}/.."
+SRC_DIR="${REPO_ROOT}"
+BUILD_DIR="${WORKSPACE}/build"
+INSTALL_DIR="${WORKSPACE}/install"
+ARTIFACT_DIR=""
+SKIP_TESTS="false"
+JOBS="${JOBS:-$(nproc)}"
+
+log() { echo -e "\033[1;34m[precheckin]\033[0m $*"; }
+warn() { echo -e "\033[1;33m[warn]\033[0m $*"; }
+
+usage() {
+  cat <<'EOF'
+Usage:
+  build.sh [options]
+
+Options:
+  --artifact-dir <path>       Directory to copy final tarball
+  --skip-tests                Skip LLVM test steps
+  --aarch64-sysroot <path>    AArch64 sysroot (default: /usr/aarch64-linux-gnu)
+  --clean                     Delete and recreate build/install dirs
+
+Examples:
+  ./build.sh --artifact-dir /tmp/artifacts
+EOF
+}
+
+CLEAN="false"
+
+# --- Parse args ---
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --artifact-dir) ARTIFACT_DIR="$2"; shift 2 ;;
+    --skip-tests) SKIP_TESTS="true"; shift ;;
+    --aarch64-sysroot) AARCH64_SYSROOT="$2"; shift 2 ;;
+    --clean) CLEAN="true"; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown arg: $1"; usage; exit 1 ;;
+  esac
+done
+
+# --- Set the Build flags ---
+BUILD_MODE="Release"
+ASSERTION_MODE="OFF"
+ARM32_LINUX_TRIPLE="arm-linux-gnueabi"
+AARCH64_LINUX_TRIPLE="aarch64-linux-gnu"
+COMPILER_RT_ARM32_LINUX_BUILDDIR="${WORKSPACE}/build/compiler-rt/arm32/linux"
+COMPILER_RT_AARCH64_LINUX_BUILDDIR="${WORKSPACE}/build/compiler-rt/aarch64/linux"
+COMPILER_RT_ARM32_LINUX_FLAGS="--target=arm-linux-gnueabi -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon"
+COMPILER_RT_AARCH64_LINUX_FLAGS="--sysroot=${AARCH64_SYSROOT:-/usr/aarch64-linux-gnu} --target=aarch64-linux-gnu -mcpu=cortex-a53"
+ARM32_BM_TRIPLE="arm-none-eabi"
+AARCH64_BM_TRIPLE="aarch64-none-elf"
+COMPILER_RT_ARM32_BM_BUILDDIR="${WORKSPACE}/build/compiler-rt/arm32/baremetal"
+COMPILER_RT_AARCH64_BM_BUILDDIR="${WORKSPACE}/build/compiler-rt/aarch64/baremetal"
+COMPILER_RT_ARM32_BM_FLAGS="--target=arm-none-eabi -mcpu=cortex-a9 -ffunction-sections -fdata-sections -mfloat-abi=softfp -mfpu=neon -nostdlibinc"
+COMPILER_RT_AARCH64_BM_FLAGS="--target=aarch64-none-elf -mcpu=cortex-a53 -ffunction-sections -fdata-sections -nostdlibinc"
+
+# --- Prepare build/install dirs ---
+if [[ "${CLEAN}" == "true" ]]; then
+  log "Cleaning ${BUILD_DIR} and ${INSTALL_DIR}"
+  rm -rf "${BUILD_DIR}" "${INSTALL_DIR}"
+fi
+
+# --- Workspace prep ---
+log "Preparing workspace at: ${WORKSPACE}"
+mkdir -p "${BUILD_DIR}" "${INSTALL_DIR}"
+
+# --- Clone musl-embedded (if absent) ---
+if [[ ! -d "${WORKSPACE}/musl-embedded/.git" ]]; then
+  log "Cloning musl-embedded into ${WORKSPACE}/musl-embedded"
+  git clone "${MUSL_EMBEDDED_REPO_URL}" "${WORKSPACE}/musl-embedded" -b "${MUSL_EMBEDDED_BRANCH}"
+else
+  log "musl-embedded already present, leaving as-is"
+fi
+
+# --- Clone ELD under llvm/tools (if absent) ---
+if [[ ! -d "${REPO_ROOT}/llvm/tools/eld/.git" ]]; then
+  log "Cloning ELD to ${REPO_ROOT}/llvm/tools/eld"
+  git clone "${ELD_REPO_URL}" "${SRC_DIR}/llvm/tools/eld" -b "${ELD_BRANCH}"
+  # Pin ELD to known commit
+  pushd "${SRC_DIR}/llvm/tools/eld" >/dev/null
+  git checkout "65ea860802c41ef5c0becff9750a350495de27b0"
+  popd >/dev/null
+else
+  log "ELD already present under llvm/tools, leaving as-is"
+fi
+
+# --- Apply patches ---
+log "Applying patches"
+python3 "${SRC_DIR}/qualcomm-software/embedded/tools/patchctl.py" apply -f "${SRC_DIR}/qualcomm-software/embedded/patchsets.yml"
+
+# --- Build LLVM ---
+log "Configuring LLVM"
+mkdir -p "${BUILD_DIR}/llvm"
+pushd "${BUILD_DIR}/llvm" >/dev/null
+cmake -G Ninja -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
+  -DLLVM_TARGETS_TO_BUILD="ARM;AArch64" \
+  -DLLVM_EXTERNAL_PROJECTS="eld" \
+  -DLLVM_EXTERNAL_ELD_SOURCE_DIR="llvm/tools/eld" \
+  -DLLVM_DEFAULT_TARGET_TRIPLE="aarch64-unknown-linux-gnueabi" \
+  -DLLVM_TARGET_ARCH="arm-linux-gnueabi" \
+  -DLLVM_BUILD_RUNTIME="OFF" \
+  -DLIBCLANG_BUILD_STATIC="ON" -DLLVM_POLLY_LINK_INTO_TOOLS="ON" \
+  -DCMAKE_C_COMPILER="clang" -DCMAKE_CXX_COMPILER="clang++" \
+  -DCMAKE_CXX_FLAGS="-stdlib=libc++" \
+  -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+  -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+  -DLLVM_ENABLE_PROJECTS="llvm;clang;polly;lld;mlir" \
+  "${SRC_DIR}/llvm"
+
+log "Building LLVM"
+ninja
+log "Installing LLVM"
+ninja install
+popd >/dev/null
+
+if [[ "${SKIP_TESTS}" != "true" ]]; then
+  log "Running LLVM tests"
+  (cd "${BUILD_DIR}/llvm" && ninja check-llvm check-lld check-polly check-eld check-clang)
+else
+  warn "Skipping tests"
+fi
+
+# --- Compute clang resource dir ---
+RESOURCE_DIR="$("${INSTALL_DIR}/bin/clang" -print-resource-dir)"
+log "RESOURCE_DIR=${RESOURCE_DIR}"
+
+# --- Build compiler-rt for ARM ---
+log "Building compiler-rt for ARM"
+mkdir -p "${COMPILER_RT_ARM32_LINUX_BUILDDIR}"
+pushd "${COMPILER_RT_ARM32_LINUX_BUILDDIR}" >/dev/null
+cmake -G Ninja \
+  -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+  -DCMAKE_TRY_COMPILE_TARGET_TYPE="STATIC_LIBRARY" \
+  -DCMAKE_ASM_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+  -DCMAKE_C_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+  -DCMAKE_CXX_COMPILER_TARGET="${ARM32_LINUX_TRIPLE}" \
+  -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+  -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+  -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+  -DCMAKE_C_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+  -DCMAKE_CXX_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+  -DCMAKE_ASM_FLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+  -DCMAKE_SYSTEM_NAME="Generic" \
+  -DCOMPILER_RT_BUILD_BUILTINS="ON" \
+  -DCOMPILER_RT_BUILD_LIBFUZZER="OFF" \
+  -DCOMPILER_RT_DEFAULT_TARGET_ONLY="ON" \
+  -DCOMPILER_RT_OS_DIR="linux" \
+  -DCOMPILER_RT_TEST_TARGET_TRIPLE="${ARM32_LINUX_TRIPLE}" \
+  -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_ARM32_LINUX_FLAGS}" \
+  -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+  -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+  -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+  -DCXX_SUPPORTS_UNWINDLIB_NONE_FLAG:BOOL="OFF" \
+  "${SRC_DIR}/compiler-rt"
+ninja
+ninja install
+popd >/dev/null
+
+# --- Build compiler-rt for ARM baremetal ---
+log "Building compiler-rt for ARM baremetal"
+mkdir -p "${COMPILER_RT_ARM32_BM_BUILDDIR}"
+pushd "${COMPILER_RT_ARM32_BM_BUILDDIR}" >/dev/null
+cmake -G Ninja \
+    -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+    -DCMAKE_TRY_COMPILE_TARGET_TYPE="STATIC_LIBRARY" \
+    -DCMAKE_ASM_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+    -DCMAKE_C_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+    -DCMAKE_CXX_COMPILER_TARGET="${ARM32_BM_TRIPLE}" \
+    -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+    -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+    -DCMAKE_C_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+    -DCMAKE_ASM_FLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+    -DCMAKE_SYSTEM_NAME="Generic" \
+    -DCOMPILER_RT_BAREMETAL_BUILD="ON" \
+    -DCOMPILER_RT_BUILD_BUILTINS="ON" \
+    -DCOMPILER_RT_BUILD_LIBFUZZER="OFF" \
+    -DCOMPILER_RT_BUILD_PROFILE="OFF" \
+    -DCOMPILER_RT_BUILD_SANITIZERS="OFF" \
+    -DCOMPILER_RT_BUILD_XRAY="OFF" \
+    -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${ARM32_BM_TRIPLE}" \
+    -DCOMPILER_RT_OS_DIR="baremetal" \
+    -DCOMPILER_RT_TEST_TARGET_TRIPLE="${ARM32_BM_TRIPLE}" \
+    -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_ARM32_BM_FLAGS}" \
+    -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+    -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+    -DCXX_SUPPORTS_UNWINDLIB_NONE_FLAG:BOOL="OFF" \
+    "${SRC_DIR}/compiler-rt"
+ninja -t clean all
+ninja
+ninja install
+popd >/dev/null
+
+# --- Build compiler-rt for AArch64 ---
+log "Building compiler-rt for AArch64"
+mkdir -p "${COMPILER_RT_AARCH64_LINUX_BUILDDIR}"
+pushd "${COMPILER_RT_AARCH64_LINUX_BUILDDIR}" >/dev/null
+cmake -G Ninja \
+    -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+    -DCMAKE_TRY_COMPILE_TARGET_TYPE="STATIC_LIBRARY" \
+    -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+    -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+    -DCMAKE_C_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+    -DCMAKE_ASM_FLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+    -DCMAKE_SYSTEM_NAME="Generic" \
+    -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${AARCH64_LINUX_TRIPLE}" \
+    -DCOMPILER_RT_OS_DIR="linux" \
+    -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_AARCH64_LINUX_FLAGS}" \
+    -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+    -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+    "${SRC_DIR}/compiler-rt"
+ninja
+ninja install
+popd >/dev/null
+
+# --- Build compiler-rt for AArch64 baremetal ---
+log "Building compiler-rt for AArch64 baremetal"
+mkdir -p "${COMPILER_RT_AARCH64_BM_BUILDDIR}"
+pushd "${COMPILER_RT_AARCH64_BM_BUILDDIR}" >/dev/null
+cmake -G Ninja \
+    -DCMAKE_INSTALL_PREFIX="${RESOURCE_DIR}" \
+    -DCMAKE_TRY_COMPILE_TARGET_TYPE="STATIC_LIBRARY" \
+    -DCMAKE_C_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCMAKE_CXX_COMPILER="${INSTALL_DIR}/bin/clang++" \
+    -DCMAKE_PREFIX_PATH="${INSTALL_DIR}" \
+    -DCMAKE_C_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+    -DCMAKE_ASM_FLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+    -DCMAKE_SYSTEM_NAME="Generic" \
+    -DCOMPILER_RT_BAREMETAL_BUILD="ON" \
+    -DCOMPILER_RT_BUILD_BUILTINS="ON" \
+    -DCOMPILER_RT_BUILD_LIBFUZZER="OFF" \
+    -DCOMPILER_RT_BUILD_PROFILE="OFF" \
+    -DCOMPILER_RT_BUILD_SANITIZERS="OFF" \
+    -DCOMPILER_RT_BUILD_XRAY="OFF" \
+    -DCOMPILER_RT_DEFAULT_TARGET_TRIPLE="${AARCH64_BM_TRIPLE}" \
+    -DCOMPILER_RT_OS_DIR="baremetal" \
+    -DCOMPILER_RT_TEST_TARGET_TRIPLE="${AARCH64_BM_TRIPLE}" \
+    -DCOMPILER_RT_TEST_COMPILER="${INSTALL_DIR}/bin/clang" \
+    -DCOMPILER_RT_TEST_COMPILER_CFLAGS="${COMPILER_RT_AARCH64_BM_FLAGS}" \
+    -DCMAKE_BUILD_TYPE="${BUILD_MODE}" \
+    -DLLVM_ENABLE_ASSERTIONS:BOOL="${ASSERTION_MODE}" \
+    "${SRC_DIR}/compiler-rt"
+ninja -t clean all
+ninja
+ninja install
+popd >/dev/null
+
+# --- Build musl-embedded ---
+export PATH="${INSTALL_DIR}/bin:${PATH}"
+log "Building musl-embedded"
+MUSL_BUILDDIR="${WORKSPACE}/musl-embedded"
+source "${MUSL_BUILDDIR}/qualcomm-software/config/component_list.sh"
+for lib in ${musl_components[*]}; do
+  libName="$(echo "${lib}" | awk -F".sh," '{print $1}')"
+  dirName="$(echo "${lib}" | awk -F"," '{print $2}')"
+  pushd "${MUSL_BUILDDIR}" >/dev/null
+  make distclean
+  bash -x "${MUSL_BUILDDIR}/qualcomm-software/config/linux/arm/${libName}.sh" --prefix="${INSTALL_DIR}/${dirName}/libc"
+  make -j"${JOBS}"
+  make install
+  popd >/dev/null
+done
+
+# --- c++ libs ---
+echo "Build c++ libs ..."
+
+declare -A Triples
+Triples["aarch64-none-elf"]="aarch64-none-elf"
+Triples["aarch64-pacret-b-key-bti-none-elf"]="aarch64-none-elf"
+Triples["armv7-none-eabi"]="armv7-none-eabi"
+declare -A CFLAGS
+CFLAGS["aarch64-none-elf"]="-mcpu=cortex-a53 -nostartfiles"
+CFLAGS["aarch64-pacret-b-key-bti-none-elf"]="-mcpu=cortex-a53 -nostartfiles -march=armv8.5-a -mbranch-protection=pac-ret+leaf+b-key+bti"
+CFLAGS["armv7-none-eabi"]="-mcpu=cortex-a9 -mthumb -specs=nosys.specs"
+CFLAGS_RELEASE="-Os -DNDEBUG"
+for VARIANT in "aarch64-none-elf" "aarch64-pacret-b-key-bti-none-elf" "armv7-none-eabi"; do
+    TRIPLE="${Triples[${VARIANT}]}"
+    MUSL_INC="${INSTALL_DIR}/${TRIPLE}/libc/include"
+    CMAKE_CFLAGS="-target ${TRIPLE} -nostdinc -isystem ${MUSL_INC} -ccc-gcc-name ${TRIPLE}-g++ -fno-unroll-loops -fno-optimize-sibling-calls -ffunction-sections -fdata-sections -fno-exceptions -D_GNU_SOURCE ${CFLAGS[${VARIANT}]}"
+    mkdir -p "${BUILD_DIR}/${VARIANT}"
+    pushd "${BUILD_DIR}/${VARIANT}" >/dev/null
+    cmake -G Ninja -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}/${VARIANT}" -DCMAKE_BUILD_TYPE="Release" -DCMAKE_C_COMPILER="clang" -DCMAKE_CXX_COMPILER="clang++" \
+        -DHAVE_LIBCXXABI="True" -DCMAKE_SYSTEM_NAME="Generic" \
+        -DCMAKE_C_FLAGS_RELEASE="${CFLAGS_RELEASE}" \
+        -DCMAKE_CXX_FLAGS_RELEASE="${CFLAGS_RELEASE}" \
+        -DCMAKE_C_FLAGS="${CMAKE_CFLAGS}" \
+        -DCMAKE_CXX_FLAGS="${CMAKE_CFLAGS}" \
+        -DCMAKE_ASM_FLAGS="${CMAKE_CFLAGS}" \
+        -DCMAKE_TRY_COMPILE_TARGET_TYPE="STATIC_LIBRARY" \
+        -DLIBCXX_ENABLE_SHARED="False" \
+        -DLIBCXX_SHARED_OUTPUT_NAME="c++-shared" \
+        -DLIBCXX_ENABLE_EXCEPTIONS="False" \
+        -DLIBCXX_HAS_MUSL_LIBC="True" \
+        -DLIBCXX_ENABLE_ABI_LINKER_SCRIPT="False" \
+        -DLIBCXX_ENABLE_THREADS="False" \
+        -DLIBCXX_ENABLE_FILESYSTEM="False" \
+        -DLIBCXX_ENABLE_RANDOM_DEVICE="False" \
+        -DLIBCXX_ENABLE_LOCALIZATION="False" \
+        -DLIBCXX_SUPPORTS_STD_EQ_CXX11_FLAG="ON" \
+        -DLIBCXX_SUPPORTS_STD_EQ_CXX14_FLAG="ON" \
+        -DLIBCXX_SUPPORTS_STD_EQ_CXX17_FLAG="ON" \
+        -DLIBCXX_QUIC_BAREMETAL="ON" \
+        -DLIBCXXABI_USE_LLVM_UNWINDER="True" \
+        -DLIBCXXABI_BAREMETAL="True" \
+        -DLIBCXXABI_ENABLE_SHARED="False" \
+        -DLIBCXXABI_SHARED_OUTPUT_NAME="c++abi-shared" \
+        -DLIBCXXABI_ENABLE_WERROR="True" \
+        -DLIBCXXABI_ENABLE_THREADS="False" \
+        -DLIBCXXABI_ENABLE_ASSERTIONS="False" \
+        -DLIBCXXABI_ENABLE_EXCEPTIONS="False" \
+        -DLIBUNWIND_IS_BAREMETAL="True" \
+        -DLIBUNWIND_ENABLE_SHARED="False" \
+        -DLIBUNWIND_SHARED_OUTPUT_NAME="unwind-shared" \
+        -DUNIX="True" \
+        -S "${SRC_DIR}/runtimes" "-DLLVM_ENABLE_RUNTIMES=libcxx;libcxxabi;libunwind"
+    ninja -t clean all
+    ninja
+    ninja install
+    popd >/dev/null
+echo "c++ libs install ..."
+done
+echo "Build and installation complete."
+
+# --- Create artifact ('/' in ELD_BRANCH is replaced so the name is not a path) ---
+log "Creating artifact tarball"
+pushd "${INSTALL_DIR}" >/dev/null
+short_sha="$(git -C "${SRC_DIR}" rev-parse --short HEAD)"
+tar_file="${ELD_BRANCH//\//-}_${short_sha}_$(date +%Y%m%d).tgz"
+tar -czvf "${BUILD_DIR}/${tar_file}" "."
+popd >/dev/null
+
+if [[ -n "${ARTIFACT_DIR}" ]]; then
+  mkdir -p "${ARTIFACT_DIR}"
+  cp "${BUILD_DIR}/${tar_file}" "${ARTIFACT_DIR}/"
+  log "Artifact copied to ${ARTIFACT_DIR}/${tar_file}"
+else
+  warn "Artifact left at ${BUILD_DIR}/${tar_file}"
+fi
diff --git a/qualcomm-software/embedded/tools/patchctl.py b/qualcomm-software/embedded/tools/patchctl.py
new file mode 100644
index 000000000000..d6934547743c
--- /dev/null
+++ b/qualcomm-software/embedded/tools/patchctl.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse, hashlib, json, os, subprocess, sys, textwrap
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+PY_MIN = (3, 8)
+if sys.version_info < PY_MIN:
+    print(f"Python {PY_MIN[0]}.{PY_MIN[1]}+ is required.", file=sys.stderr)
+    sys.exit(2)
+
+def sh(*cmd: str, cwd: Optional[Path] = None, check: bool = False, capture: bool = False):
+    return subprocess.run(cmd, cwd=str(cwd) if cwd else None,
+                          text=True, capture_output=capture, check=check)
+
+def git(*args: str, cwd: Path, check: bool = False, capture: bool = False):
+    return sh("git", *args, cwd=cwd, check=check, capture=capture)
+
+def load_yaml(path: Path) -> dict:
+    """Load a JSON-compatible YAML manifest, skipping whole-line '#' comments."""
+    # Parsed with json only, so the file must be JSON-in-YAML.
+    content = path.read_text(encoding="utf-8")
+    lines = [ln for ln in content.splitlines() if not ln.strip().startswith("#")]
+    try:
+        return json.loads("\n".join(lines))
+    except json.JSONDecodeError as err:
+        raise SystemExit(f"Use JSON-compatible YAML for {path} or switch to a YAML parser.") from err
+
+def compute_series_hash(files: List[Path]) -> str:
+    h = hashlib.sha256()
+    for p in files:
+        h.update(p.name.encode())
+        h.update(p.read_bytes())
+    return h.hexdigest()[:12]
+
+@dataclass
+class PatchSet:
+    name: str
+    repo: Path
+    patch_dir: Path
+    method: str
+    three_way: bool
+    restore_on_fail: bool
+    ignore_whitespace: bool
+    reset_to: str
+    ensure_identity: Optional[dict]
+
+@dataclass
+class RepoSnapshot:
+    head: str
+
+STATE_DIR = Path(".git") / "patchctl"
+STATE_FILE = STATE_DIR / "state.json"
+
+def collect_patches(d: Path) -> List[Path]:
+    if not d.exists():
+        return []
+    return sorted([p for p in d.glob("*.patch") if p.is_file()])
+
+def preflight_apply_check(repo: Path, patches: List[Path], three_way: bool, ignore_ws: bool) -> bool:
+    # Dry-run using 'git apply --check' for each patch
+    for patch in patches:
+        cmd = ["apply", "--check"]
+        if ignore_ws: cmd.append("--ignore-whitespace")
+        if three_way: cmd.append("--3way")
+        cmd.append(str(patch))
+        rc = git(*cmd, cwd=repo).returncode
+        if rc != 0:
+            print(f"[preflight] {repo} would fail on: {patch.name}")
+            return False
+    return True
+
+def read_state(repo: Path) -> dict:
+    f = repo / STATE_FILE
+    if f.exists():
+        try:
+            return json.loads(f.read_text(encoding="utf-8"))
+        except Exception:
+            return {}
+    return {}
+
+def write_state(repo: Path, data: dict):
+    f = repo / STATE_FILE
+    (repo / STATE_DIR).mkdir(parents=True, exist_ok=True)
+    f.write_text(json.dumps(data, indent=2), encoding="utf-8")
+
+def snapshot_repo(repo: Path) -> RepoSnapshot:
+    head = git("rev-parse", "HEAD", cwd=repo, capture=True).stdout.strip()
+    return RepoSnapshot(head=head)
+
+def reset_repo(repo: Path, to: str):
+    git("reset", "--hard", to, cwd=repo, check=True)
+    git("clean", "-fdx", cwd=repo, check=True)
+
+def already_applied(repo: Path, patchset_name: str, series_hash: str) -> bool:
+    st = read_state(repo)
+    return st.get("patchsets", {}).get(patchset_name, "") == series_hash
+
+def stamp_applied(repo: Path, patchset_name: str, series_hash: str):
+    st = read_state(repo)
+    ps = st.get("patchsets", {})
+    ps[patchset_name] = series_hash
+    st["patchsets"] = ps
+    write_state(repo, st)
+
+def apply_with_am(repo: Path, patches: List[Path], three_way: bool, ignore_ws: bool) -> int:
+    args = ["am", "-k"]
+    if three_way: args.append("--3way")
+    if ignore_ws: args.append("--ignore-whitespace")
+    args += [str(p) for p in patches]
+    r = git(*args, cwd=repo, capture=True)
+    sys.stdout.write(r.stdout or "")
+    sys.stderr.write(r.stderr or "")
+    if r.returncode != 0:
+        # try abort if needed
+        git("am", "--abort", cwd=repo)
+    return r.returncode
+
+def apply_with_apply(repo: Path, patches: List[Path], three_way: bool, ignore_ws: bool) -> int:
+    """Apply patches one at a time with 'git apply', then commit the result.
+
+    Each patch is dry-run with 'git apply --check' before it is applied. If
+    either step fails, the patches applied so far are reverse-applied (newest
+    first) and 1 is returned. Otherwise everything is staged and committed and
+    0 is returned; a failing 'git add'/'git commit' raises CalledProcessError.
+    """
+    # Flags shared by the check, apply and reverse invocations.
+    opts: List[str] = []
+    if ignore_ws: opts.append("--ignore-whitespace")
+    if three_way: opts.append("--3way")
+    # Patches applied so far, kept for rollback.
+    applied: List[Path] = []
+    for p in patches:
+        ok = git("apply", "--check", *opts, str(p), cwd=repo).returncode == 0
+        # Only attempt the real apply when the dry run succeeded.
+        ok = ok and git("apply", *opts, str(p), cwd=repo).returncode == 0
+        if not ok:
+            # Roll back for either kind of failure so that earlier patches of
+            # the series are not left behind in the working tree.
+            for prev in reversed(applied):
+                git("apply", "--reverse", *opts, str(prev), cwd=repo)
+            return 1
+        applied.append(p)
+    # Stage and commit to make the changes durable.
+    git("add", "-A", cwd=repo, check=True)
+    msg = f"patchctl: applied {len(applied)} patches"
+    git("commit", "-m", msg, cwd=repo, check=True)
+    return 0
+
+def parse_manifest(mpath: Path) -> List[PatchSet]:
+    """Load the manifest at ``mpath`` and build one ``PatchSet`` per entry.
+
+    The optional top-level ``defaults`` mapping supplies fallback values for
+    ``method`` ("am"), ``three_way`` (True), ``restore_on_fail`` (True),
+    ``ignore_whitespace`` (True) and ``ensure_identity`` (None); a key set on
+    an individual patchset overrides the default. ``repo`` and ``patches``
+    are resolved relative to the directory containing the manifest.
+
+    Args:
+        mpath: Path of the manifest file, read via ``load_yaml``.
+
+    Returns:
+        The patchsets in manifest order; empty when none are listed.
+
+    Raises:
+        KeyError: If an entry lacks ``name``, ``repo`` or ``patches``.
+    """
+    # A key that is present but has no value (or an empty document) is
+    # assumed to load as None, so coalesce instead of relying on the
+    # ``.get`` default, which only covers a missing key.
+    doc = load_yaml(mpath) or {}
+    defaults = doc.get("defaults") or {}
+    defm = defaults.get("method", "am")
+    thw = bool(defaults.get("three_way", True))
+    rof = bool(defaults.get("restore_on_fail", True))
+    igw = bool(defaults.get("ignore_whitespace", True))
+    ident = defaults.get("ensure_identity", None)
+    out: List[PatchSet] = []
+    base = mpath.parent
+    for ps in doc.get("patchsets") or []:
+        out.append(PatchSet(
+            name=ps["name"],
+            repo=(base / ps["repo"]).resolve(),
+            patch_dir=(base / ps["patches"]).resolve(),
+            method=ps.get("method", defm),
+            three_way=bool(ps.get("three_way", thw)),
+            restore_on_fail=bool(ps.get("restore_on_fail", rof)),
+            ignore_whitespace=bool(ps.get("ignore_whitespace", igw)),
+            # Normalize a missing or null reset target to the empty string.
+            reset_to=str(ps.get("reset_to", "") or ""),
+            ensure_identity=ps.get("ensure_identity", ident),
+        ))
+    return out
+
+def cmd_apply(manifest: Path) -> int:
+    """Apply every patchset listed in ``manifest``, all-or-nothing across repos.
+
+    Phases:
+      1. Collect: for each patchset, verify the repo has a ``.git`` entry,
+         gather its patches, skip it when there are none or when the series
+         is already recorded as applied, and snapshot the repo.
+      2. Preflight: optionally reset the repo to ``reset_to``, optionally
+         ensure the configured identity, then check that the patches apply.
+         On failure, the repos handled so far are reset to their snapshot.
+      3. Apply: apply each patchset with ``apply_with_am`` (method "am") or
+         ``apply_with_apply`` (any other method). On failure, this repo and
+         every previously applied repo are reset to their snapshot.
+      4. Stamp: once all patchsets applied, record each series as applied.
+
+    Args:
+        manifest: Path of the manifest file handed to ``parse_manifest``.
+
+    Returns:
+        0 on success, 1 when a preflight check fails, 2 when a patchset's
+        repo is not a git repository, otherwise the non-zero code returned
+        by the failing apply step.
+
+    NOTE(review): ``ps.restore_on_fail`` is not consulted here; rollback is
+    unconditional -- confirm whether that is intended.
+    """
+    patchsets = parse_manifest(manifest)
+
+    # Collect all patches and snapshot repos first (transaction-friendly).
+    repo_to_data = []
+    for ps in patchsets:
+        if not (ps.repo / ".git").exists():
+            print(f"[error] Not a git repo: {ps.repo}")
+            return 2
+        patches = collect_patches(ps.patch_dir)
+        if not patches:
+            print(f"[info] no patches for {ps.name} at {ps.patch_dir} — skipping")
+            continue
+        series_hash = compute_series_hash(patches)
+        if already_applied(ps.repo, ps.name, series_hash):
+            print(f"[skip] {ps.name} already applied (series {series_hash})")
+            continue
+        snap = snapshot_repo(ps.repo)
+        repo_to_data.append((ps, patches, series_hash, snap))
+
+    # preflight all
+    touched = []  # (patchset, snapshot) for repos this phase has handled
+    for (ps, patches, _, snap) in repo_to_data:
+        touched.append((ps, snap))
+        if ps.reset_to:
+            # Announce the reset where it actually happens.
+            print(f"[info] resetting {ps.name} to {ps.reset_to}")
+            reset_repo(ps.repo, ps.reset_to)
+        if ps.ensure_identity:
+            ensure_identity(ps.repo, ps.ensure_identity)
+        if not preflight_apply_check(ps.repo, patches, ps.three_way, ps.ignore_whitespace):
+            print(f"[preflight] failed for {ps.name}. Aborting.")
+            # Restore only the repos handled so far; repos later in the list
+            # have not been modified and are left alone.
+            for (seen_ps, seen_snap) in touched:
+                reset_repo(seen_ps.repo, seen_snap.head)
+            return 1
+
+    # apply, transactional across repos
+    applied_ok: List[tuple[PatchSet, str, RepoSnapshot]] = []
+    for (ps, patches, series_hash, snap) in repo_to_data:
+        print(f"[apply] {ps.name}: {len(patches)} patches via {ps.method}")
+        if ps.method == "am":
+            rc = apply_with_am(ps.repo, patches, ps.three_way, ps.ignore_whitespace)
+        else:
+            rc = apply_with_apply(ps.repo, patches, ps.three_way, ps.ignore_whitespace)
+
+        if rc != 0:
+            print(f"[fail] {ps.name} (rc={rc}). Rolling back previously-applied repos...")
+            # rollback those already done
+            for (done_ps, _, done_snap) in applied_ok:
+                reset_repo(done_ps.repo, done_snap.head)
+            # and rollback this one
+            reset_repo(ps.repo, snap.head)
+            return rc
+        # success
+        applied_ok.append((ps, series_hash, snap))
+        print(f"[ok] {ps.name} applied (series {series_hash})")
+
+    # Record the series only after every repo succeeded, so a repo that gets
+    # rolled back because a later one failed is never left marked as applied.
+    for (done_ps, done_hash, _) in applied_ok:
+        stamp_applied(done_ps.repo, done_ps.name, done_hash)
+
+    print("[done] all patchsets applied")
+    return 0
+
+def main(argv: List[str]) -> int:
+    """Parse ``argv`` and run the selected patchctl subcommand.
+
+    Args:
+        argv: Command-line arguments without the program name.
+
+    Returns:
+        The exit code of ``cmd_apply`` for the ``apply`` subcommand, 0 for
+        any other parsed command.
+    """
+    parser = argparse.ArgumentParser(
+        prog="patchctl",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent("""
+        patchctl — cross-platform patch orchestrator for multi-repo workspaces.
+
+        Typical usage:
+          patchctl apply -f embedded/patchsets.yaml
+        """)
+    )
+    # A subcommand is mandatory; its name is stored in ``cmd``.
+    subcommands = parser.add_subparsers(dest="cmd", required=True)
+    apply_parser = subcommands.add_parser(
+        "apply",
+        help="Apply patchsets defined in the manifest",
+    )
+    apply_parser.add_argument(
+        "-f",
+        "--file",
+        required=True,
+        type=Path,
+        help="Manifest path (YAML/JSON)",
+    )
+    parsed = parser.parse_args(argv)
+
+    if parsed.cmd != "apply":
+        return 0
+    return cmd_apply(parsed.file)
+
+# Script entry point: run the CLI and use its return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
+