diff --git a/bindings/go/README.txt b/bindings/go/README.txt index 2fc4afa07715..6ed224d8280e 100644 --- a/bindings/go/README.txt +++ b/bindings/go/README.txt @@ -51,3 +51,11 @@ CGO_CPPFLAGS, CGO_CXXFLAGS and CGO_LDFLAGS environment variables: $ export CGO_CXXFLAGS=-std=c++11 $ export CGO_LDFLAGS="`/path/to/llvm-build/bin/llvm-config --ldflags --libs --system-libs all`" $ go build -tags byollvm + +If you see a compilation error while compiling your code with Go 1.9.4 or later as follows, + + go build llvm.org/llvm/bindings/go/llvm: invalid flag in #cgo LDFLAGS: -Wl,-headerpad_max_install_names + +you need to setup $CGO_LDFLAGS_ALLOW to allow a compiler to specify some linker options: + + $ export CGO_LDFLAGS_ALLOW='-Wl,(-search_paths_first|-headerpad_max_install_names)' diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 949ec85c270b..ed867d5a9dc0 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -5,12 +5,6 @@ LLVM 6.0.0 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 6 release. - Release notes for previous releases can be found on - `the Download Page `_. - - Introduction ============ @@ -26,19 +20,14 @@ have questions or comments, the `LLVM Developer's Mailing List `_ is a good place to send them. -Note that if you are reading this file from a Subversion checkout or the main -LLVM web page, this document applies to the *next* release, not the current -one. To see the release notes for a specific release, please see the `releases -page `_. - Non-comprehensive list of changes in this release ================================================= -.. NOTE - For small 1-3 sentence descriptions, just add an entry at the end of - this list. If your description won't fit comfortably in one bullet - point (e.g. maybe you would like to give an example of the - functionality, or simply have a lot to talk about), see the `NOTE` below - for adding a new subsection. + +* Support for `retpolines `_ + was added to help mitigate "branch target injection" (variant #2) of the + "Spectre" speculative side channels described by `Project Zero + `_ + and the `Spectre paper `_. * The ``Redirects`` argument of ``llvm::sys::ExecuteAndWait`` and ``llvm::sys::ExecuteNoWait`` was changed to an ``ArrayRef`` of optional @@ -56,34 +45,33 @@ Non-comprehensive list of changes in this release * Significantly improved quality of CodeView debug info for Windows. -* Note.. +* Preliminary support for Sanitizers and sibling features on X86(_64) NetBSD + (ASan, UBsan, TSan, MSan, SafeStack, libFuzzer). -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). - - Special New Feature - ------------------- - - Makes programs 10x faster by doing Special New Thing. Changes to the LLVM IR ---------------------- +* The fast-math-flags (FMF) have been updated. Previously, the 'fast' flag + indicated that floating-point reassociation was allowed and all other flags + were set too. The 'fast' flag still exists, but there is a new flag called + 'reassoc' to indicate specifically that reassociation is allowed. A new bit + called 'afn' was also added to selectively allow approximations for common + mathlib functions like square-root. The new flags provide more flexibility + to enable/disable specific floating-point optimizations. 
Making the + optimizer respond appropriately to these flags is an ongoing effort. + + Changes to the AArch64 Target ----------------------------- -During this release: +* Enabled the new GlobalISel instruction selection framework by default at ``-O0``. - * Enabled the new GlobalISel instruction selection framework by default at ``-O0``. Changes to the ARM Target ------------------------- -During this release the ARM target has: - -* Got support for enabling SjLj exception handling on platforms where it +* Support for enabling SjLj exception handling on platforms where it isn't the default. @@ -92,12 +80,12 @@ Changes to the Hexagon Target * The Hexagon backend now supports V65 ISA. -* The ``-mhvx`` option now takes an optional value that specified the ISA +* The ``-mhvx`` option now takes an optional value that specifies the ISA version of the HVX coprocessor. The available values are v60, v62 and v65. By default, the value is set to be the same as the CPU version. * The compiler option ``-mhvx-double`` is deprecated and will be removed in - the next release of the compiler. Programmers should use ``-mhvx-length`` + the next release of the compiler. Programmers should use the ``-mhvx-length`` option to specify the desired vector length: ``-mhvx-length=64b`` for 64-byte vectors and ``-mhvx-length=128b`` for 128-byte vectors. While the current default vector length is 64 bytes, users should always specify the @@ -112,14 +100,46 @@ Changes to the Hexagon Target Changes to the MIPS Target -------------------------- - During this release ... +Fixed numerous bugs: + +* fpowi on MIPS64 giving incorrect results when used with a negative integer. +* Usage of the asm 'c' constraint with the wrong datatype causing an + assert/crash. +* Fixed a conversion bug when using the DSP ASE. +* Fixed an inconsistency where objects were not marked as using the microMIPS as + when the micromips function attribute or the ".set micromips" directive was + used. +* Reordered the MIPSR6 specific hazard scheduler pass to after the delay slot + filler, fixing a class of rare edge case bugs where the delay slot filler + would violate ISA restrictions. +* Fixed a crash when using a type of unknown size with gp relative addressing. +* Corrected the j macro for microMIPS. +* Corrected the encoding of movep for microMIPS32r6. +* Fixed an issue with the usage of insert instructions having an invalid set of + operands. +* Fixed an issue where TLS symbols were not marked as such. +* Enabled the usage of register scavenging with MSA, due to its shorter offsets + for loads and stores. +* Corrected the ELF headers when using the DSP ASE. + +New features: + +* The long branch pass now generates some R6 specific instructions when + targeting MIPSR6. +* The delay slot filler now performs more branch conversions if delay slots + cannot be filled. +* The MIPS MT ASE is now fully supported. +* Added support for the ``lapc`` pseudo instruction. +* Improved the selection of multiple instructions (``dext``, ``nmadd``, + ``nmsub``). +* Further improved microMIPS codesize reduction. + +Deprecation notices: + +* microMIPS64R6 support was been deprecated since 5.0, and has now been + completely removed. -Changes to the PowerPC Target ------------------------------ - - During this release ... - Changes to the SystemZ Target ----------------------------- @@ -132,36 +152,66 @@ During this release the SystemZ target has: Changes to the X86 Target ------------------------- -During this release ... 
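Referring back to the fast-math-flags note above, a minimal sketch (editorial,
not part of this patch) of how a frontend can now request only reassociation
and approximate math-library calls through the C++ API instead of the
all-or-nothing 'fast' flag::

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Tag only the new 'reassoc' and 'afn' flags on subsequently built FP ops.
  void buildReassocOnlyAdd(IRBuilder<> &B, Value *X, Value *Y) {
    FastMathFlags FMF;
    FMF.setAllowReassoc();            // 'reassoc': reassociation only
    FMF.setApproxFunc();              // 'afn': approximate libm functions
    B.setFastMathFlags(FMF);          // applies to ops created from here on
    Value *Sum = B.CreateFAdd(X, Y);  // emitted as: fadd reassoc afn ...
    (void)Sum;
  }
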
+During this release the X86 target has: -* Got support for enabling SjLj exception handling on platforms where it +* Added support for enabling SjLj exception handling on platforms where it isn't the default. -Changes to the AMDGPU Target ------------------------------ +* Added intrinsics for Intel Extensions: VAES, GFNI, VPCLMULQDQ, AVX512VBMI2, AVX512BITALG, AVX512VNNI. - During this release ... +* Added support for Intel Icelake CPU. -Changes to the AVR Target ------------------------------ +* Fixed some X87 codegen bugs. - During this release ... +* Added instruction scheduling information for Intel Sandy Bridge, Ivy Bridge, Haswell, Broadwell, and Skylake CPUs. -Changes to the OCaml bindings ------------------------------ +* Improved scheduler model for AMD Jaguar CPUs. + +* Improved llvm-mc's disassembler for some EVEX encoded instructions. + +* Add support for i8 and i16 vector signed/unsigned min/max horizontal reductions. + +* Improved codegen for memory comparisons + +* Improved codegen for i32 vector multiplies + +* Improved codegen for scalar integer absolute values + +* Improved codegen for vector integer rotations (XOP and AVX512) - During this release ... +* Improved codegen of data being transferred between GPRs and K-registers. +* Improved codegen for vector truncations. -Changes to the C API --------------------- +* Improved folding of address computations into gather/scatter instructions. - During this release ... +* Gained initial support recognizing variable shuffles from vector element extracts and inserts. + +* Improved documentation for SSE/AVX intrinsics in intrin.h header files. + +* Gained support for emitting `retpolines + `_, including automatic + insertion of the necessary thunks or using external thunks. External Open Source Projects Using LLVM 6 ========================================== +LDC - the LLVM-based D compiler +------------------------------- + +`D `_ is a language with C-like syntax and static typing. It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC `_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM +and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64 +are underway. + JFS - JIT Fuzzing Solver ------------------------ @@ -188,21 +238,6 @@ import of .h symbols - even inline functions and macros. Zig uses LLD combined with lazily building compiler-rt to provide out-of-the-box cross-compiling for all supported targets. -LDC - the LLVM-based D compiler -------------------------------- - -`D `_ is a language with C-like syntax and static typing. It -pragmatically combines efficiency, control, and modeling power, with safety and -programmer productivity. D supports powerful concepts like Compile-Time Function -Execution (CTFE) and Template Meta-Programming, provides an innovative approach -to concurrency and offers many classical paradigms. - -`LDC `_ uses the frontend from the reference compiler -combined with LLVM as backend to produce efficient native code. LDC targets -x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM -and PowerPC (32/64 bit). 
Ports to other architectures like AArch64 and MIPS64 -are underway. - Additional Information ====================== diff --git a/docs/index.rst b/docs/index.rst index 47c2f0473931..de53b0df6906 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,6 @@ Overview ======== -.. warning:: - - If you are using a released version of LLVM, see `the download page - `_ to find your documentation. - The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small research projects. diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h index 70194c043479..01419d7ae2bf 100644 --- a/include/llvm/Bitcode/LLVMBitCodes.h +++ b/include/llvm/Bitcode/LLVMBitCodes.h @@ -395,6 +395,20 @@ enum OverflowingBinaryOperatorOptionalFlags { OBO_NO_SIGNED_WRAP = 1 }; +/// FastMath Flags +/// This is a fixed layout derived from the bitcode emitted by LLVM 5.0 +/// intended to decouple the in-memory representation from the serialization. +enum FastMathMap { + UnsafeAlgebra = (1 << 0), // Legacy + NoNaNs = (1 << 1), + NoInfs = (1 << 2), + NoSignedZeros = (1 << 3), + AllowReciprocal = (1 << 4), + AllowContract = (1 << 5), + ApproxFunc = (1 << 6), + AllowReassoc = (1 << 7) +}; + /// PossiblyExactOperatorOptionalFlags - Flags for serializing /// PossiblyExactOperator's SubclassOptionalData contents. enum PossiblyExactOperatorOptionalFlags { PEO_EXACT = 0 }; diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index bd6177c5b3d9..7c000e2b1dc7 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; + def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">, + Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty], + [IntrNoMem]>; + def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">, + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], + [IntrNoMem]>; def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">, Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty], [IntrNoMem]>; diff --git a/include/llvm/MC/MCAsmMacro.h b/include/llvm/MC/MCAsmMacro.h new file mode 100644 index 000000000000..dac8d1a80050 --- /dev/null +++ b/include/llvm/MC/MCAsmMacro.h @@ -0,0 +1,38 @@ +//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCASMMACRO_H +#define LLVM_MC_MCASMMACRO_H + +#include "llvm/MC/MCParser/MCAsmLexer.h" + +namespace llvm { + +struct MCAsmMacroParameter { + StringRef Name; + std::vector Value; + bool Required = false; + bool Vararg = false; + + MCAsmMacroParameter() = default; +}; + +typedef std::vector MCAsmMacroParameters; +struct MCAsmMacro { + StringRef Name; + StringRef Body; + MCAsmMacroParameters Parameters; + +public: + MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) + : Name(N), Body(B), Parameters(std::move(P)) {} +}; +} // namespace llvm + +#endif diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 432fc0ede072..358f67c4db6d 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/MC/MCAsmMacro.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SectionKind.h" @@ -268,6 +269,9 @@ namespace llvm { unsigned UniqueID, const MCSymbolELF *Associated); + /// \brief Map of currently defined macros. + StringMap MacroMap; + public: explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI, const MCObjectFileInfo *MOFI, @@ -618,6 +622,17 @@ namespace llvm { // FIXME: We should really do something about that. LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L, const Twine &Msg); + + const MCAsmMacro *lookupMacro(StringRef Name) { + StringMap::iterator I = MacroMap.find(Name); + return (I == MacroMap.end()) ? nullptr : &I->getValue(); + } + + void defineMacro(StringRef Name, MCAsmMacro Macro) { + MacroMap.insert(std::make_pair(Name, std::move(Macro))); + } + + void undefineMacro(StringRef Name) { MacroMap.erase(Name); } }; } // end namespace llvm diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h index 25175fe66aa8..9438c9e08850 100644 --- a/include/llvm/Support/GenericDomTreeConstruction.h +++ b/include/llvm/Support/GenericDomTreeConstruction.h @@ -698,24 +698,20 @@ struct SemiNCAInfo { return; // Recalculate the set of roots. - DT.Roots = FindRoots(DT, BUI); - for (const NodePtr R : DT.Roots) { - const TreeNodePtr TN = DT.getNode(R); - // A CFG node was selected as a tree root, but the corresponding tree node - // is not connected to the virtual root. This is because the incremental - // algorithm does not really know or use the set of roots and can make a - // different (implicit) decision about which nodes within an infinite loop - // becomes a root. - if (TN && !DT.isVirtualRoot(TN->getIDom())) { - DEBUG(dbgs() << "Root " << BlockNamePrinter(R) - << " is not virtual root's child\n" - << "The entire tree needs to be rebuilt\n"); - // It should be possible to rotate the subtree instead of recalculating - // the whole tree, but this situation happens extremely rarely in - // practice. - CalculateFromScratch(DT, BUI); - return; - } + auto Roots = FindRoots(DT, BUI); + if (DT.Roots.size() != Roots.size() || + !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) { + // The roots chosen in the CFG have changed. This is because the + // incremental algorithm does not really know or use the set of roots and + // can make a different (implicit) decision about which node within an + // infinite loop becomes a root. 
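      // Editorial illustration (not part of this change): in a function whose
      // CFG contains   a -> b -> a   with no path from the loop to an exit,
      // the post-dominator tree must attach one of {a, b} to the virtual
      // root, and the batch and incremental algorithms may legally pick
      // different ones, hence the full recalculation below.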
+ + DEBUG(dbgs() << "Roots are different in updated trees\n" + << "The entire tree needs to be rebuilt\n"); + // It may be possible to update the tree without recalculating it, but + // we do not know yet how to do it, and it happens rarely in practise. + CalculateFromScratch(DT, BUI); + return; } } diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index 750666136507..fb53647112f9 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" @@ -172,15 +173,25 @@ class RecurrenceDescriptor { Value *Left, Value *Right); /// Returns true if Phi is a reduction of type Kind and adds it to the - /// RecurrenceDescriptor. + /// RecurrenceDescriptor. If either \p DB is non-null or \p AC and \p DT are + /// non-null, the minimal bit width needed to compute the reduction will be + /// computed. static bool AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, - RecurrenceDescriptor &RedDes); - - /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor is - /// returned in RedDes. + RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, + AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr); + + /// Returns true if Phi is a reduction in TheLoop. The RecurrenceDescriptor + /// is returned in RedDes. If either \p DB is non-null or \p AC and \p DT are + /// non-null, the minimal bit width needed to compute the reduction will be + /// computed. static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes); + RecurrenceDescriptor &RedDes, + DemandedBits *DB = nullptr, + AssumptionCache *AC = nullptr, + DominatorTree *DT = nullptr); /// Returns true if Phi is a first-order recurrence. A first-order recurrence /// is a non-reduction recurrence relation in which the value of the @@ -218,24 +229,6 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is an arithmetic kind. static bool isArithmeticRecurrenceKind(RecurrenceKind Kind); - /// Determines if Phi may have been type-promoted. If Phi has a single user - /// that ANDs the Phi with a type mask, return the user. RT is updated to - /// account for the narrower bit width represented by the mask, and the AND - /// instruction is added to CI. - static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, - SmallPtrSetImpl &Visited, - SmallPtrSetImpl &CI); - - /// Returns true if all the source operands of a recurrence are either - /// SExtInsts or ZExtInsts. This function is intended to be used with - /// lookThroughAnd to determine if the recurrence has been type-promoted. The - /// source operands are added to CI, and IsSigned is updated to indicate if - /// all source operands are SExtInsts. - static bool getSourceExtensionKind(Instruction *Start, Instruction *Exit, - Type *RT, bool &IsSigned, - SmallPtrSetImpl &Visited, - SmallPtrSetImpl &CI); - /// Returns the type of the recurrence. This type can be narrower than the /// actual type of the Phi if the recurrence has been type-promoted. 
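  /// A typical use of the new optional analysis parameters (editorial sketch;
  /// assumes the caller already has DemandedBits, AssumptionCache and
  /// DominatorTree results named DB, AC and DT):
  /// \code
  ///   RecurrenceDescriptor RD;
  ///   if (RecurrenceDescriptor::isReductionPHI(Phi, L, RD, &DB, &AC, &DT))
  ///     Type *Narrow = RD.getRecurrenceType(); // narrowed if type-promoted
  /// \endcode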
Type *getRecurrenceType() { return RecurrenceType; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 10b5c74e378b..bfff7afb5b4e 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -205,6 +205,11 @@ static cl::opt cl::desc("Max coefficients in AddRec during evolving"), cl::init(16)); +static cl::opt VersionUnknown( + "scev-version-unknown", cl::Hidden, + cl::desc("Use predicated scalar evolution to version SCEVUnknowns"), + cl::init(false)); + //===----------------------------------------------------------------------===// // SCEV class definitions //===----------------------------------------------------------------------===// @@ -11467,6 +11472,8 @@ class SCEVPredicateRewriter : public SCEVRewriteVisitor { // couldn't create an AddRec for it, or couldn't add the predicate), we just // return \p Expr. const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) { + if (!VersionUnknown) + return Expr; if (!isa(Expr->getValue())) return Expr; Optional>> diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 95291a1caf9a..945ac4515368 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1046,19 +1046,21 @@ static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) { static FastMathFlags getDecodedFastMathFlags(unsigned Val) { FastMathFlags FMF; - if (0 != (Val & FastMathFlags::AllowReassoc)) + if (0 != (Val & bitc::UnsafeAlgebra)) + FMF.setFast(); + if (0 != (Val & bitc::AllowReassoc)) FMF.setAllowReassoc(); - if (0 != (Val & FastMathFlags::NoNaNs)) + if (0 != (Val & bitc::NoNaNs)) FMF.setNoNaNs(); - if (0 != (Val & FastMathFlags::NoInfs)) + if (0 != (Val & bitc::NoInfs)) FMF.setNoInfs(); - if (0 != (Val & FastMathFlags::NoSignedZeros)) + if (0 != (Val & bitc::NoSignedZeros)) FMF.setNoSignedZeros(); - if (0 != (Val & FastMathFlags::AllowReciprocal)) + if (0 != (Val & bitc::AllowReciprocal)) FMF.setAllowReciprocal(); - if (0 != (Val & FastMathFlags::AllowContract)) + if (0 != (Val & bitc::AllowContract)) FMF.setAllowContract(true); - if (0 != (Val & FastMathFlags::ApproxFunc)) + if (0 != (Val & bitc::ApproxFunc)) FMF.setApproxFunc(); return FMF; } diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index a7201ed97350..7bf37857eb97 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1330,19 +1330,19 @@ static uint64_t getOptimizationFlags(const Value *V) { Flags |= 1 << bitc::PEO_EXACT; } else if (const auto *FPMO = dyn_cast(V)) { if (FPMO->hasAllowReassoc()) - Flags |= FastMathFlags::AllowReassoc; + Flags |= bitc::AllowReassoc; if (FPMO->hasNoNaNs()) - Flags |= FastMathFlags::NoNaNs; + Flags |= bitc::NoNaNs; if (FPMO->hasNoInfs()) - Flags |= FastMathFlags::NoInfs; + Flags |= bitc::NoInfs; if (FPMO->hasNoSignedZeros()) - Flags |= FastMathFlags::NoSignedZeros; + Flags |= bitc::NoSignedZeros; if (FPMO->hasAllowReciprocal()) - Flags |= FastMathFlags::AllowReciprocal; + Flags |= bitc::AllowReciprocal; if (FPMO->hasAllowContract()) - Flags |= FastMathFlags::AllowContract; + Flags |= bitc::AllowContract; if (FPMO->hasApproxFunc()) - Flags |= FastMathFlags::ApproxFunc; + Flags |= bitc::ApproxFunc; } return Flags; @@ -3183,7 +3183,7 @@ void ModuleBitcodeWriter::writeBlockInfo() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); 
// opc - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); // flags + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) != FUNCTION_INST_BINOP_FLAGS_ABBREV) llvm_unreachable("Unexpected abbrev ordering!"); diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index 75e3d35169cf..42d93a007d77 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -514,6 +514,39 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { return false; } + // Detect invalid DBG_VALUE instructions, with a debug-use of a virtual + // register that hasn't been defined yet. If we do not remove those here, then + // the re-insertion of the DBG_VALUE instruction after register allocation + // will be incorrect. + // TODO: If earlier passes are corrected to generate sane debug information + // (and if the machine verifier is improved to catch this), then these checks + // could be removed or replaced by asserts. + bool Discard = false; + if (MI.getOperand(0).isReg() && + TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) { + const unsigned Reg = MI.getOperand(0).getReg(); + if (!LIS->hasInterval(Reg)) { + // The DBG_VALUE is described by a virtual register that does not have a + // live interval. Discard the DBG_VALUE. + Discard = true; + DEBUG(dbgs() << "Discarding debug info (no LIS interval): " + << Idx << " " << MI); + } else { + // The DBG_VALUE is only valid if either Reg is live out from Idx, or Reg + // is defined dead at Idx (where Idx is the slot index for the instruction + // preceeding the DBG_VALUE). + const LiveInterval &LI = LIS->getInterval(Reg); + LiveQueryResult LRQ = LI.Query(Idx); + if (!LRQ.valueOutOrDead()) { + // We have found a DBG_VALUE with the value in a virtual register that + // is not live. Discard the DBG_VALUE. + Discard = true; + DEBUG(dbgs() << "Discarding debug info (reg not live): " + << Idx << " " << MI); + } + } + } + // Get or create the UserValue for (variable,offset) here. 
bool IsIndirect = MI.getOperand(1).isImm(); if (IsIndirect) @@ -522,7 +555,10 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { const DIExpression *Expr = MI.getDebugExpression(); UserValue *UV = getUserValue(Var, Expr, MI.getDebugLoc()); - UV->addDef(Idx, MI.getOperand(0), IsIndirect); + if (!Discard) + UV->addDef(Idx, MI.getOperand(0), IsIndirect); + else + UV->addDef(Idx, MachineOperand::CreateReg(0U, RegState::Debug), false); return true; } diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index c258d1a4e3ad..c56a022c6705 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx512.mask.shuf.i") || // Added in 6.0 Name.startswith("avx512.mask.shuf.f") || // Added in 6.0 - Name.startswith("avx512.kunpck") || //added in 6.0 Name.startswith("avx2.pabs.") || // Added in 6.0 Name.startswith("avx512.mask.pabs.") || // Added in 6.0 Name.startswith("avx512.broadcastm") || // Added in 6.0 @@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.kunpck"))) { - uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2; - uint64_t And = (1ULL << Shift) - 1; - Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And); - Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift); - Rep = Builder.CreateOr(LowBits, HighBits); } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) { Type *I32Ty = Type::getInt32Ty(C); Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0), diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2259136c6ec4..ce3b70bed740 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -83,27 +83,6 @@ namespace { typedef std::vector MCAsmMacroArgument; typedef std::vector MCAsmMacroArguments; -struct MCAsmMacroParameter { - StringRef Name; - MCAsmMacroArgument Value; - bool Required = false; - bool Vararg = false; - - MCAsmMacroParameter() = default; -}; - -typedef std::vector MCAsmMacroParameters; - -struct MCAsmMacro { - StringRef Name; - StringRef Body; - MCAsmMacroParameters Parameters; - -public: - MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P) - : Name(N), Body(B), Parameters(std::move(P)) {} -}; - /// \brief Helper class for storing information about an active macro /// instantiation. struct MacroInstantiation { @@ -164,9 +143,6 @@ class AsmParser : public MCAsmParser { /// addDirectiveHandler. StringMap ExtensionDirectiveMap; - /// \brief Map of currently defined macros. - StringMap MacroMap; - /// \brief Stack of active macro instantiations. std::vector ActiveMacros; @@ -308,17 +284,6 @@ class AsmParser : public MCAsmParser { /// \brief Control a flag in the parser that enables or disables macros. void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;} - /// \brief Lookup a previously defined macro. - /// \param Name Macro name. - /// \returns Pointer to macro. NULL if no such macro was defined. - const MCAsmMacro* lookupMacro(StringRef Name); - - /// \brief Define a new macro with the given name and information. - void defineMacro(StringRef Name, MCAsmMacro Macro); - - /// \brief Undefine a macro. If no such macro was defined, it's a no-op. 
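  // Editorial note (not part of the patch): these helpers now live on
  // MCContext, so the parser and other clients reach them through the
  // context, e.g.:
  //   MCContext &Ctx = getContext();
  //   if (!Ctx.lookupMacro(Name))
  //     Ctx.defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
  //   ...
  //   Ctx.undefineMacro(Name);  // no-op if it was never defined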
- void undefineMacro(StringRef Name); - /// \brief Are we inside a macro instantiation? bool isInsideMacroInstantiation() {return !ActiveMacros.empty();} @@ -1841,7 +1806,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // If macros are enabled, check to see if this is a macro instantiation. if (areMacrosEnabled()) - if (const MCAsmMacro *M = lookupMacro(IDVal)) { + if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) { return handleMacroEntry(M, IDLoc); } @@ -2720,17 +2685,6 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M, return TokError("too many positional arguments"); } -const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) { - StringMap::iterator I = MacroMap.find(Name); - return (I == MacroMap.end()) ? nullptr : &I->getValue(); -} - -void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) { - MacroMap.insert(std::make_pair(Name, std::move(Macro))); -} - -void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); } - bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) { // Arbitrarily limit macro nesting depth (default matches 'as'). We can // eliminate this, although we should protect against infinite loops. @@ -4249,7 +4203,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { eatToEndOfStatement(); } - if (lookupMacro(Name)) { + if (getContext().lookupMacro(Name)) { return Error(DirectiveLoc, "macro '" + Name + "' is already defined"); } @@ -4257,7 +4211,7 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) { const char *BodyEnd = EndToken.getLoc().getPointer(); StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart); checkForBadMacro(DirectiveLoc, Name, Body, Parameters); - defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); + getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters))); return false; } @@ -4416,10 +4370,10 @@ bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) { "unexpected token in '.purgem' directive")) return true; - if (!lookupMacro(Name)) + if (!getContext().lookupMacro(Name)) return Error(DirectiveLoc, "macro '" + Name + "' is not defined"); - undefineMacro(Name); + getContext().undefineMacro(Name); return false; } diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 5723f8fcf5bb..d968688911eb 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -4,7 +4,8 @@ if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ ) endif() if( MSVC OR MINGW ) # libuuid required for FOLDERID_Profile usage in lib/Support/Windows/Path.inc. - set(system_libs ${system_libs} psapi shell32 ole32 uuid) + # advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc. + set(system_libs ${system_libs} psapi shell32 ole32 uuid advapi32) elseif( CMAKE_HOST_UNIX ) if( HAVE_LIBRT ) set(system_libs ${system_libs} rt) diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 2bb9e381073a..7d2ec1be2888 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -133,16 +133,21 @@ AArch64InstructionSelector::AArch64InstructionSelector( // for each class in the bank. static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, - const RegisterBankInfo &RBI) { + const RegisterBankInfo &RBI, + bool GetAllRegSet = false) { if (RB.getID() == AArch64::GPRRegBankID) { if (Ty.getSizeInBits() <= 32) - return &AArch64::GPR32RegClass; + return GetAllRegSet ? 
&AArch64::GPR32allRegClass + : &AArch64::GPR32RegClass; if (Ty.getSizeInBits() == 64) - return &AArch64::GPR64RegClass; + return GetAllRegSet ? &AArch64::GPR64allRegClass + : &AArch64::GPR64RegClass; return nullptr; } if (RB.getID() == AArch64::FPRRegBankID) { + if (Ty.getSizeInBits() <= 16) + return &AArch64::FPR16RegClass; if (Ty.getSizeInBits() == 32) return &AArch64::FPR32RegClass; if (Ty.getSizeInBits() == 64) @@ -310,19 +315,46 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } +static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII, + MachineRegisterInfo &MRI, unsigned SrcReg) { + // Copies from gpr32 to fpr16 need to use a sub-register copy. + unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY)) + .addDef(CopyReg) + .addUse(SrcReg); + unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY)) + .addDef(SubRegCopy) + .addUse(CopyReg, 0, AArch64::hsub); + + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(SubRegCopy); + return true; +} + static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { unsigned DstReg = I.getOperand(0).getReg(); + unsigned SrcReg = I.getOperand(1).getReg(); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { + if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) && + !TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank( + MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true); + if (SrcRC == &AArch64::GPR32allRegClass) + return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); + } assert(I.isCopy() && "Generic operators do not allow physical registers"); return true; } const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - unsigned SrcReg = I.getOperand(1).getReg(); + (void)DstSize; const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && @@ -340,26 +372,38 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, "Copy with different width?!"); assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) && "GPRs cannot get more than 64-bit width values"); - const TargetRegisterClass *RC = nullptr; - - if (RegBank.getID() == AArch64::FPRRegBankID) { - if (DstSize <= 16) - RC = &AArch64::FPR16RegClass; - else if (DstSize <= 32) - RC = &AArch64::FPR32RegClass; - else if (DstSize <= 64) - RC = &AArch64::FPR64RegClass; - else if (DstSize <= 128) - RC = &AArch64::FPR128RegClass; - else { - DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); - return false; + + const TargetRegisterClass *RC = getRegClassForTypeOnBank( + MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true); + if (!RC) { + DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); + return false; + } + + if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg); + const TargetRegisterClass *SrcRC = + RegClassOrBank.dyn_cast(); + const RegisterBank *RB = nullptr; + if (!SrcRC) { + RB = RegClassOrBank.get(); + SrcRC = 
getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true); + } + // Copies from fpr16 to gpr32 need to use SUBREG_TO_REG. + if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) { + unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(AArch64::SUBREG_TO_REG)) + .addDef(PromoteReg) + .addImm(0) + .addUse(SrcReg) + .addImm(AArch64::hsub); + MachineOperand &RegOp = I.getOperand(1); + RegOp.setReg(PromoteReg); + } else if (RC == &AArch64::FPR16RegClass && + SrcRC == &AArch64::GPR32allRegClass) { + selectFP16CopyFromGPR32(I, TII, MRI, SrcReg); } - } else { - assert(RegBank.getID() == AArch64::GPRRegBankID && - "Bitcast for the flags?"); - RC = - DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass; } // No need to constrain SrcReg. It will get constrained when @@ -795,15 +839,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I, } case TargetOpcode::G_EXTRACT: { LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + (void)DstTy; + unsigned SrcSize = SrcTy.getSizeInBits(); // Larger extracts are vectors, same-size extracts should be something else // by now (either split up or simplified to a COPY). if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) return false; - I.setDesc(TII.get(AArch64::UBFMXri)); + I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + Ty.getSizeInBits() - 1); + if (SrcSize < 64) { + assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 && + "unexpected G_EXTRACT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), TII.get(AArch64::COPY)) @@ -818,17 +871,26 @@ bool AArch64InstructionSelector::select(MachineInstr &I, case TargetOpcode::G_INSERT: { LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); + LLT DstTy = MRI.getType(I.getOperand(0).getReg()); + unsigned DstSize = DstTy.getSizeInBits(); + (void)DstSize; // Larger inserts are vectors, same-size ones should be something else by // now (split up or turned into COPYs). if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) return false; - I.setDesc(TII.get(AArch64::BFMXri)); + I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri)); unsigned LSB = I.getOperand(3).getImm(); unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); - I.getOperand(3).setImm((64 - LSB) % 64); + I.getOperand(3).setImm((DstSize - LSB) % DstSize); MachineInstrBuilder(MF, I).addImm(Width - 1); + if (DstSize < 64) { + assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 && + "unexpected G_INSERT types"); + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); + } + unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, I.getIterator(), I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 8156599528c2..61892efe39e0 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -108,3 +108,21 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. 
+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) { + const Value *Ptr = MMO->getValue(); + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa(Ptr) || + isa(Ptr) || isa(Ptr)) + return true; + + if (const Argument *Arg = dyn_cast(Ptr)) + return AMDGPU::isArgPassedInSGPR(Arg); + + const Instruction *I = dyn_cast(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index a9fcd4834638..74e14ef8fbd8 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -50,6 +50,8 @@ class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; + + static bool isUniformMMO(const MachineMemOperand *MMO); }; } // End llvm namespace diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1ed02fae085a..e3df6d9bee88 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -120,7 +120,7 @@ static bool isInstrUniform(const MachineInstr &MI) { return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPU::isUniformMMO(MMO); + return AMDGPUInstrInfo::isUniformMMO(MMO); } const RegisterBankInfo::InstructionMapping & diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index c2d79b9ef5f4..6d89aa6968e9 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1086,7 +1086,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast(N); - return AMDGPU::isUniformMMO(MemNode->getMemOperand()); + return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 2c127d787260..654b96f792b1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3797,7 +3797,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } - BuildMI(*MBB, Inst, Inst.getDebugLoc(), + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) .add(*VAddr) // vaddr .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc @@ -3806,12 +3807,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) .addImm(0) // slc .addImm(0) // tfe - .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()); + .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) + .getInstr(); MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), VDst); addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); Inst.eraseFromParent(); + + // Legalize all operands other than the offset. Notably, convert the srsrc + // into SGPRs using v_readfirstlane if needed. 
+ legalizeOperands(*NewInstr); continue; } } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 125a3b22d0cf..bf9d5bc6ebdc 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -871,24 +871,6 @@ bool isArgPassedInSGPR(const Argument *A) { } } -// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. -bool isUniformMMO(const MachineMemOperand *MMO) { - const Value *Ptr = MMO->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa(Ptr) || - isa(Ptr) || isa(Ptr)) - return true; - - if (const Argument *Arg = dyn_cast(Ptr)) - return isArgPassedInSGPR(Arg); - - const Instruction *I = dyn_cast(Ptr); - return I && I->getMetadata("amdgpu.uniform"); -} - int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (isGCN3Encoding(ST)) return ByteOffset; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a215b445378e..9515001b63d2 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -363,7 +363,6 @@ LLVM_READNONE bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); bool isArgPassedInSGPR(const Argument *Arg); -bool isUniformMMO(const MachineMemOperand *MMO); /// \returns The encoding that will be used for \p ByteOffset in the SMRD /// offset field. diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index fc638829378a..1d10ef9acfba 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -454,13 +454,16 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { return true; } + // FREM is always a call. + if (J->getOpcode() == Instruction::FRem) + return true; + if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: case Instruction::FDiv: - case Instruction::FRem: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index a7059c6914df..4ddc1f0ba429 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -396,10 +396,14 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // rip-relative addressing is actually relative to the *next* instruction. // Since an immediate can follow the mod/rm byte for an instruction, this - // means that we need to bias the immediate field of the instruction with - // the size of the immediate field. If we have this case, add it into the + // means that we need to bias the displacement field of the instruction with + // the size of the immediate field. If we have this case, add it into the // expression to emit. - int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0; + // Note: rip-relative addressing using immediate displacement values should + // not be adjusted, assuming it was the user's intent. + int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags) + ? 
X86II::getSizeOfImm(TSFlags) + : 0; EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, Fixups, -ImmSize); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index ba97982e3330..cc4c8823c3da 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -740,7 +740,13 @@ class SkylakeServerProc : ProcModel; def : SkylakeServerProc<"skx">; // Legacy alias. -def CNLFeatures : ProcessorFeatureshasDQI()) { @@ -684,8 +686,10 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::SHR8ri, X86::KSHIFTRBri); createReplacer(X86::SHL8ri, X86::KSHIFTLBri); - createReplacer(X86::TEST8rr, X86::KTESTBrr); - createReplacer(X86::TEST16rr, X86::KTESTWrr); + // TODO: KTEST is not a replacement for TEST due to flag differences. Need + // to prove only Z flag is used. + //createReplacer(X86::TEST8rr, X86::KTESTBrr); + //createReplacer(X86::TEST16rr, X86::KTESTWrr); createReplacer(X86::XOR8rr, X86::KXORBrr); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38885c42b529..10e19f92b4a6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17017,24 +17017,6 @@ static bool hasNonFlagsUse(SDValue Op) { return false; } -// Emit KTEST instruction for bit vectors on AVX-512 -static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (Op.getOpcode() == ISD::BITCAST) { - auto hasKTEST = [&](MVT VT) { - unsigned SizeInBits = VT.getSizeInBits(); - return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || - (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); - }; - SDValue Op0 = Op.getOperand(0); - MVT Op0VT = Op0.getValueType().getSimpleVT(); - if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && - hasKTEST(Op0VT)) - return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); - } - return SDValue(); -} - /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, @@ -17079,9 +17061,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -17310,10 +17289,6 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, } if (Opcode == 0) { - // Emit KTEST for bit vectors - if (auto Node = EmitKTEST(Op, DAG, Subtarget)) - return Node; - // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); @@ -18093,6 +18068,34 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, return Result; } +// Try to select this as a KTEST+SETCC if possible. +static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Only support equality comparisons. + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + + // Must be a bitcast from vXi1. 
+ if (Op0.getOpcode() != ISD::BITCAST) + return SDValue(); + + Op0 = Op0.getOperand(0); + MVT VT = Op0.getSimpleValueType(); + if (!(Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) && + !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) + return SDValue(); + + X86::CondCode X86CC; + if (isNullConstant(Op1)) { + X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; + } else + return SDValue(); + + SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Op0, Op0); + return getSETCC(X86CC, KTEST, dl, DAG); +} + SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); @@ -18115,6 +18118,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return NewSetCC; } + // Try to lower using KTEST. + if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget)) + return NewSetCC; + // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && @@ -20525,6 +20532,18 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. + SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); @@ -27094,28 +27113,57 @@ static unsigned getOpcodeForRetpoline(unsigned RPOpc) { static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { + if (Subtarget.useRetpolineExternalThunk()) { + // When using an external thunk for retpolines, we pick names that match the + // names GCC happens to use as well. This helps simplify the implementation + // of the thunks for kernels where they have no easy ability to create + // aliases and are doing non-trivial configuration of the thunk's body. For + // example, the Linux kernel will do boot-time hot patching of the thunk + // bodies and cannot easily export aliases of these to loaded modules. + // + // Note that at any point in the future, we may need to change the semantics + // of how we implement retpolines and at that time will likely change the + // name of the called thunk. Essentially, there is no hard guarantee that + // LLVM will generate calls to specific thunks, we merely make a best-effort + // attempt to help out kernels and other systems where duplicating the + // thunks is costly. 
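    // For example, a 64-bit indirect call lowered this way becomes
    // "callq __x86_indirect_thunk_r11", and the environment (e.g. the
    // kernel) is expected to provide that symbol rather than LLVM emitting
    // the thunk body itself.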
+ switch (Reg) { + case X86::EAX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_eax"; + case X86::ECX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_ecx"; + case X86::EDX: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__x86_indirect_thunk_edi"; + case X86::R11: + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__x86_indirect_thunk_r11"; + } + llvm_unreachable("unexpected reg for retpoline"); + } + + // When targeting an internal COMDAT thunk use an LLVM-specific name. switch (Reg) { - case 0: - assert(!Subtarget.is64Bit() && "R11 should always be available on x64"); - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_push" - : "__llvm_retpoline_push"; case X86::EAX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_eax" - : "__llvm_retpoline_eax"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_eax"; case X86::ECX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_ecx" - : "__llvm_retpoline_ecx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_ecx"; case X86::EDX: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_edx" - : "__llvm_retpoline_edx"; + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edx"; + case X86::EDI: + assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); + return "__llvm_retpoline_edi"; case X86::R11: - return Subtarget.useRetpolineExternalThunk() - ? "__llvm_external_retpoline_r11" - : "__llvm_retpoline_r11"; + assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); + return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } @@ -27134,15 +27182,13 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none - // are available, push the callee instead. This is less efficient, but is - // necessary for functions using 3 regparms. Such function calls are - // (currently) not eligible for tail call optimization, because there is no - // scratch register available to hold the address of the callee. + // are available, use EDI instead. EDI is chosen because EBX is the PIC base + // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else - AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX}); + AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { @@ -27160,30 +27206,18 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, break; } } + if (!AvailableReg) + report_fatal_error("calling convention incompatible with retpoline, no " + "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); - if (AvailableReg == 0) { - // No register available. Use PUSH. 
This must not be a tailcall, and this - // must not be x64. - if (Subtarget.is64Bit()) - report_fatal_error( - "Cannot make an indirect call on x86-64 using both retpoline and a " - "calling convention that preservers r11"); - if (Opc != X86::CALLpcrel32) - report_fatal_error("Cannot make an indirect tail call on x86 using " - "retpoline without a preserved register"); - BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - } else { - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) - .addReg(CalleeVReg); - MI.getOperand(0).ChangeToES(Symbol); - MI.setDesc(TII->get(Opc)); - MachineInstrBuilder(*BB->getParent(), &MI) - .addReg(AvailableReg, RegState::Implicit | RegState::Kill); - } + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) + .addReg(CalleeVReg); + MI.getOperand(0).ChangeToES(Symbol); + MI.setDesc(TII->get(Opc)); + MachineInstrBuilder(*BB->getParent(), &MI) + .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } @@ -30432,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); - if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && - N0->getOpcode() == ISD::OR) { - SDValue Op0 = N0->getOperand(0); - SDValue Op1 = N0->getOperand(1); - MVT TrunckVT; - MVT BitcastVT; - switch (VT.getSimpleVT().SimpleTy) { - default: - return SDValue(); - case MVT::v16i1: - TrunckVT = MVT::i8; - BitcastVT = MVT::v8i1; - break; - case MVT::v32i1: - TrunckVT = MVT::i16; - BitcastVT = MVT::v16i1; - break; - case MVT::v64i1: - TrunckVT = MVT::i32; - BitcastVT = MVT::v32i1; - break; - } - bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; - bool isArg0UndefLeft = - Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; - bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; - bool isArg1UndefLeft = - Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; - SDValue OpLeft; - SDValue OpRight; - if (isArg0UndefRight && isArg1UndefLeft) { - OpLeft = Op0; - OpRight = Op1; - } else if (isArg1UndefRight && isArg0UndefLeft) { - OpLeft = Op1; - OpRight = Op0; - } else - return SDValue(); - SDLoc DL(BitCast); - SDValue Shr = OpLeft->getOperand(0); - SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); - SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); - SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); - SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); - } - if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); @@ -35533,7 +35520,7 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. 
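  // Editorial illustration of the identities relied on below:
  //   -(a*b + c) == (-a)*b - c   so FNEG(FMA)   becomes FNMSUB
  //   -(a*b - c) == (-a)*b + c   so FNEG(FMSUB) becomes FNMADD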
unsigned NewOpcode = 0; - if (Arg.hasOneUse()) { + if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 0782d5598746..fae0889950b2 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, ROUNDP, ROUNDS }; @@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp index 223fa5771498..d03826bbe992 100644 --- a/lib/Target/X86/X86RetpolineThunks.cpp +++ b/lib/Target/X86/X86RetpolineThunks.cpp @@ -43,7 +43,7 @@ static const char R11ThunkName[] = "__llvm_retpoline_r11"; static const char EAXThunkName[] = "__llvm_retpoline_eax"; static const char ECXThunkName[] = "__llvm_retpoline_ecx"; static const char EDXThunkName[] = "__llvm_retpoline_edx"; -static const char PushThunkName[] = "__llvm_retpoline_push"; +static const char EDIThunkName[] = "__llvm_retpoline_edi"; namespace { class X86RetpolineThunks : public MachineFunctionPass { @@ -74,7 +74,6 @@ class X86RetpolineThunks : public MachineFunctionPass { void createThunkFunction(Module &M, StringRef Name); void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg); - void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB); void populateThunk(MachineFunction &MF, Optional Reg = None); }; @@ -127,7 +126,7 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { createThunkFunction(M, R11ThunkName); else for (StringRef Name : - {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName}) + {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName}) createThunkFunction(M, Name); InsertedThunks = true; return true; @@ -151,9 +150,8 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { populateThunk(MF, X86::R11); } else { // For 32-bit targets we need to emit a collection of thunks for various - // possible scratch registers as well as a fallback that is used when - // there are no scratch registers and assumes the retpoline target has - // been pushed. + // possible scratch registers as well as a fallback that uses EDI, which is + // normally callee saved. 
// __llvm_retpoline_eax: // calll .Leax_call_target // .Leax_capture_spec: @@ -174,32 +172,18 @@ bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) { // movl %edx, (%esp) // retl // - // This last one is a bit more special and so needs a little extra - // handling. - // __llvm_retpoline_push: - // calll .Lpush_call_target - // .Lpush_capture_spec: - // pause - // lfence - // jmp .Lpush_capture_spec - // .align 16 - // .Lpush_call_target: - // # Clear pause_loop return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee + // __llvm_retpoline_edi: + // ... # Same setup + // movl %edi, (%esp) + // retl if (MF.getName() == EAXThunkName) populateThunk(MF, X86::EAX); else if (MF.getName() == ECXThunkName) populateThunk(MF, X86::ECX); else if (MF.getName() == EDXThunkName) populateThunk(MF, X86::EDX); - else if (MF.getName() == PushThunkName) - populateThunk(MF); + else if (MF.getName() == EDIThunkName) + populateThunk(MF, X86::EDI); else llvm_unreachable("Invalid thunk name on x86-32!"); } @@ -240,31 +224,6 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB, .addReg(Reg); } -void X86RetpolineThunks::insert32BitPushReturnAddrClobber( - MachineBasicBlock &MBB) { - // The instruction sequence we use to replace the return address without - // a scratch register is somewhat complicated: - // # Clear capture_spec from return address. - // addl $4, %esp - // # Top of stack words are: Callee, RA. Exchange Callee and RA. - // pushl 4(%esp) # Push callee - // pushl 4(%esp) # Push RA - // popl 8(%esp) # Pop RA to final RA - // popl (%esp) # Pop callee to next top of stack - // retl # Ret to callee - BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP) - .addReg(X86::ESP) - .addImm(4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP, - false, 4); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 8); - addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP, - false, 0); -} - void X86RetpolineThunks::populateThunk(MachineFunction &MF, Optional Reg) { // Set MF properties. We never use vregs... @@ -301,11 +260,6 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF, CaptureSpec->addSuccessor(CaptureSpec); CallTarget->setAlignment(4); - if (Reg) { - insertRegReturnAddrClobber(*CallTarget, *Reg); - } else { - assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!"); - insert32BitPushReturnAddrClobber(*CallTarget); - } + insertRegReturnAddrClobber(*CallTarget, *Reg); BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index 6f26f7f5cd19..c790de3505f3 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1643,11 +1643,25 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { } } + auto canMergeSelectThroughBinop = [](BinaryOperator *BO) { + // The select might be preventing a division by 0. 
+ switch (BO->getOpcode()) { + default: + return true; + case Instruction::SRem: + case Instruction::URem: + case Instruction::SDiv: + case Instruction::UDiv: + return false; + } + }; + // Try to simplify a binop sandwiched between 2 selects with the same // condition. // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z) BinaryOperator *TrueBO; - if (match(TrueVal, m_OneUse(m_BinOp(TrueBO)))) { + if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) && + canMergeSelectThroughBinop(TrueBO)) { if (auto *TrueBOSI = dyn_cast(TrueBO->getOperand(0))) { if (TrueBOSI->getCondition() == CondVal) { TrueBO->setOperand(0, TrueBOSI->getTrueValue()); @@ -1666,7 +1680,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W)) BinaryOperator *FalseBO; - if (match(FalseVal, m_OneUse(m_BinOp(FalseBO)))) { + if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) && + canMergeSelectThroughBinop(FalseBO)) { if (auto *FalseBOSI = dyn_cast(FalseBO->getOperand(0))) { if (FalseBOSI->getCondition() == CondVal) { FalseBO->setOperand(0, FalseBOSI->getFalseValue()); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 9fc204f418ec..1564537078e4 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -859,10 +859,10 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { BasicBlock *BB = PN->getParent(); if (!BB->canSplitPredecessors()) return false; - // FIXME: it's not impossible to split LandingPad blocks, but if BlockColors - // already exist it require updating BlockColors for all offspring blocks - // accordingly. By skipping such corner case, we can make updating BlockColors - // after splitting predecessor fairly simple. + // It's not impossible to split EHPad blocks, but if BlockColors already exist + // it require updating BlockColors for all offspring blocks accordingly. By + // skipping such corner case, we can make updating BlockColors after splitting + // predecessor fairly simple. if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad()) return false; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -1198,9 +1198,9 @@ bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) { if (isa(Object)) // Since the alloca goes out of scope, we know the caller can't retain a // reference to it and be well defined. Thus, we don't need to check for - // capture. + // capture. return true; - + // For all other objects we need to know that the caller can't possibly // have gotten a reference to the object. There are two components of // that: @@ -1294,7 +1294,7 @@ bool llvm::promoteLoopAccessesToScalars( // That said, we can't actually make the unwind edge explicit. Therefore, // we have to prove that the store is dead along the unwind edge. We do // this by proving that the caller can't have a reference to the object - // after return and thus can't possibly load from the object. + // after return and thus can't possibly load from the object. 
Value *Object = GetUnderlyingObject(SomePtr, MDL); if (!isKnownNonEscaping(Object, TLI)) return false; diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index a5a305ef582b..0a357f4b5004 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -23,6 +23,7 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -30,6 +31,7 @@ #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -77,10 +79,13 @@ bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurrenceKind Kind) { return false; } -Instruction * -RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, - SmallPtrSetImpl &Visited, - SmallPtrSetImpl &CI) { +/// Determines if Phi may have been type-promoted. If Phi has a single user +/// that ANDs the Phi with a type mask, return the user. RT is updated to +/// account for the narrower bit width represented by the mask, and the AND +/// instruction is added to CI. +static Instruction *lookThroughAnd(PHINode *Phi, Type *&RT, + SmallPtrSetImpl &Visited, + SmallPtrSetImpl &CI) { if (!Phi->hasOneUse()) return Phi; @@ -101,70 +106,92 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT, return Phi; } -bool RecurrenceDescriptor::getSourceExtensionKind( - Instruction *Start, Instruction *Exit, Type *RT, bool &IsSigned, - SmallPtrSetImpl &Visited, - SmallPtrSetImpl &CI) { +/// Compute the minimal bit width needed to represent a reduction whose exit +/// instruction is given by Exit. +static std::pair computeRecurrenceType(Instruction *Exit, + DemandedBits *DB, + AssumptionCache *AC, + DominatorTree *DT) { + bool IsSigned = false; + const DataLayout &DL = Exit->getModule()->getDataLayout(); + uint64_t MaxBitWidth = DL.getTypeSizeInBits(Exit->getType()); + + if (DB) { + // Use the demanded bits analysis to determine the bits that are live out + // of the exit instruction, rounding up to the nearest power of two. If the + // use of demanded bits results in a smaller bit width, we know the value + // must be positive (i.e., IsSigned = false), because if this were not the + // case, the sign bit would have been demanded. + auto Mask = DB->getDemandedBits(Exit); + MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros(); + } + + if (MaxBitWidth == DL.getTypeSizeInBits(Exit->getType()) && AC && DT) { + // If demanded bits wasn't able to limit the bit width, we can try to use + // value tracking instead. This can be the case, for example, if the value + // may be negative. + auto NumSignBits = ComputeNumSignBits(Exit, DL, 0, AC, nullptr, DT); + auto NumTypeBits = DL.getTypeSizeInBits(Exit->getType()); + MaxBitWidth = NumTypeBits - NumSignBits; + KnownBits Bits = computeKnownBits(Exit, DL); + if (!Bits.isNonNegative()) { + // If the value is not known to be non-negative, we set IsSigned to true, + // meaning that we will use sext instructions instead of zext + // instructions to restore the original type. + IsSigned = true; + if (!Bits.isNegative()) + // If the value is not known to be negative, we don't known what the + // upper bit is, and therefore, we don't know what kind of extend we + // will need. 
In this case, just increase the bit width by one bit and + // use sext. + ++MaxBitWidth; + } + } + if (!isPowerOf2_64(MaxBitWidth)) + MaxBitWidth = NextPowerOf2(MaxBitWidth); + + return std::make_pair(Type::getIntNTy(Exit->getContext(), MaxBitWidth), + IsSigned); +} + +/// Collect cast instructions that can be ignored in the vectorizer's cost +/// model, given a reduction exit value and the minimal type in which the +/// reduction can be represented. +static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit, + Type *RecurrenceType, + SmallPtrSetImpl &Casts) { SmallVector Worklist; - bool FoundOneOperand = false; - unsigned DstSize = RT->getPrimitiveSizeInBits(); + SmallPtrSet Visited; Worklist.push_back(Exit); - // Traverse the instructions in the reduction expression, beginning with the - // exit value. while (!Worklist.empty()) { - Instruction *I = Worklist.pop_back_val(); - for (Use &U : I->operands()) { - - // Terminate the traversal if the operand is not an instruction, or we - // reach the starting value. - Instruction *J = dyn_cast(U.get()); - if (!J || J == Start) - continue; - - // Otherwise, investigate the operation if it is also in the expression. - if (Visited.count(J)) { - Worklist.push_back(J); + Instruction *Val = Worklist.pop_back_val(); + Visited.insert(Val); + if (auto *Cast = dyn_cast(Val)) + if (Cast->getSrcTy() == RecurrenceType) { + // If the source type of a cast instruction is equal to the recurrence + // type, it will be eliminated, and should be ignored in the vectorizer + // cost model. + Casts.insert(Cast); continue; } - // If the operand is not in Visited, it is not a reduction operation, but - // it does feed into one. Make sure it is either a single-use sign- or - // zero-extend instruction. - CastInst *Cast = dyn_cast(J); - bool IsSExtInst = isa(J); - if (!Cast || !Cast->hasOneUse() || !(isa(J) || IsSExtInst)) - return false; - - // Ensure the source type of the extend is no larger than the reduction - // type. It is not necessary for the types to be identical. - unsigned SrcSize = Cast->getSrcTy()->getPrimitiveSizeInBits(); - if (SrcSize > DstSize) - return false; - - // Furthermore, ensure that all such extends are of the same kind. - if (FoundOneOperand) { - if (IsSigned != IsSExtInst) - return false; - } else { - FoundOneOperand = true; - IsSigned = IsSExtInst; - } - - // Lastly, if the source type of the extend matches the reduction type, - // add the extend to CI so that we can avoid accounting for it in the - // cost model. - if (SrcSize == DstSize) - CI.insert(Cast); - } + // Add all operands to the work list if they are loop-varying values that + // we haven't yet visited. + for (Value *O : cast(Val)->operands()) + if (auto *I = dyn_cast(O)) + if (TheLoop->contains(I) && !Visited.count(I)) + Worklist.push_back(I); } - return true; } bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, Loop *TheLoop, bool HasFunNoNaNAttr, - RecurrenceDescriptor &RedDes) { + RecurrenceDescriptor &RedDes, + DemandedBits *DB, + AssumptionCache *AC, + DominatorTree *DT) { if (Phi->getNumIncomingValues() != 2) return false; @@ -353,14 +380,49 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; - // If we think Phi may have been type-promoted, we also need to ensure that - // all source operands of the reduction are either SExtInsts or ZEstInsts. If - // so, we will be able to evaluate the reduction in the narrower bit width. 
- if (Start != Phi) - if (!getSourceExtensionKind(Start, ExitInstruction, RecurrenceType, - IsSigned, VisitedInsts, CastInsts)) + if (Start != Phi) { + // If the starting value is not the same as the phi node, we speculatively + // looked through an 'and' instruction when evaluating a potential + // arithmetic reduction to determine if it may have been type-promoted. + // + // We now compute the minimal bit width that is required to represent the + // reduction. If this is the same width that was indicated by the 'and', we + // can represent the reduction in the smaller type. The 'and' instruction + // will be eliminated since it will essentially be a cast instruction that + // can be ignore in the cost model. If we compute a different type than we + // did when evaluating the 'and', the 'and' will not be eliminated, and we + // will end up with different kinds of operations in the recurrence + // expression (e.g., RK_IntegerAND, RK_IntegerADD). We give up if this is + // the case. + // + // The vectorizer relies on InstCombine to perform the actual + // type-shrinking. It does this by inserting instructions to truncate the + // exit value of the reduction to the width indicated by RecurrenceType and + // then extend this value back to the original width. If IsSigned is false, + // a 'zext' instruction will be generated; otherwise, a 'sext' will be + // used. + // + // TODO: We should not rely on InstCombine to rewrite the reduction in the + // smaller type. We should just generate a correctly typed expression + // to begin with. + Type *ComputedType; + std::tie(ComputedType, IsSigned) = + computeRecurrenceType(ExitInstruction, DB, AC, DT); + if (ComputedType != RecurrenceType) return false; + // The recurrence expression will be represented in a narrower type. If + // there are any cast instructions that will be unnecessary, collect them + // in CastInsts. Note that the 'and' instruction was already included in + // this list. + // + // TODO: A better way to represent this may be to tag in some way all the + // instructions that are a part of the reduction. The vectorizer cost + // model could then apply the recurrence type to these instructions, + // without needing a white list of instructions to ignore. + collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts); + } + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -480,47 +542,57 @@ bool RecurrenceDescriptor::hasMultipleUsesOf( return false; } bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop, - RecurrenceDescriptor &RedDes) { + RecurrenceDescriptor &RedDes, + DemandedBits *DB, AssumptionCache *AC, + DominatorTree *DT) { BasicBlock *Header = TheLoop->getHeader(); Function &F = *Header->getParent(); bool HasFunNoNaNAttr = F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true"; - if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an ADD reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found a MUL reduction PHI." 
<< *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerOr, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an OR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerAnd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an AND reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_IntegerXor, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, - RedDes)) { + if (AddReductionVar(Phi, RK_IntegerMinMax, TheLoop, HasFunNoNaNAttr, RedDes, + DB, AC, DT)) { DEBUG(dbgs() << "Found a MINMAX reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatMult, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an FMult reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatAdd, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n"); return true; } - if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes)) { + if (AddReductionVar(Phi, RK_FloatMinMax, TheLoop, HasFunNoNaNAttr, RedDes, DB, + AC, DT)) { DEBUG(dbgs() << "Found an float MINMAX reduction PHI." << *Phi << "\n"); return true; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 64f206ea92eb..5bcf0c0a7ba6 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1542,9 +1542,10 @@ class LoopVectorizationLegality { const TargetTransformInfo *TTI, std::function *GetLAA, LoopInfo *LI, OptimizationRemarkEmitter *ORE, LoopVectorizationRequirements *R, - LoopVectorizeHints *H) + LoopVectorizeHints *H, DemandedBits *DB, AssumptionCache *AC) : TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT), GetLAA(GetLAA), - ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H) {} + ORE(ORE), InterleaveInfo(PSE, L, DT, LI), Requirements(R), Hints(H), + DB(DB), AC(AC) {} /// ReductionList contains the reduction descriptors for all /// of the reductions that were found in the loop. @@ -1833,6 +1834,14 @@ class LoopVectorizationLegality { /// Used to emit an analysis of any legality issues. LoopVectorizeHints *Hints; + /// The demanded bits analsyis is used to compute the minimum type size in + /// which a reduction can be computed. + DemandedBits *DB; + + /// The assumption cache analysis is used to compute the minimum type size in + /// which a reduction can be computed. 
+ AssumptionCache *AC; + /// While vectorizing these instructions we have to generate a /// call to the appropriate masked intrinsic SmallPtrSet MaskedOp; @@ -5300,7 +5309,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } RecurrenceDescriptor RedDes; - if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes)) { + if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, + DT)) { if (RedDes.hasUnsafeAlgebra()) Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst()); AllowedExit.insert(RedDes.getLoopExitInstr()); @@ -8514,7 +8524,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements(*ORE); LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); + &Requirements, &Hints, DB, AC); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); emitMissedWarning(F, L, Hints, ORE); diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll index 6c47a853e24a..e9313dfba870 100644 --- a/test/Bitcode/compatibility-3.6.ll +++ b/test/Bitcode/compatibility-3.6.ll @@ -612,9 +612,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll index 55844e5c4986..82fc99055357 100644 --- a/test/Bitcode/compatibility-3.7.ll +++ b/test/Bitcode/compatibility-3.7.ll @@ -656,9 +656,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll index a7fa20f2bc08..2e70a380d10e 100644 --- a/test/Bitcode/compatibility-3.8.ll +++ b/test/Bitcode/compatibility-3.8.ll @@ -687,9 +687,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -702,9 +700,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'aml' bits set, so this is not fully 'fast'. 
- ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll index c456fefe9d40..7c84daa7d3c4 100644 --- a/test/Bitcode/compatibility-3.9.ll +++ b/test/Bitcode/compatibility-3.9.ll @@ -758,9 +758,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -773,9 +771,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll index 68446a7d5b0a..9e34d48c95f7 100644 --- a/test/Bitcode/compatibility-4.0.ll +++ b/test/Bitcode/compatibility-4.0.ll @@ -757,10 +757,8 @@ define void @fastmathflags(float %op1, float %op2) { ; CHECK: %f.nsz = fadd nsz float %op1, %op2 %f.arcp = fadd arcp float %op1, %op2 ; CHECK: %f.arcp = fadd arcp float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. %f.fast = fadd fast float %op1, %op2 - ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -773,9 +771,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'contract' and 'afn' bits set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/Bitcode/compatibility-5.0.ll b/test/Bitcode/compatibility-5.0.ll index cdadc032d87b..a4b3fca82b7b 100644 --- a/test/Bitcode/compatibility-5.0.ll +++ b/test/Bitcode/compatibility-5.0.ll @@ -765,9 +765,7 @@ define void @fastmathflags(float %op1, float %op2) { %f.contract = fadd contract float %op1, %op2 ; CHECK: %f.contract = fadd contract float %op1, %op2 %f.fast = fadd fast float %op1, %op2 - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'. 
- ; CHECK: %f.fast = fadd reassoc nnan ninf nsz arcp contract float %op1, %op2 + ; CHECK: %f.fast = fadd fast float %op1, %op2 ret void } @@ -780,9 +778,7 @@ declare <4 x double> @fmf3() ; CHECK-LABEL: fastMathFlagsForCalls( define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) { %call.fast = call fast float @fmf1() - ; 'fast' used to be its own bit, but this changed in Oct 2017. - ; The binary test file does not have the newer 'afn' bit set, so this is not fully 'fast'. - ; CHECK: %call.fast = call reassoc nnan ninf nsz arcp contract float @fmf1() + ; CHECK: %call.fast = call fast float @fmf1() ; Throw in some other attributes to make sure those stay in the right places. diff --git a/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir b/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir new file mode 100644 index 000000000000..fd1998037d38 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/fp16-copy-gpr.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-unknown-unknown -o - -global-isel -verify-machineinstrs -run-pass=instruction-select %s | FileCheck %s + +# PR36345 +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-arm-none-eabi" + + ; Function Attrs: noinline nounwind optnone + define void @fp16_to_gpr([2 x half], [2 x half]* %addr) { + ret void + } + + define void @gpr_to_fp16() { + ret void + } + + define void @gpr_to_fp16_physreg() { + ret void + } +... +--- +name: fp16_to_gpr +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + - { id: 5, class: gpr } + - { id: 6, class: gpr } + - { id: 7, class: gpr } + - { id: 8, class: gpr } + - { id: 9, class: gpr } + - { id: 10, class: gpr } + - { id: 11, class: gpr } + - { id: 12, class: gpr } +body: | + bb.1 (%ir-block.1): + liveins: %h0, %h1, %x0 + + ; CHECK-LABEL: name: fp16_to_gpr + ; CHECK: liveins: %h0, %h1, %x0 + ; CHECK: [[COPY:%[0-9]+]]:fpr16 = COPY %h0 + ; CHECK: [[COPY1:%[0-9]+]]:fpr16 = COPY %h1 + ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[SUBREG_TO_REG]] + ; CHECK: [[BFMWri:%[0-9]+]]:gpr32 = BFMWri [[DEF]], [[COPY2]], 0, 15 + ; CHECK: [[SUBREG_TO_REG1:%[0-9]+]]:fpr32 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK: [[COPY3:%[0-9]+]]:gpr32 = COPY [[SUBREG_TO_REG1]] + ; CHECK: [[BFMWri1:%[0-9]+]]:gpr32 = BFMWri [[BFMWri]], [[COPY3]], 16, 15 + ; CHECK: [[COPY4:%[0-9]+]]:gpr32 = COPY [[BFMWri1]] + ; CHECK: [[COPY5:%[0-9]+]]:gpr64sp = COPY %x0 + ; CHECK: STRWui [[COPY4]], [[COPY5]], 0 :: (store 4 into %ir.addr, align 2) + ; CHECK: RET_ReallyLR + %1:fpr(s16) = COPY %h0 + %2:fpr(s16) = COPY %h1 + %3:gpr(s32) = G_IMPLICIT_DEF + %11:gpr(s16) = COPY %1(s16) + %4:gpr(s32) = G_INSERT %3, %11(s16), 0 + %12:gpr(s16) = COPY %2(s16) + %5:gpr(s32) = G_INSERT %4, %12(s16), 16 + %0:gpr(s32) = COPY %5(s32) + %6:gpr(p0) = COPY %x0 + G_STORE %0(s32), %6(p0) :: (store 4 into %ir.addr, align 2) + RET_ReallyLR + +... 
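+# The pair of G_INSERTs above packs the two half bit patterns into a single
+# 32-bit value before the store, roughly (lo16/hi16 standing in for the two
+# incoming half values):
+#   packed = (uint32_t)lo16 | ((uint32_t)hi16 << 16);
+# and the two BFMWri (bitfield move) instructions in the checks carry out
+# those two 16-bit inserts.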
+ +--- +name: gpr_to_fp16 +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } +body: | + bb.1 (%ir-block.0): + liveins: %w0 + + ; CHECK-LABEL: name: gpr_to_fp16 + ; CHECK: liveins: %w0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:fpr16 = COPY [[COPY2]].hsub + ; CHECK: [[COPY4:%[0-9]+]]:fpr16 = COPY [[COPY3]] + ; CHECK: %h0 = COPY [[COPY4]] + ; CHECK: RET_ReallyLR implicit %h0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s16) = G_TRUNC %0(s32) + %2:fpr(s16) = COPY %1(s16) + %h0 = COPY %2(s16) + RET_ReallyLR implicit %h0 + +... +--- +name: gpr_to_fp16_physreg +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +body: | + bb.1 (%ir-block.0): + liveins: %w0 + + ; CHECK-LABEL: name: gpr_to_fp16_physreg + ; CHECK: liveins: %w0 + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK: [[COPY3:%[0-9]+]]:fpr16 = COPY [[COPY2]].hsub + ; CHECK: %h0 = COPY [[COPY3]] + ; CHECK: RET_ReallyLR implicit %h0 + %0:gpr(s32) = COPY %w0 + %1:gpr(s16) = G_TRUNC %0(s32) + %h0 = COPY %1(s16) + RET_ReallyLR implicit %h0 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir index 33b483511065..1980048eb456 100644 --- a/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir +++ b/test/CodeGen/AArch64/GlobalISel/select-insert-extract.mir @@ -1,8 +1,8 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s --- -# CHECK-LABEL: name: insert_gprs -name: insert_gprs +name: insert_gprx legalized: true regBankSelected: true @@ -10,26 +10,56 @@ body: | bb.0: liveins: %x0 + ; CHECK-LABEL: name: insert_gprx + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[DEF:%[0-9]+]]:gpr64 = IMPLICIT_DEF + ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 + ; CHECK: [[BFMXri:%[0-9]+]]:gpr64 = BFMXri [[DEF]], [[SUBREG_TO_REG]], 0, 31 + ; CHECK: [[SUBREG_TO_REG1:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.sub_32 + ; CHECK: [[BFMXri1:%[0-9]+]]:gpr64 = BFMXri [[DEF]], [[SUBREG_TO_REG1]], 51, 31 + ; CHECK: %x0 = COPY [[BFMXri]] + ; CHECK: %x1 = COPY [[BFMXri1]] %0:gpr(s32) = COPY %w0 %1:gpr(s64) = G_IMPLICIT_DEF - ; CHECK: body: - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 - ; CHECK: %2:gpr64 = BFMXri %1, [[TMP]], 0, 31 %2:gpr(s64) = G_INSERT %1, %0, 0 - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 - ; CHECK: %3:gpr64 = BFMXri %1, [[TMP]], 51, 31 %3:gpr(s64) = G_INSERT %1, %0, 13 %x0 = COPY %2 %x1 = COPY %3 ... 
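+# For a bitfield insert of 'width' bits at bit 'lsb', the BFM alias uses
+# immr = (64 - lsb) % 64 and imms = width - 1 in the 64-bit form, which is
+# why the G_INSERTs at offsets 0 and 13 above are expected to select to
+# BFMXri with (0, 31) and (51, 31).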
+--- +name: insert_gprw +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: %w0, %w1 + ; CHECK-LABEL: name: insert_gprw + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[COPY]] + ; CHECK: [[DEF:%[0-9]+]]:gpr32 = IMPLICIT_DEF + ; CHECK: [[BFMWri:%[0-9]+]]:gpr32 = BFMWri [[DEF]], [[COPY1]], 0, 15 + ; CHECK: [[BFMWri1:%[0-9]+]]:gpr32 = BFMWri [[BFMWri]], [[COPY2]], 16, 15 + ; CHECK: [[COPY3:%[0-9]+]]:gpr32all = COPY [[BFMWri1]] + ; CHECK: %w0 = COPY [[COPY3]] + %1:gpr(s32) = COPY %w0 + %2:gpr(s32) = COPY %w1 + %3:gpr(s16) = G_TRUNC %1(s32) + %4:gpr(s16) = G_TRUNC %1(s32) + %5:gpr(s32) = G_IMPLICIT_DEF + %6:gpr(s32) = G_INSERT %5, %3(s16), 0 + %7:gpr(s32) = G_INSERT %6, %4(s16), 16 + %0:gpr(s32) = COPY %7(s32) + %w0 = COPY %0 +... --- -# CHECK-LABEL: name: extract_gprs name: extract_gprs legalized: true regBankSelected: true @@ -38,17 +68,49 @@ body: | bb.0: liveins: %x0 + ; CHECK-LABEL: name: extract_gprs + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY %x0 + ; CHECK: [[UBFMXri:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 0, 31 + ; CHECK: [[COPY1:%[0-9]+]]:gpr32 = COPY [[UBFMXri]].sub_32 + ; CHECK: [[UBFMXri1:%[0-9]+]]:gpr64 = UBFMXri [[COPY]], 13, 44 + ; CHECK: [[COPY2:%[0-9]+]]:gpr32 = COPY [[UBFMXri1]].sub_32 + ; CHECK: %w0 = COPY [[COPY1]] + ; CHECK: %w1 = COPY [[COPY2]] %0:gpr(s64) = COPY %x0 - ; CHECK: body: - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = UBFMXri %0, 0, 31 - ; CHECK: %1:gpr32 = COPY [[TMP]].sub_32 %1:gpr(s32) = G_EXTRACT %0, 0 - ; CHECK: [[TMP:%[0-9]+]]:gpr64 = UBFMXri %0, 13, 44 - ; CHECK: %2:gpr32 = COPY [[TMP]].sub_32 %2:gpr(s32) = G_EXTRACT %0, 13 %w0 = COPY %1 %w1 = COPY %2 ... + +--- +name: extract_gprw +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: %w0 + + ; CHECK-LABEL: name: extract_gprw + ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY %w0 + ; CHECK: [[UBFMWri:%[0-9]+]]:gpr32 = UBFMWri [[COPY]], 0, 15 + ; CHECK: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri [[COPY]], 15, 30 + ; CHECK: [[COPY1:%[0-9]+]]:fpr32 = COPY [[UBFMWri]] + ; CHECK: [[COPY2:%[0-9]+]]:fpr16 = COPY [[COPY1]].hsub + ; CHECK: %h0 = COPY [[COPY2]] + ; CHECK: [[COPY3:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]] + ; CHECK: [[COPY4:%[0-9]+]]:fpr16 = COPY [[COPY3]].hsub + ; CHECK: %h1 = COPY [[COPY4]] + %0:gpr(s32) = COPY %w0 + + %1:gpr(s16) = G_EXTRACT %0, 0 + + %2:gpr(s16) = G_EXTRACT %0, 15 + + %h0 = COPY %1 + %h1 = COPY %2 +... 
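The extract tests above expect each G_EXTRACT at a bit offset to be selected to an
unsigned bitfield move (UBFMWri/UBFMXri). As a rough, self-contained C++ sketch of
the arithmetic those instructions perform for the offsets used here (the helper
name and the sample value are illustrative only):

#include <cassert>
#include <cstdint>

// Shift-and-mask equivalent of an unsigned bitfield extract of 'width' bits
// starting at bit 'lsb'.
static uint32_t extractBits(uint32_t x, unsigned lsb, unsigned width) {
  uint32_t mask = (width >= 32) ? ~0u : ((1u << width) - 1u);
  return (x >> lsb) & mask;
}

int main() {
  uint32_t w = 0xCAFEBABEu;
  assert(extractBits(w, 0, 16) == 0xBABEu);  // G_EXTRACT %w, 0  (s16)
  assert(extractBits(w, 15, 16) == 0x95FDu); // G_EXTRACT %w, 15 (s16)
  return 0;
}

The UBFMWri (immr, imms) pairs in the checks, (0, 15) and (15, 30), correspond to
the (lsb, width) combinations (0, 16) and (15, 16) used in the calls above.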
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 420c7b80b8d3..adf22323ae65 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -261,8 +261,42 @@ main_body: ret void } +; GCN-LABEL: {{^}}smrd_sgpr_descriptor_promoted +; GCN: v_readfirstlane +define amdgpu_cs void @smrd_sgpr_descriptor_promoted([0 x i8] addrspace(2)* inreg noalias dereferenceable(18446744073709551615), i32) #0 { +main_body: + %descptr = bitcast [0 x i8] addrspace(2)* %0 to <4 x i32> addrspace(2)*, !amdgpu.uniform !0 + br label %.outer_loop_header + +ret_block: ; preds = %.outer, %.label22, %main_body + ret void + +.outer_loop_header: + br label %.inner_loop_header + +.inner_loop_header: ; preds = %.inner_loop_body, %.outer_loop_header + %loopctr.1 = phi i32 [ 0, %.outer_loop_header ], [ %loopctr.2, %.inner_loop_body ] + %loopctr.2 = add i32 %loopctr.1, 1 + %inner_br1 = icmp slt i32 %loopctr.2, 10 + br i1 %inner_br1, label %.inner_loop_body, label %ret_block + +.inner_loop_body: + %descriptor = load <4 x i32>, <4 x i32> addrspace(2)* %descptr, align 16, !invariant.load !0 + %load1result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 0) + %inner_br2 = icmp uge i32 %1, 10 + br i1 %inner_br2, label %.inner_loop_header, label %.outer_loop_body + +.outer_loop_body: + %offset = shl i32 %loopctr.2, 6 + %load2result = call float @llvm.SI.load.const.v4i32(<4 x i32> %descriptor, i32 %offset) + %outer_br = fcmp ueq float %load2result, 0x0 + br i1 %outer_br, label %.outer_loop_header, label %ret_block +} + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } + +!0 = !{} diff --git a/test/CodeGen/PowerPC/pr36292.ll b/test/CodeGen/PowerPC/pr36292.ll new file mode 100644 index 000000000000..a171918b9e07 --- /dev/null +++ b/test/CodeGen/PowerPC/pr36292.ll @@ -0,0 +1,46 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown < %s | \ +; RUN: FileCheck %s --implicit-check-not=mtctr --implicit-check-not=bdnz +$test = comdat any + +; No CTR loop due to frem (since it is always a call). 
+define void @test() #0 comdat { +; CHECK-LABEL: test: +; CHECK: ld 29, 0(3) +; CHECK: ld 30, 40(1) +; CHECK: xxlxor 31, 31, 31 +; CHECK: cmpld 30, 29 +; CHECK-NEXT: bge- 0, .LBB0_2 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_1: # %bounds.ok +; CHECK: fmr 1, 31 +; CHECK-NEXT: lfsx 2, 0, 3 +; CHECK-NEXT: bl fmodf +; CHECK-NEXT: nop +; CHECK-NEXT: addi 30, 30, 1 +; CHECK-NEXT: stfsx 1, 0, 3 +; CHECK-NEXT: cmpld 30, 29 +; CHECK-NEXT: blt+ 0, .LBB0_1 +; CHECK-NEXT: .LBB0_2: # %bounds.fail +; CHECK-NEXT: std 30, 40(1) + %pos = alloca i64, align 8 + br label %forcond + +forcond: ; preds = %bounds.ok, %0 + %1 = load i64, i64* %pos + %.len1 = load i64, i64* undef + %bounds.cmp = icmp ult i64 %1, %.len1 + br i1 %bounds.cmp, label %bounds.ok, label %bounds.fail + +bounds.ok: ; preds = %forcond + %2 = load float, float* undef + %3 = frem float 0.000000e+00, %2 + store float %3, float* undef + %4 = load i64, i64* %pos + %5 = add i64 %4, 1 + store i64 %5, i64* %pos + br label %forcond + +bounds.fail: ; preds = %forcond + unreachable +} + diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 50de773af001..80127f66bdfe 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -5,59 +5,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c -define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 { -; X32-LABEL: test_mm512_kunpackb: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckbw %k0, %k1, %k1 -; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovw %k0, %eax -; X32-NEXT: movzwl %ax, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackb: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 -; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckbw %k0, %k1, %k1 -; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovw %k0, %eax -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__A to <16 x i32> - %1 = bitcast <8 x i64> %__B to <16 x i32> - %2 = icmp ne <16 x i32> %0, %1 - %3 = bitcast <16 x i1> %2 to i16 - %4 = bitcast <8 x i64> %__C to <16 x i32> - %5 = bitcast <8 x i64> %__D to <16 x i32> - %6 = icmp ne <16 x i32> %4, %5 - %7 = bitcast <16 x i1> %6 to i16 - %8 = and i16 %7, 255 - %shl.i = shl i16 %3, 8 - %or.i = or i16 %8, %shl.i - %9 = bitcast <8 x i64> %__E to <16 x i32> - %10 = bitcast <8 x i64> %__F to <16 x i32> - %11 = icmp ne <16 x i32> %9, %10 - %12 = bitcast i16 %or.i to <16 x i1> - %13 = and <16 x i1> %11, %12 - %14 = bitcast <16 x i1> %13 to i16 - ret i16 %14 -} - define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) { ; X32-LABEL: test_mm512_shuffle_f32x4: ; X32: # %bb.0: # %entry diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f3ca0644e463..378dbda2dc0a 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ 
b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1,20 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone - -define i16 @unpckbw_test(i16 %a0, i16 %a1) { -; CHECK-LABEL: unpckbw_test: -; CHECK: ## %bb.0: -; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: shll $8, %esi -; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax -; CHECK-NEXT: retq - %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) - ret i16 %res -} - define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; CHECK: ## %bb.0: diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 5faa202c30f3..628199d4ac9e 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) { ret i16 %t2 } +declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone + +define i16 @unpckbw_test(i16 %a0, i16 %a1) { +; CHECK-LABEL: unpckbw_test: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %edi, %k0 +; CHECK-NEXT: kmovw %esi, %k1 +; CHECK-NEXT: kunpckbw %k1, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax +; CHECK-NEXT: retq + %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1) + ret i16 %res +} + declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone ; TODO: the two kxnor instructions here a no op and should be elimintaed, ; probably by FoldConstantArithmetic in SelectionDAG. diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 4877157d911d..d112577a6104 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -2775,3 +2775,99 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) { %ret = bitcast <8 x i1> %m2 to i8 ret i8 %ret } + +; Make sure we don't emit a ktest for signed comparisons. 
+define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) { +; KNL-LABEL: ktest_signed: +; KNL: ## %bb.0: +; KNL-NEXT: pushq %rax +; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vporq %zmm1, %zmm0, %zmm0 +; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; KNL-NEXT: kmovw %k0, %eax +; KNL-NEXT: testw %ax, %ax +; KNL-NEXT: jle LBB63_1 +; KNL-NEXT: ## %bb.2: ## %bb.2 +; KNL-NEXT: popq %rax +; KNL-NEXT: vzeroupper +; KNL-NEXT: retq +; KNL-NEXT: LBB63_1: ## %bb.1 +; KNL-NEXT: vzeroupper +; KNL-NEXT: callq _foo +; KNL-NEXT: popq %rax +; KNL-NEXT: retq +; +; SKX-LABEL: ktest_signed: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rax +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: testw %ax, %ax +; SKX-NEXT: jle LBB63_1 +; SKX-NEXT: ## %bb.2: ## %bb.2 +; SKX-NEXT: popq %rax +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; SKX-NEXT: LBB63_1: ## %bb.1 +; SKX-NEXT: vzeroupper +; SKX-NEXT: callq _foo +; SKX-NEXT: popq %rax +; SKX-NEXT: retq +; +; AVX512BW-LABEL: ktest_signed: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: pushq %rax +; AVX512BW-NEXT: .cfi_def_cfa_offset 16 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: testw %ax, %ax +; AVX512BW-NEXT: jle LBB63_1 +; AVX512BW-NEXT: ## %bb.2: ## %bb.2 +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; AVX512BW-NEXT: LBB63_1: ## %bb.1 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: callq _foo +; AVX512BW-NEXT: popq %rax +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: ktest_signed: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; AVX512DQ-NEXT: kmovw %k0, %eax +; AVX512DQ-NEXT: testw %ax, %ax +; AVX512DQ-NEXT: jle LBB63_1 +; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; AVX512DQ-NEXT: LBB63_1: ## %bb.1 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: callq _foo +; AVX512DQ-NEXT: popq %rax +; AVX512DQ-NEXT: retq + %a = icmp eq <16 x i32> %x, zeroinitializer + %b = icmp eq <16 x i32> %y, zeroinitializer + %c = and <16 x i1> %a, %b + %d = bitcast <16 x i1> %c to i16 + %e = icmp sgt i16 %d, 0 + br i1 %e, label %bb.2, label %bb.1 +bb.1: + call void @foo() + br label %bb.2 +bb.2: + ret void +} +declare void @foo() + diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll index 1e754be6fe49..a56111f3453e 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll @@ -4,117 +4,6 @@ ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c -define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackd: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vmovdqa64 72(%ebp), %zmm4 -; X32-NEXT: vmovdqa64 
8(%ebp), %zmm5 -; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0 -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; X32-NEXT: kunpckdq %k0, %k1, %k1 -; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1} -; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackd: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckdq %k0, %k1, %k1 -; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovq %k0, %rax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <64 x i8> - %1 = bitcast <8 x i64> %__A to <64 x i8> - %2 = icmp ne <64 x i8> %0, %1 - %3 = bitcast <64 x i1> %2 to i64 - %4 = bitcast <8 x i64> %__C to <64 x i8> - %5 = bitcast <8 x i64> %__D to <64 x i8> - %6 = icmp ne <64 x i8> %4, %5 - %7 = bitcast <64 x i1> %6 to i64 - %and.i = and i64 %7, 4294967295 - %shl.i = shl i64 %3, 32 - %or.i = or i64 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <64 x i8> - %9 = bitcast <8 x i64> %__F to <64 x i8> - %10 = icmp ne <64 x i8> %8, %9 - %11 = bitcast i64 %or.i to <64 x i1> - %12 = and <64 x i1> %10, %11 - %13 = bitcast <64 x i1> %12 to i64 - ret i64 %13 -} - -define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) { -; X32-LABEL: test_mm512_kunpackw: -; X32: # %bb.0: # %entry -; X32-NEXT: pushl %ebp -; X32-NEXT: .cfi_def_cfa_offset 8 -; X32-NEXT: .cfi_offset %ebp, -8 -; X32-NEXT: movl %esp, %ebp -; X32-NEXT: .cfi_def_cfa_register %ebp -; X32-NEXT: andl $-64, %esp -; X32-NEXT: subl $64, %esp -; X32-NEXT: vmovdqa64 136(%ebp), %zmm3 -; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1 -; X32-NEXT: kunpckwd %k0, %k1, %k1 -; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1} -; X32-NEXT: kmovd %k0, %eax -; X32-NEXT: movl %ebp, %esp -; X32-NEXT: popl %ebp -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: test_mm512_kunpackw: -; X64: # %bb.0: # %entry -; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0 -; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1 -; X64-NEXT: kunpckwd %k0, %k1, %k1 -; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1} -; X64-NEXT: kmovd %k0, %eax -; X64-NEXT: vzeroupper -; X64-NEXT: retq -entry: - %0 = bitcast <8 x i64> %__B to <32 x i16> - %1 = bitcast <8 x i64> %__A to <32 x i16> - %2 = icmp ne <32 x i16> %0, %1 - %3 = bitcast <32 x i1> %2 to i32 - %4 = bitcast <8 x i64> %__C to <32 x i16> - %5 = bitcast <8 x i64> %__D to <32 x i16> - %6 = icmp ne <32 x i16> %4, %5 - %7 = bitcast <32 x i1> %6 to i32 - %and.i = and i32 %7, 65535 - %shl.i = shl i32 %3, 16 - %or.i = or i32 %and.i, %shl.i - %8 = bitcast <8 x i64> %__E to <32 x i16> - %9 = bitcast <8 x i64> %__F to <32 x i16> - %10 = icmp ne <32 x i16> %8, %9 - %11 = bitcast i32 %or.i to <32 x i1> - %12 = and <32 x i1> %10, %11 - %13 = bitcast <32 x i1> %12 to i32 - ret i32 %13 -} - - define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) { ; X32-LABEL: test_mm512_mask_set1_epi8: ; X32: # %bb.0: # %entry @@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 
-; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; 
X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, 
%k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al @@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: movb %ch, %al ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $55, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $9, %k0, %k1 ; X32-NEXT: andb $2, %al ; X32-NEXT: shrb %al ; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $54, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $10, %k0, %k1 ; X32-NEXT: movb %ch, %al ; X32-NEXT: andb $15, %al ; X32-NEXT: movl %eax, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: kshiftlq $63, %k1, %k1 -; X32-NEXT: kshiftrq $53, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k0 -; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kmovd %edx, %k3 ; X32-NEXT: shrb $3, %al -; X32-NEXT: kmovd %eax, %k2 -; X32-NEXT: kxorq %k2, %k1, %k1 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $12, %eax -; X32-NEXT: andl $15, %eax -; X32-NEXT: kmovd %eax, %k2 +; X32-NEXT: kmovd %eax, %k4 ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shrl $13, %eax ; X32-NEXT: andb $1, %al -; X32-NEXT: kmovd %eax, %k3 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $14, %eax -; X32-NEXT: andl $3, %eax -; X32-NEXT: kmovd %eax, %k4 -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: shrl $15, %eax -; X32-NEXT: andl $1, %eax ; X32-NEXT: kmovd %eax, %k5 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrl $16, %edx @@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kmovd %eax, %k7 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $55, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $9, %k0, %k1 +; X32-NEXT: kxorq %k2, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $54, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $10, %k0, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 +; X32-NEXT: kshiftrq $53, %k1, %k1 +; X32-NEXT: kxorq %k0, %k1, %k0 +; X32-NEXT: kshiftrq $11, %k0, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $52, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $12, %k0, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $51, %k1, %k1 ; X32-NEXT: 
kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $13, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k5, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $50, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $14, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $49, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $15, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $48, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $43, %k0, %k1 ; X32-NEXT: kxorq %k4, %k1, %k1 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $12, %esi -; X32-NEXT: andl $15, %esi -; X32-NEXT: kmovd %esi, %k2 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $14, %esi -; X32-NEXT: andl $3, %esi -; X32-NEXT: kmovd %esi, %k3 -; X32-NEXT: movl %eax, %esi -; X32-NEXT: shrl $15, %esi -; X32-NEXT: andl $1, %esi -; X32-NEXT: kmovd %esi, %k4 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $20, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $44, %k0, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $12, %esi +; X32-NEXT: andl $15, %esi +; X32-NEXT: kmovd %esi, %k2 ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $19, %k1, %k1 @@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $18, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $46, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $14, %esi +; X32-NEXT: andl $3, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $17, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $47, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: movl %eax, %esi +; X32-NEXT: shrl $15, %esi +; X32-NEXT: andl $1, %esi +; X32-NEXT: kmovd %esi, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $16, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $12, %k1, %k1 -; X32-NEXT: kxorq %k0, %k1, %k4 -; X32-NEXT: kshiftrq $52, %k4, %k0 +; X32-NEXT: kxorq %k0, %k1, %k3 +; X32-NEXT: kshiftrq $52, %k3, %k0 ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $4, %dl ; X32-NEXT: kmovd %edx, %k1 @@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: andb $15, %cl ; X32-NEXT: movl %ecx, %edx ; X32-NEXT: shrb $2, %dl -; X32-NEXT: kmovd %edx, %k3 +; X32-NEXT: kmovd %edx, %k4 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $11, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k4 -; X32-NEXT: kshiftrq $53, %k4, %k5 +; X32-NEXT: kxorq %k3, %k5, %k3 +; X32-NEXT: kshiftrq $53, %k3, %k5 ; X32-NEXT: kxorq %k6, %k5, %k5 ; X32-NEXT: kshiftlq $63, %k5, %k5 ; X32-NEXT: kshiftrq $10, %k5, %k5 -; X32-NEXT: kxorq %k4, %k5, %k5 -; X32-NEXT: 
kshiftrq $54, %k5, %k4 -; X32-NEXT: kxorq %k7, %k4, %k6 +; X32-NEXT: kxorq %k3, %k5, %k5 +; X32-NEXT: kshiftrq $54, %k5, %k3 +; X32-NEXT: kxorq %k7, %k3, %k6 ; X32-NEXT: shrb $3, %cl -; X32-NEXT: kmovd %ecx, %k4 +; X32-NEXT: kmovd %ecx, %k3 ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: shrl $29, %ecx ; X32-NEXT: andb $1, %cl @@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kxorq %k5, %k0, %k0 ; X32-NEXT: kshiftrq $56, %k0, %k5 ; X32-NEXT: kxorq %k1, %k5, %k1 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $28, %ecx -; X32-NEXT: kmovd %ecx, %k5 -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: shrl $30, %ecx -; X32-NEXT: kmovd %ecx, %k6 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $7, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $6, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $58, %k0, %k1 -; X32-NEXT: kxorq %k3, %k1, %k1 +; X32-NEXT: kxorq %k4, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $5, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $59, %k0, %k1 -; X32-NEXT: kxorq %k4, %k1, %k1 +; X32-NEXT: kxorq %k3, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $4, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $60, %k0, %k1 -; X32-NEXT: kxorq %k5, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $28, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: kshiftlq $63, %k1, %k1 ; X32-NEXT: kshiftrq $3, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 @@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) { ; X32-NEXT: kshiftrq $2, %k1, %k1 ; X32-NEXT: kxorq %k0, %k1, %k0 ; X32-NEXT: kshiftrq $62, %k0, %k1 -; X32-NEXT: kxorq %k6, %k1, %k1 +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: shrl $30, %ecx +; X32-NEXT: kmovd %ecx, %k2 +; X32-NEXT: kxorq %k2, %k1, %k1 ; X32-NEXT: shrl $31, %eax ; X32-NEXT: kmovd %eax, %k2 ; X32-NEXT: movb {{[0-9]+}}(%esp), %al diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index f19e09758f12..13aca464b9e2 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -2,46 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32 -declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) - -define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movzwl %di, %eax -; AVX512BW-NEXT: shll $16, %esi -; AVX512BW-NEXT: orl %esi, %eax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: shll $16, %eax -; AVX512F-32-NEXT: orl %ecx, %eax -; AVX512F-32-NEXT: retl - %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) - ret i32 %res -} - -declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) - -define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { -; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: movl %edi, %eax -; AVX512BW-NEXT: shlq $32, %rsi -; 
AVX512BW-NEXT: orq %rsi, %rax -; AVX512BW-NEXT: retq -; -; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx -; AVX512F-32-NEXT: retl - %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) - ret i64 %res -} - declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 2fa7c2c5b8a8..7b5cc5feff0c 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> ret <8 x i64> %res2 } +declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32) + +define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovd %edi, %k0 +; AVX512BW-NEXT: kmovd %esi, %k1 +; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512BW-NEXT: kmovd %k0, %eax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovd %k0, %eax +; AVX512F-32-NEXT: retl + %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) + ret i32 %res +} + +declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64) + +define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { +; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: kmovq %rdi, %k0 +; AVX512BW-NEXT: kmovq %rsi, %k1 +; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0 +; AVX512BW-NEXT: kmovq %k0, %rax +; AVX512BW-NEXT: retq +; +; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd: +; AVX512F-32: # %bb.0: +; AVX512F-32-NEXT: subl $12, %esp +; AVX512F-32-NEXT: .cfi_def_cfa_offset 16 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0 +; AVX512F-32-NEXT: kmovq %k0, (%esp) +; AVX512F-32-NEXT: movl (%esp), %eax +; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX512F-32-NEXT: addl $12, %esp +; AVX512F-32-NEXT: retl + %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1) + ret i64 %res +} + declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>) define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) { diff --git a/test/CodeGen/X86/clwb.ll b/test/CodeGen/X86/clwb.ll index 0bbb14917f7f..e5906c6ce68c 100644 --- a/test/CodeGen/X86/clwb.ll +++ b/test/CodeGen/X86/clwb.ll @@ -1,5 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: clwb is available in Skylake Server, not available in the newer +; NOTE: Cannon Lake arch, but available again in the newer Ice Lake arch. 
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clwb | FileCheck %s +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx | FileCheck %s +; RUN: not llc < %s -mtriple=i686-apple-darwin -mcpu=cannonlake 2>&1 | FileCheck %s --check-prefix=CNL +; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=icelake | FileCheck %s + +; CNL: LLVM ERROR: Cannot select: intrinsic %llvm.x86.clwb define void @clwb(i8* %p) nounwind { ; CHECK-LABEL: clwb: diff --git a/test/CodeGen/X86/domain-reassignment.mir b/test/CodeGen/X86/domain-reassignment.mir index 3cb4b5dd1396..7da9b083c22e 100644 --- a/test/CodeGen/X86/domain-reassignment.mir +++ b/test/CodeGen/X86/domain-reassignment.mir @@ -1,22 +1,23 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s --- | ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll' source_filename = "../test/CodeGen/X86/gpr-to-mask.ll" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" - + define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 { entry: br i1 %cond, label %if, label %else - + if: ; preds = %entry %cmp1 = fcmp oeq float %f3, %f4 br label %exit - + else: ; preds = %entry %cmp2 = fcmp oeq float %f5, %f6 br label %exit - + exit: ; preds = %else, %if %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ] %selected = select i1 %val, float %f1, float %f2 @@ -48,14 +49,13 @@ ... --- name: test_fcmp_storefloat -# CHECK-LABEL: name: test_fcmp_storefloat alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr8, preferred-register: '' } - { id: 1, class: gr8, preferred-register: '' } - { id: 2, class: gr8, preferred-register: '' } @@ -79,7 +79,7 @@ registers: - { id: 20, class: fr128, preferred-register: '' } - { id: 21, class: fr128, preferred-register: '' } - { id: 22, class: fr32x, preferred-register: '' } -liveins: +liveins: - { reg: '%edi', virtual-reg: '%3' } - { reg: '%rsi', virtual-reg: '%4' } - { reg: '%xmm0', virtual-reg: '%5' } @@ -88,7 +88,7 @@ liveins: - { reg: '%xmm3', virtual-reg: '%8' } - { reg: '%xmm4', virtual-reg: '%9' } - { reg: '%xmm5', virtual-reg: '%10' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -105,14 +105,51 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_fcmp_storefloat + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY %xmm5 + ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY %xmm4 + ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY %xmm3 + ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY %xmm2 + ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY %xmm1 + ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY %xmm0 + ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY %rsi + ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY %edi + ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit + ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def %eflags + ; CHECK: JE_1 %bb.2, implicit %eflags + ; CHECK: JMP_1 %bb.1 + ; CHECK: bb.1.if: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY3]], 
[[COPY2]], 0 + ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr]] + ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]] + ; CHECK: JMP_1 %bb.3 + ; CHECK: bb.2.else: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[VCMPSSZrr1:%[0-9]+]]:vk1 = VCMPSSZrr [[COPY1]], [[COPY]], 0 + ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrr1]] + ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]] + ; CHECK: bb.3.exit: + ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1 + ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]] + ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]] + ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]] + ; CHECK: [[DEF:%[0-9]+]]:fr128 = IMPLICIT_DEF + ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:fr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF]], [[COPY5]] + ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]] + ; CHECK: VMOVSSZmr [[COPY6]], 1, %noreg, 0, %noreg, killed [[COPY16]] :: (store 4 into %ir.fptr) + ; CHECK: RET 0 bb.0.entry: successors: %bb.1(0x40000000), %bb.2(0x40000000) liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - + %10 = COPY %xmm5 %9 = COPY %xmm4 %8 = COPY %xmm3 @@ -125,38 +162,31 @@ body: | TEST8ri killed %11, 1, implicit-def %eflags JE_1 %bb.2, implicit %eflags JMP_1 %bb.1 - + bb.1.if: successors: %bb.3(0x80000000) - + %14 = VCMPSSZrr %7, %8, 0 ; check that cross domain copies are replaced with same domain copies. - ; CHECK: %15:vk32 = COPY %14 - ; CHECK: %0:vk8 = COPY %15 - + %15 = COPY %14 %0 = COPY %15.sub_8bit JMP_1 %bb.3 - + bb.2.else: successors: %bb.3(0x80000000) %12 = VCMPSSZrr %9, %10, 0 ; check that cross domain copies are replaced with same domain copies. - ; CHECK: %13:vk32 = COPY %12 - ; CHECK: %1:vk8 = COPY %13 %13 = COPY %12 %1 = COPY %13.sub_8bit - + bb.3.exit: ; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers. - ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1 - ; CHECK: %16:vk32 = COPY %2 - ; CHECK: %18:vk1wm = COPY %16 - + %2 = PHI %1, %bb.2, %0, %bb.1 %17 = IMPLICIT_DEF %16 = INSERT_SUBREG %17, %2, 1 @@ -171,14 +201,13 @@ body: | ... 
--- name: test_8bitops -# CHECK-LABEL: name: test_8bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -198,13 +227,13 @@ registers: - { id: 16, class: gr8, preferred-register: '' } - { id: 17, class: gr8, preferred-register: '' } - { id: 18, class: gr8, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -221,32 +250,50 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_8bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2 + ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1 + ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]] + ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]] + ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]] + ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]] + ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPDZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPDZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPDZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk8 = COPY %6 %6 = COPY %5 %7 = COPY %6.sub_8bit - ; CHECK: %12:vk8 = KSHIFTRBri %7, 2 - ; CHECK: %13:vk8 = KSHIFTLBri %12, 1 - ; CHECK: %14:vk8 = KNOTBrr %13 - ; CHECK: %15:vk8 = KORBrr %14, %12 - ; CHECK: %16:vk8 = KANDBrr %15, %13 - ; CHECK: %17:vk8 = KXORBrr %16, %12 - ; CHECK: %18:vk8 = KADDBrr %17, %14 %12 = SHR8ri %7, 2, implicit-def dead %eflags %13 = SHL8ri %12, 1, implicit-def dead %eflags %14 = NOT8r %13 @@ -254,19 +301,17 @@ body: | %16 = AND8rr %15, %13, implicit-def dead %eflags %17 = XOR8rr %16, %12, implicit-def dead %eflags %18 = ADD8rr %17, %14, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %18 - ; CHECK: %10:vk8wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %18, 1 %10 = COPY %9 %11 = VMOVAPDZrrk %2, killed %10, %1 - VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTBrr %18, %18, implicit-def %eflags - TEST8rr %18, %18, 
implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST8rr %18, %18, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -276,14 +321,13 @@ body: | ... --- name: test_16bitops -# CHECK-LABEL: name: test_16bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -302,13 +346,13 @@ registers: - { id: 15, class: gr16, preferred-register: '' } - { id: 16, class: gr16, preferred-register: '' } - { id: 17, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } - { reg: '%zmm2', virtual-reg: '%3' } - { reg: '%zmm3', virtual-reg: '%4' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -325,50 +369,66 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_16bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY %zmm2 + ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY %zmm3 + ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0 + ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]] + ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]] + ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2 + ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1 + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]] + ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]] + ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]] + ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]] + ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]] + ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 %3 = COPY %zmm2 %4 = COPY %zmm3 - + %5 = VCMPPSZrri %3, %4, 0 - ; CHECK: %6:vk32 = COPY %5 - ; CHECK: %7:vk16 = COPY %6 %6 = COPY %5 %7 = COPY %6.sub_16bit - ; CHECK: %12:vk16 = KSHIFTRWri %7, 2 - ; CHECK: %13:vk16 = KSHIFTLWri %12, 1 - ; CHECK: %14:vk16 = KNOTWrr %13 - ; CHECK: %15:vk16 = KORWrr %14, %12 - ; CHECK: %16:vk16 = KANDWrr %15, %13 - ; CHECK: %17:vk16 = KXORWrr %16, %12 %12 = SHR16ri %7, 2, implicit-def dead %eflags %13 = SHL16ri %12, 1, implicit-def dead %eflags %14 = NOT16r %13 %15 = OR16rr %14, %12, implicit-def dead %eflags %16 = AND16rr %15, %13, implicit-def dead %eflags %17 = XOR16rr %16, %12, implicit-def dead %eflags - - ; CHECK: %9:vk32 = COPY %17 - ; CHECK: %10:vk16wm = COPY %9 + %8 = IMPLICIT_DEF %9 = INSERT_SUBREG %8, %17, 3 %10 = COPY %9 %11 = VMOVAPSZrrk %2, killed %10, %1 - VMOVAPSZmr 
%0, 1, %noreg, 0, %noreg, killed %11 + VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11 - ; CHECK: KTESTWrr %17, %17, implicit-def %eflags - TEST16rr %17, %17, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST16rr %17, %17, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -378,14 +438,13 @@ body: | ... --- name: test_32bitops -# CHECK-LABEL: name: test_32bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -400,11 +459,11 @@ registers: - { id: 11, class: gr32, preferred-register: '' } - { id: 12, class: gr32, preferred-register: '' } - { id: 13, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -421,26 +480,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_32bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2 + ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1 + ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]] + ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]] + ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]] + ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = KSHIFTRDri %5, 2 - ; CHECK: %7:vk32 = KSHIFTLDri %6, 1 - ; CHECK: %8:vk32 = KNOTDrr %7 - ; CHECK: %9:vk32 = KORDrr %8, %6 - ; CHECK: %10:vk32 = KANDDrr %9, %7 - ; CHECK: %11:vk32 = KXORDrr %10, %6 - ; CHECK: %12:vk32 = KANDNDrr %11, %9 - ; CHECK: %13:vk32 = KADDDrr %12, %11 + %5 = MOV32rm %0, 1, %noreg, 0, %noreg %6 = SHR32ri %5, 2, implicit-def dead %eflags %7 = SHL32ri %6, 1, implicit-def dead %eflags @@ -450,16 +523,15 @@ body: | %11 = XOR32rr %10, %6, implicit-def dead %eflags %12 = ANDN32rr %11, %9, implicit-def dead %eflags %13 = ADD32rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk32wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTDrr %13, %13, implicit-def %eflags - TEST32rr %13, 
%13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST32rr %13, %13, implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -469,14 +541,13 @@ body: | ... --- name: test_64bitops -# CHECK-LABEL: name: test_64bitops alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -491,11 +562,11 @@ registers: - { id: 11, class: gr64, preferred-register: '' } - { id: 12, class: gr64, preferred-register: '' } - { id: 13, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -512,26 +583,40 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | + ; CHECK-LABEL: name: test_64bitops + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2 + ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1 + ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]] + ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]] + ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]] + ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]] + ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: bb.2: + ; CHECK: RET 0 bb.0: liveins: %rdi, %zmm0, %zmm1 - + %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = KSHIFTRQri %5, 2 - ; CHECK: %7:vk64 = KSHIFTLQri %6, 1 - ; CHECK: %8:vk64 = KNOTQrr %7 - ; CHECK: %9:vk64 = KORQrr %8, %6 - ; CHECK: %10:vk64 = KANDQrr %9, %7 - ; CHECK: %11:vk64 = KXORQrr %10, %6 - ; CHECK: %12:vk64 = KANDNQrr %11, %9 - ; CHECK: %13:vk64 = KADDQrr %12, %11 + %5 = MOV64rm %0, 1, %noreg, 0, %noreg %6 = SHR64ri %5, 2, implicit-def dead %eflags %7 = SHL64ri %6, 1, implicit-def dead %eflags @@ -541,16 +626,15 @@ body: | %11 = XOR64rr %10, %6, implicit-def dead %eflags %12 = ANDN64rr %11, %9, implicit-def dead %eflags %13 = ADD64rr %12, %11, implicit-def dead %eflags - - ; CHECK: %3:vk64wm = COPY %13 + %3 = COPY %13 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 - ; CHECK: KTESTQrr %13, %13, implicit-def %eflags - TEST64rr %13, %13, implicit-def %eflags - JE_1 %bb.1, implicit %eflags - JMP_1 %bb.2 + ; FIXME We can't replace TEST with KTEST due to flag differences + ; TEST64rr %13, %13, 
implicit-def %eflags + ; JE_1 %bb.1, implicit %eflags + ; JMP_1 %bb.2 bb.1: @@ -560,14 +644,13 @@ body: | ... --- name: test_16bitext -# CHECK-LABEL: name: test_16bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -575,11 +658,11 @@ registers: - { id: 4, class: vr512, preferred-register: '' } - { id: 5, class: gr16, preferred-register: '' } - { id: 6, class: gr16, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -596,24 +679,32 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_16bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]] + ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]] + ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]] + ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]] + ; CHECK: VMOVAPSZmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVAPSZrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk16 = COPY %7 - ; CHECK: %6:vk16 = KNOTWrr %5 + %5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg %6 = NOT16r %5 - ; CHECK: %3:vk16wm = COPY %6 %3 = COPY %6 %4 = VMOVAPSZrrk %2, killed %3, %1 VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -622,14 +713,13 @@ body: | ... 
--- name: test_32bitext -# CHECK-LABEL: name: test_32bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -638,11 +728,11 @@ registers: - { id: 5, class: gr32, preferred-register: '' } - { id: 6, class: gr32, preferred-register: '' } - { id: 7, class: gr32, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -659,27 +749,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_32bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]] + ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]] + ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU16Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk32 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk32 = COPY %9 - ; CHECK: %7:vk32 = KADDDrr %5, %6 + %5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg %7 = ADD32rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU16Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 @@ -688,14 +786,13 @@ body: | ... 
--- name: test_64bitext -# CHECK-LABEL: name: test_64bitext alignment: 4 exposesReturnsTwice: false legalized: false regBankSelected: false selected: false tracksRegLiveness: true -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: vr512, preferred-register: '' } - { id: 2, class: vr512, preferred-register: '' } @@ -704,11 +801,11 @@ registers: - { id: 5, class: gr64, preferred-register: '' } - { id: 6, class: gr64, preferred-register: '' } - { id: 7, class: gr64, preferred-register: '' } -liveins: +liveins: - { reg: '%rdi', virtual-reg: '%0' } - { reg: '%zmm0', virtual-reg: '%1' } - { reg: '%zmm1', virtual-reg: '%2' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -725,27 +822,35 @@ frameInfo: hasMustTailInVarArgFunc: false savePoint: '' restorePoint: '' -fixedStack: -stack: -constants: +fixedStack: +stack: +constants: body: | bb.0: liveins: %rdi, %zmm0, %zmm1 - + + ; CHECK-LABEL: name: test_64bitext + ; CHECK: liveins: %rdi, %zmm0, %zmm1 + ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi + ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm0 + ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY %zmm1 + ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]] + ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, %noreg, 0, %noreg + ; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]] + ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]] + ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]] + ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]] + ; CHECK: VMOVDQA32Zmr [[COPY]], 1, %noreg, 0, %noreg, killed [[VMOVDQU8Zrrk]] + ; CHECK: RET 0 %0 = COPY %rdi %1 = COPY %zmm0 %2 = COPY %zmm1 - - ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg - ; CHECK: %5:vk64 = COPY %8 - ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg - ; CHECK: %6:vk64 = COPY %9 - ; CHECK: %7:vk64 = KADDQrr %5, %6 + %5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg %6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg %7 = ADD64rr %5, %6, implicit-def dead %eflags - ; CHECK: %3:vk64wm = COPY %7 %3 = COPY %7 %4 = VMOVDQU8Zrrk %2, killed %3, %1 VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4 diff --git a/test/CodeGen/X86/inline-asm-modifier-V.ll b/test/CodeGen/X86/inline-asm-modifier-V.ll new file mode 100644 index 000000000000..5a7f3fdd25fd --- /dev/null +++ b/test/CodeGen/X86/inline-asm-modifier-V.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck -check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as | FileCheck -check-prefix=X64 %s + +; If the target does not have 64-bit integer registers, emit 32-bit register +; names. + +; X86: call __x86_indirect_thunk_e{{[abcd]}}x +; X64: call __x86_indirect_thunk_r + +define void @q_modifier(i32* %p) { +entry: + tail call void asm sideeffect "call __x86_indirect_thunk_${0:V}", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p) + ret void +} diff --git a/test/CodeGen/X86/pr36553.ll b/test/CodeGen/X86/pr36553.ll new file mode 100644 index 000000000000..827f80a3e07e --- /dev/null +++ b/test/CodeGen/X86/pr36553.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s + +; Make sure we don't crash because we negated an fma when we didn't have any fma instructions. 
+ +define float @pr36553(float %a, float %b, float %c) nounwind { +; CHECK-LABEL: pr36553: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq _fmaf +; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq +entry: + %0 = tail call float @llvm.fma.f32(float %a, float %b, float %c) + %sub = fsub float -0.000000e+00, %0 + ret float %sub +} + +declare float @llvm.fma.f32(float, float, float) diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll index 66d32ba5d73d..2f21bb2566de 100644 --- a/test/CodeGen/X86/retpoline-external.ll +++ b/test/CodeGen/X86/retpoline-external.ll @@ -23,18 +23,18 @@ entry: ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64: movl %[[x]], %edi ; X64: callq bar ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_reg: ; X64FAST: callq bar -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: callq bar -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_reg: ; X86-DAG: movl 12(%esp), %[[fp:[^ ]*]] @@ -43,19 +43,19 @@ entry: ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: pushl %[[x]] ; X86: calll bar ; X86: movl %[[fp]], %eax ; X86: pushl %[[x]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86-NOT: # TAILCALL ; X86FAST-LABEL: icall_reg: ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax ; X86FAST: calll bar -; X86FAST: calll __llvm_external_retpoline_eax +; X86FAST: calll __x86_indirect_thunk_eax @global_fp = external global void (i32)* @@ -72,28 +72,28 @@ define void @icall_global_fp(i32 %x, void (i32)** %fpp) #0 { ; X64-LABEL: icall_global_fp: ; X64-DAG: movl %edi, %[[x:[^ ]*]] ; X64-DAG: movq global_fp(%rip), %r11 -; X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movl %[[x]], %edi ; X64-DAG: movq global_fp(%rip), %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: icall_global_fp: ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: callq __llvm_external_retpoline_r11 +; X64FAST: callq __x86_indirect_thunk_r11 ; X64FAST: movq global_fp(%rip), %r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: icall_global_fp: ; X86: movl global_fp, %eax ; X86: pushl 4(%esp) -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl global_fp, %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: icall_global_fp: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL %struct.Foo = type { void (%struct.Foo*)** } @@ -114,14 +114,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X64: movq (%[[obj]]), %[[vptr:[^ ]*]] ; X64: movq 8(%[[vptr]]), %[[fp:[^ ]*]] ; X64: movq %[[fp]], %r11 -; 
X64: callq __llvm_external_retpoline_r11 +; X64: callq __x86_indirect_thunk_r11 ; X64-DAG: movq %[[obj]], %rdi ; X64-DAG: movq %[[fp]], %r11 -; X64: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64: jmp __x86_indirect_thunk_r11 # TAILCALL ; X64FAST-LABEL: vcall: -; X64FAST: callq __llvm_external_retpoline_r11 -; X64FAST: jmp __llvm_external_retpoline_r11 # TAILCALL +; X64FAST: callq __x86_indirect_thunk_r11 +; X64FAST: jmp __x86_indirect_thunk_r11 # TAILCALL ; X86-LABEL: vcall: ; X86: movl 8(%esp), %[[obj:[^ ]*]] @@ -129,14 +129,14 @@ define void @vcall(%struct.Foo* %obj) #0 { ; X86: movl 4(%[[vptr]]), %[[fp:[^ ]*]] ; X86: movl %[[fp]], %eax ; X86: pushl %[[obj]] -; X86: calll __llvm_external_retpoline_eax +; X86: calll __x86_indirect_thunk_eax ; X86: addl $4, %esp ; X86: movl %[[fp]], %eax -; X86: jmp __llvm_external_retpoline_eax # TAILCALL +; X86: jmp __x86_indirect_thunk_eax # TAILCALL ; X86FAST-LABEL: vcall: -; X86FAST: calll __llvm_external_retpoline_eax -; X86FAST: jmp __llvm_external_retpoline_eax # TAILCALL +; X86FAST: calll __x86_indirect_thunk_eax +; X86FAST: jmp __x86_indirect_thunk_eax # TAILCALL declare void @direct_callee() diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll new file mode 100644 index 000000000000..13b32740b287 --- /dev/null +++ b/test/CodeGen/X86/retpoline-regparm.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s + +; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting +; because there are no available scratch registers. The Linux kernel builds +; with -mregparm=3, so we need to support it. TCO should fail because we need +; to restore EDI. + +define void @call_edi(void (i32, i32, i32)* %fp) #0 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: call_edi: +; EDI is used, so it must be saved. 
+; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __llvm_retpoline_edi +; CHECK: popl %edi +; CHECK: retl + +define void @edi_external(void (i32, i32, i32)* %fp) #1 { +entry: + tail call void %fp(i32 inreg 0, i32 inreg 0, i32 inreg 0) + ret void +} + +; CHECK-LABEL: edi_external: +; CHECK: pushl %edi +; CHECK-DAG: xorl %eax, %eax +; CHECK-DAG: xorl %edx, %edx +; CHECK-DAG: xorl %ecx, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: calll __x86_indirect_thunk_edi +; CHECK: popl %edi +; CHECK: retl + +attributes #0 = { "target-features"="+retpoline" } +attributes #1 = { "target-features"="+retpoline-external-thunk" } diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll index 57d3388b812a..477609e2d10b 100644 --- a/test/CodeGen/X86/retpoline.ll +++ b/test/CodeGen/X86/retpoline.ll @@ -340,10 +340,10 @@ latch: ; X86-NEXT: movl %edx, (%esp) ; X86-NEXT: retl ; -; X86-LABEL: .section .text.__llvm_retpoline_push,{{.*}},__llvm_retpoline_push,comdat -; X86-NEXT: .hidden __llvm_retpoline_push -; X86-NEXT: .weak __llvm_retpoline_push -; X86: __llvm_retpoline_push: +; X86-LABEL: .section .text.__llvm_retpoline_edi,{{.*}},__llvm_retpoline_edi,comdat +; X86-NEXT: .hidden __llvm_retpoline_edi +; X86-NEXT: .weak __llvm_retpoline_edi +; X86: __llvm_retpoline_edi: ; X86-NEXT: # {{.*}} # %entry ; X86-NEXT: calll [[CALL_TARGET:.*]] ; X86-NEXT: [[CAPTURE_SPEC:.*]]: # Block address taken @@ -355,11 +355,7 @@ latch: ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: [[CALL_TARGET]]: # Block address taken ; X86-NEXT: # %entry -; X86-NEXT: addl $4, %esp -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: pushl 4(%esp) -; X86-NEXT: popl 8(%esp) -; X86-NEXT: popl (%esp) +; X86-NEXT: movl %edi, (%esp) ; X86-NEXT: retl diff --git a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll index 9954039654bb..e83cf0aa7ffd 100644 --- a/test/DebugInfo/X86/dbg-value-inlined-parameter.ll +++ b/test/DebugInfo/X86/dbg-value-inlined-parameter.ll @@ -32,10 +32,10 @@ ;CHECK-NEXT: DW_AT_call_line ;CHECK: DW_TAG_formal_parameter -;FIXME: Linux shouldn't drop this parameter either... ;CHECK-NOT: DW_TAG -;DARWIN: DW_AT_abstract_origin {{.*}} "sp" -;DARWIN: DW_TAG_formal_parameter +;FIXME: Shouldn't drop this parameter... 
+;XCHECK: DW_AT_abstract_origin {{.*}} "sp" +;XCHECK: DW_TAG_formal_parameter ;CHECK: DW_AT_abstract_origin {{.*}} "nums" ;CHECK-NOT: DW_TAG_formal_parameter diff --git a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir new file mode 100644 index 000000000000..abc21bc6ac90 --- /dev/null +++ b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir @@ -0,0 +1,141 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -start-before greedy -stop-after virtregrewriter -o - %s | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "test/DebugInfo/X86/dbg-value-inlined-parameter.ll" + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-apple-darwin" + + %struct.S1 = type { float*, i32 } + + @p = common global %struct.S1 zeroinitializer, align 8, !dbg !0 + + ; Function Attrs: nounwind optsize ssp + define void @foobar() !dbg !15 { + entry: + tail call void @llvm.dbg.value(metadata %struct.S1* @p, metadata !18, metadata !DIExpression()) , !dbg !25 + ret void, !dbg !32 + } + + ; Function Attrs: nounwind readnone speculatable + declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + + !llvm.dbg.cu = !{!2} + !llvm.module.flags = !{!14} + + !0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) + !1 = !DIGlobalVariable(name: "p", scope: !2, file: !3, line: 14, type: !6, isLocal: false, isDefinition: true) + !2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 2.9 (trunk 125693)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4, globals: !5, imports: !4) + !3 = !DIFile(filename: "nm2.c", directory: "/private/tmp") + !4 = !{} + !5 = !{!0} + !6 = !DIDerivedType(tag: DW_TAG_typedef, name: "S1", scope: !2, file: !3, line: 4, baseType: !7) + !7 = !DICompositeType(tag: DW_TAG_structure_type, name: "S1", scope: !2, file: !3, line: 1, size: 128, align: 64, elements: !8) + !8 = !{!9, !12} + !9 = !DIDerivedType(tag: DW_TAG_member, name: "m", scope: !3, file: !3, line: 2, baseType: !10, size: 64, align: 64) + !10 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !2, baseType: !11, size: 64, align: 64) + !11 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) + !12 = !DIDerivedType(tag: DW_TAG_member, name: "nums", scope: !3, file: !3, line: 3, baseType: !13, size: 32, align: 32, offset: 64) + !13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !14 = !{i32 1, !"Debug Info Version", i32 3} + !15 = distinct !DISubprogram(name: "foobar", scope: !3, file: !3, line: 15, type: !16, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !2) + !16 = !DISubroutineType(types: !17) + !17 = !{null} + !18 = !DILocalVariable(name: "sp", arg: 1, scope: !19, file: !3, line: 7, type: !24) + !19 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 8, type: !20, isLocal: false, isDefinition: true, scopeLine: 8, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !22) + !20 = !DISubroutineType(types: !21) + !21 = !{!13} + !22 = !{!18, !23} + !23 = !DILocalVariable(name: "nums", arg: 2, scope: !19, file: !3, line: 7, type: !13) + !24 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !2, baseType: !6, size: 64, align: 64) + !25 = !DILocation(line: 7, column: 13, scope: !19, inlinedAt: !26) + !26 = !DILocation(line: 16, column: 3, scope: !27) + !27 = distinct !DILexicalBlock(scope: !15, file: !3, line: 15, column: 15) + !32 = 
!DILocation(line: 17, column: 1, scope: !27) + +... +--- +name: foobar +tracksRegLiveness: true +body: | + bb.0: + %1:gr64 = IMPLICIT_DEF + %2:gr64 = IMPLICIT_DEF + + bb.1: + ; This DBG_VALUE will be discarded (use before def of %0). + DBG_VALUE debug-use %0, debug-use $noreg, !18, !DIExpression(), debug-location !25 + %0:gr64 = IMPLICIT_DEF + %0:gr64 = IMPLICIT_DEF + %0:gr64 = IMPLICIT_DEF + %0:gr64 = IMPLICIT_DEF + + bb.2: + ; This DBG_VALUE will be discarded (%1 is defined earlier, but it is not live in, so we do not know where %1 is stored). + DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25 + %1:gr64 = IMPLICIT_DEF + %1:gr64 = IMPLICIT_DEF + %1:gr64 = IMPLICIT_DEF + %1:gr64 = IMPLICIT_DEF + ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev instruction, + ; so the value should be available for as long as the register allocated to %1 is live. + DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25 + + bb.3: + %1:gr64 = IMPLICIT_DEF + DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25 + ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev non-dbg instruction, + ; so the value should be available for as long as the register allocated to %1 is live. + DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25 + + bb.4: + ; All DBG_VALUEs here should survive. %2 is livein as it was defined in bb.0, and it has use/def in the BTS64rr instruction. + DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25 + %2:gr64 = BTS64rr %2, 0, implicit-def $eflags + DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25 + DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25 + %2:gr64 = BTS64rr %2, 0, implicit-def $eflags + DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25 + %2:gr64 = BTS64rr %2, 0, implicit-def $eflags + DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25 + + bb.5: + RET 0, debug-location !32 +... + +# CHECK-LABEL: name: foobar + +# CHECK-LABEL: bb.1: +## After solving https://bugs.llvm.org/show_bug.cgi?id=36579 we expect to get a +## DBG_VALUE debug-use $noreg +## here. +# CHECK-NOT: DBG_VALUE + +# CHECK-LABEL: bb.2: +## After solving https://bugs.llvm.org/show_bug.cgi?id=36579 we expect to get a +## DBG_VALUE debug-use $noreg +## here. 
+# CHECK-NOT: DBG_VALUE
+# CHECK: dead renamable $rcx = IMPLICIT_DEF
+# CHECK-NEXT: dead renamable $rcx = IMPLICIT_DEF
+# CHECK-NEXT: dead renamable $rcx = IMPLICIT_DEF
+# CHECK-NEXT: dead renamable $rcx = IMPLICIT_DEF
+# CHECK-NEXT: DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+
+# CHECK-LABEL: bb.3:
+# CHECK: dead renamable $rcx = IMPLICIT_DEF
+# CHECK-NEXT: DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
+# CHECK-NEXT: DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+
+# CHECK-LABEL: bb.4:
+# CHECK: liveins: $rax
+# CHECK: DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT: renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
+# CHECK-NEXT: DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
+# CHECK-NEXT: DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT: renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
+# CHECK-NEXT: DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT: dead renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
+
+# CHECK-LABEL: bb.5:
+# CHECK-NEXT: RET 0
diff --git a/test/MC/AsmParser/inline_macro_duplication.ll b/test/MC/AsmParser/inline_macro_duplication.ll
new file mode 100644
index 000000000000..9d7e22fde7b6
--- /dev/null
+++ b/test/MC/AsmParser/inline_macro_duplication.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s 2>&1 | FileCheck %s
+
+define void @test() {
+  call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
+  call void asm sideeffect ".macro FOO\0A.endm", "~{dirflag},~{fpsr},~{flags}"() #1
+; CHECK: error: macro 'FOO' is already defined
+  ret void
+}
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 378af768fa99..01cd6b6fa006 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -622,6 +622,11 @@ movl $12, foo(%rip)
 // CHECK: encoding: [0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
 // CHECK: fixup A - offset: 2, value: foo-8, kind: reloc_riprel_4byte
 
+// rdar://37247000
+movl $12, 1024(%rip)
+// CHECK: movl $12, 1024(%rip)
+// CHECK: encoding: [0xc7,0x05,0x00,0x04,0x00,0x00,0x0c,0x00,0x00,0x00]
+
 movq $12, foo(%rip)
 // CHECK: movq $12, foo(%rip)
 // CHECK: encoding: [0x48,0xc7,0x05,A,A,A,A,0x0c,0x00,0x00,0x00]
diff --git a/test/Transforms/InstCombine/pr36362.ll b/test/Transforms/InstCombine/pr36362.ll
new file mode 100644
index 000000000000..412691543a15
--- /dev/null
+++ b/test/Transforms/InstCombine/pr36362.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;RUN: opt -instcombine -S %s | FileCheck %s
+
+; We shouldn't remove the select before the srem
+define i32 @foo(i1 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[A:%.*]], i32 [[B:%.*]], i32 -1
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[C:%.*]], [[SEL1]]
+; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[A]], i32 [[REM]], i32 0
+; CHECK-NEXT: ret i32 [[SEL2]]
+;
+  %sel1 = select i1 %a, i32 %b, i32 -1
+  %rem = srem i32 %c, %sel1
+  %sel2 = select i1 %a, i32 %rem, i32 0
+  ret i32 %sel2
+}
+
diff --git a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
index d9c9632be047..08d163fe6299 100644
--- a/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
+++ b/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -scev-version-unknown < %s 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/LoopVectorize/pr35773.ll b/test/Transforms/LoopVectorize/pr35773.ll
index 362ece70b898..308bb393cc4e 100644
--- a/test/Transforms/LoopVectorize/pr35773.ll
+++ b/test/Transforms/LoopVectorize/pr35773.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -scev-version-unknown < %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 @a = common local_unnamed_addr global i32 0, align 4
 @b = common local_unnamed_addr global i8 0, align 1
diff --git a/test/Transforms/LoopVectorize/reduction-small-size.ll b/test/Transforms/LoopVectorize/reduction-small-size.ll
index b44beb8ce68f..879f1c3c5ad4 100644
--- a/test/Transforms/LoopVectorize/reduction-small-size.ll
+++ b/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -14,7 +14,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
 ; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
 ;
-define void @PR34687(i1 %c, i32 %x, i32 %n) {
+define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
 entry:
   br label %for.body
 
@@ -36,5 +36,38 @@ if.end:
 
 for.end:
   %tmp2 = phi i32 [ %r.next, %if.end ]
-  ret void
+  %tmp3 = trunc i32 %tmp2 to i8
+  ret i8 %tmp3
+}
+
+; CHECK-LABEL: @PR35734(
+; CHECK: vector.ph:
+; CHECK: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 %y, i32 0
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
+; CHECK: [[TMP5:%.*]] = and <4 x i32> [[VEC_PHI]],
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]],
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i1>
+; CHECK-NEXT: [[TMP9]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR35734(i32 %x, i32 %y) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ %x, %entry ], [ %i.next, %for.body ]
+  %r = phi i32 [ %y, %entry ], [ %r.next, %for.body ]
+  %tmp0 = and i32 %r, 1
+  %r.next = add i32 %tmp0, -1
+  %i.next = add nsw i32 %i, 1
+  %cond = icmp sgt i32 %i, 77
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  %tmp1 = phi i32 [ %r.next, %for.body ]
+  ret i32 %tmp1
 }
diff --git a/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll b/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
index 4ddc6a652179..f7877245b0d4 100644
--- a/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
+++ b/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s -check-prefix=VF8
-; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix=VF1
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -scev-version-unknown < %s | FileCheck %s -check-prefix=VF8
+; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -scev-version-unknown < %s | FileCheck %s -check-prefix=VF1
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/tools/llvm-config/system-libs.windows.test b/test/tools/llvm-config/system-libs.windows.test
index 2c6e03afa2d9..09970cf68994 100644
--- a/test/tools/llvm-config/system-libs.windows.test
+++ b/test/tools/llvm-config/system-libs.windows.test
@@ -2,6 +2,6 @@ RUN: llvm-config --link-static --system-libs 2>&1 | FileCheck %s
 REQUIRES: static-libs
 REQUIRES: system-windows
 CHECK-NOT: -l
-CHECK: psapi.lib shell32.lib ole32.lib uuid.lib
+CHECK: psapi.lib shell32.lib ole32.lib uuid.lib advapi32.lib
 CHECK-NOT: error
 CHECK-NOT: warning